upload pdfs

2018-11-16 13:30:12 +00:00 · 2018-11-16 13:30:12 +00:00 · e1ffaada2b
commit e1ffaada2b
parent 03a91e755e
1 changed files with 34 additions and 8 deletions
--- a/add_website_as_pdf.py
+++ b/add_website_as_pdf.py
@@ -92,21 +92,47 @@ def upload_chunks(api, url, filename, data=None):
 def upload_document(pdf):
    document_id = upload_chunks(api, url, pdf)

+def pdfinfo(pdf):
+    cmd = ['pdfinfo', pdf]
+    try:
+        r = subprocess.check_output(cmd).decode().strip().split('\n')
+    except:
+        return {}
+    info = [
+        re.split(':\s+', l.strip(), 1)
+        for l in r
+        if l.strip() and not l.strip().endswith(':')
+    ]
+    return dict(info)

 def import_url(url):
    meta = {
        'description': 'PDF snapshot of %s from %s' % (url, datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
        'title': 'Untitled'
    }
-    data = ox.cache.read_url(url, unicode=True)
-    title = re.compile('<title>(.*?)</title>').findall(data)
-    if title:
-        meta['title'] = title[0]
-    author = re.compile('<meta name="author" content="(.*?)"').findall(data)
-    if author:
-        meta['author'] = author
+    upload = False
    fd, pdf = tempfile.mkstemp('.pdf')
-    if url2pdf(url, pdf):
+    if url.endswith('.pdf'):
+        data = ox.cache.read_url(url)
+        with open(pdf, 'wb') as fd:
+            fd.write(data)
+        upload = True
+        info = pdfinfo(pdf)
+        for key in ('Title', 'Author'):
+            if key in info:
+                meta[key.lower()] = info[key]
+    else:
+        data = ox.cache.read_url(url, unicode=True)
+        title = re.compile('<title>(.*?)</title>').findall(data)
+        if title:
+            meta['title'] = title[0]
+        author = re.compile('<meta name="author" content="(.*?)"').findall(data)
+        if author:
+            meta['author'] = author
+        if url2pdf(url, pdf):
+            upload = True
+
+    if upload:
        url = api.url + 'upload/document/'
        did = upload_chunks(api, url, pdf, {
            'filename': meta['title'] + '.pdf'