From e1ffaada2b1593e6e101bce6b154dc298cdd44b6 Mon Sep 17 00:00:00 2001
From: j <j@mailb.org>
Date: Fri, 16 Nov 2018 13:30:12 +0000
Subject: [PATCH] upload pdfs

---
 add_website_as_pdf.py | 42 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 34 insertions(+), 8 deletions(-)
diff --git a/add_website_as_pdf.py b/add_website_as_pdf.py
index 1d038ba..a9e56c0 100644
--- a/add_website_as_pdf.py
+++ b/add_website_as_pdf.py
@@ -92,21 +92,47 @@ def upload_chunks(api, url, filename, data=None):
 def upload_document(pdf):
     document_id = upload_chunks(api, url, pdf)
 
+def pdfinfo(pdf):
+    """Return the `Key: value` fields printed by pdfinfo as a dict ({} on failure)."""
+    try:
+        r = subprocess.check_output(['pdfinfo', pdf]).decode()
+    except (OSError, ValueError, subprocess.CalledProcessError):
+        return {}
+    # A line lacking "key:<space>value" would split to one item and crash dict().
+    info = [
+        re.split(r':\s+', l.strip(), 1)
+        for l in r.split('\n') if re.search(r':\s+', l.strip())
+    ]
+    return dict(info)
 
 def import_url(url):
     meta = {
         'description': 'PDF snapshot of %s from %s' % (url, datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
         'title': 'Untitled'
     }
-    data = ox.cache.read_url(url, unicode=True)
-    title = re.compile('<title>(.*?)</title>').findall(data)
-    if title:
-        meta['title'] = title[0]
-    author = re.compile('<meta name="author" content="(.*?)"').findall(data)
-    if author:
-        meta['author'] = author
+    upload = False
     fd, pdf = tempfile.mkstemp('.pdf')
-    if url2pdf(url, pdf):
+    if url.endswith('.pdf'):
+        data = ox.cache.read_url(url)
+        with open(pdf, 'wb') as fd:
+            fd.write(data)
+        upload = True
+        info = pdfinfo(pdf)
+        for key in ('Title', 'Author'):
+            if key in info:
+                meta[key.lower()] = info[key]
+    else:
+        data = ox.cache.read_url(url, unicode=True)
+        title = re.compile('<title>(.*?)</title>').findall(data)
+        if title:
+            meta['title'] = title[0]
+        author = re.compile('<meta name="author" content="(.*?)"').findall(data)
+        if author:
+            meta['author'] = author
+        if url2pdf(url, pdf):
+            upload = True
+
+    if upload:
         url = api.url + 'upload/document/'
         did = upload_chunks(api, url, pdf, {
             'filename': meta['title'] + '.pdf'