upload pdfs

2018-11-16 13:30:12 +00:00 · 2018-11-16 13:30:12 +00:00 · e1ffaada2b
commit e1ffaada2b
parent 03a91e755e
1 changed files with 34 additions and 8 deletions
--- a/add_website_as_pdf.py
+++ b/add_website_as_pdf.py
@@ -92,21 +92,47 @@ def upload_chunks(api, url, filename, data=None):
 def upload_document(pdf):
    """Upload *pdf* by delegating to ``upload_chunks``.

    *pdf* is passed as the third positional argument of ``upload_chunks``.
    ``api`` and ``url`` are not parameters of this function, so they must be
    resolved from an enclosing scope (presumably module-level globals --
    TODO confirm they are defined before this is called).

    NOTE(review): the value returned by ``upload_chunks`` is bound to the
    local ``document_id`` and, as far as this view shows, never returned.
    """
    document_id = upload_chunks(api, url, pdf)
 def pdfinfo(pdf):
    """Return the metadata that the ``pdfinfo`` command reports for *pdf*.

    The command output is expected to consist of ``Key:   value`` lines.
    Each such line becomes one entry of the returned dict; when a key is
    repeated, the last value wins.

    Lookup is best-effort: if ``pdfinfo`` cannot be started or exits with a
    non-zero status, an empty dict is returned instead of raising. Lines
    that are blank, end in a bare ``:`` (no value), or contain no
    ``": "`` separator are skipped.
    """
    cmd = ['pdfinfo', pdf]
    try:
        output = subprocess.check_output(cmd)
    except (OSError, subprocess.SubprocessError):
        # Tool not installed, file unreadable or not a PDF: metadata is
        # optional, so report "nothing known" rather than failing.
        return {}
    info = {}
    # errors='replace' keeps the remaining fields usable even when one
    # metadata value is not valid UTF-8.
    for line in output.decode(errors='replace').strip().split('\n'):
        line = line.strip()
        if not line or line.endswith(':'):
            continue
        # Split on the first separator only; values may contain colons
        # themselves (e.g. timestamps).
        parts = re.split(r':\s+', line, maxsplit=1)
        if len(parts) == 2:
            key, value = parts
            info[key] = value
    return info
 def import_url(url):
    meta = {
        'description': 'PDF snapshot of %s from %s' % (url, datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
        'title': 'Untitled'
    }
-    data = ox.cache.read_url(url, unicode=True)
+    upload = False
    title = re.compile('<title>(.*?)</title>').findall(data)
    if title:
        meta['title'] = title[0]
    author = re.compile('<meta name="author" content="(.*?)"').findall(data)
    if author:
        meta['author'] = author
    fd, pdf = tempfile.mkstemp('.pdf')
-    if url2pdf(url, pdf):
+    if url.endswith('.pdf'):
        data = ox.cache.read_url(url)
        with open(pdf, 'wb') as fd:
            fd.write(data)
        upload = True
        info = pdfinfo(pdf)
        for key in ('Title', 'Author'):
            if key in info:
                meta[key.lower()] = info[key]
    else:
        data = ox.cache.read_url(url, unicode=True)
        title = re.compile('<title>(.*?)</title>').findall(data)
        if title:
            meta['title'] = title[0]
        author = re.compile('<meta name="author" content="(.*?)"').findall(data)
        if author:
            meta['author'] = author
        if url2pdf(url, pdf):
            upload = True
    if upload:
        url = api.url + 'upload/document/'
        did = upload_chunks(api, url, pdf, {
            'filename': meta['title'] + '.pdf'