From e1ffaada2b1593e6e101bce6b154dc298cdd44b6 Mon Sep 17 00:00:00 2001 From: j Date: Fri, 16 Nov 2018 13:30:12 +0000 Subject: [PATCH] upload pdfs --- add_website_as_pdf.py | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/add_website_as_pdf.py b/add_website_as_pdf.py index 1d038ba..a9e56c0 100644 --- a/add_website_as_pdf.py +++ b/add_website_as_pdf.py @@ -92,21 +92,47 @@ def upload_chunks(api, url, filename, data=None): def upload_document(pdf): document_id = upload_chunks(api, url, pdf) +def pdfinfo(pdf): + cmd = ['pdfinfo', pdf] + try: + r = subprocess.check_output(cmd).decode().strip().split('\n') + except: + return {} + info = [ + re.split(':\s+', l.strip(), 1) + for l in r + if l.strip() and not l.strip().endswith(':') + ] + return dict(info) def import_url(url): meta = { 'description': 'PDF snapshot of %s from %s' % (url, datetime.now().strftime('%Y-%m-%d %H:%M:%S')), 'title': 'Untitled' } - data = ox.cache.read_url(url, unicode=True) - title = re.compile('(.*?)').findall(data) - if title: - meta['title'] = title[0] - author = re.compile('(.*?)').findall(data) + if title: + meta['title'] = title[0] + author = re.compile('