upload pdfs

This commit is contained in:
j 2018-11-16 13:30:12 +00:00
parent 03a91e755e
commit e1ffaada2b

View file

@ -92,21 +92,47 @@ def upload_chunks(api, url, filename, data=None):
def upload_document(pdf):
    """Upload the file *pdf* by delegating to ``upload_chunks``.

    ``api`` and ``url`` are not parameters or locals of this function; they
    are looked up in the enclosing (module) scope at call time.  The value
    returned by ``upload_chunks`` is not used, and this function returns
    ``None``.
    """
    upload_chunks(api, url, pdf)
def pdfinfo(pdf):
    """Return the metadata that the ``pdfinfo`` command reports for *pdf*.

    Runs ``pdfinfo <pdf>`` and turns every ``Key:   value`` line of its
    output into one dictionary entry.

    Args:
        pdf: Path of the PDF file passed to the ``pdfinfo`` command.

    Returns:
        A dict mapping field names to their (string) values.  An empty dict
        is returned when the command cannot be started, exits with a
        non-zero status, or produces output that cannot be decoded.
    """
    cmd = ['pdfinfo', pdf]
    try:
        output = subprocess.check_output(cmd).decode()
    except (OSError, ValueError, subprocess.SubprocessError):
        # Best effort: a missing binary (OSError), a failing command
        # (CalledProcessError) or undecodable output (UnicodeDecodeError,
        # a ValueError) all simply mean "no metadata available".
        return {}

    info = {}
    for line in output.strip().split('\n'):
        line = line.strip()
        # Skip blank lines and keys that carry no value ("Key:").
        if not line or line.endswith(':'):
            continue
        parts = re.split(r':\s+', line, maxsplit=1)
        # Lines without a "key: value" separator split into a single item;
        # feeding those to dict() raised ValueError, so ignore them instead.
        if len(parts) == 2:
            info[parts[0]] = parts[1]
    return info
def import_url(url):
meta = {
'description': 'PDF snapshot of %s from %s' % (url, datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
'title': 'Untitled'
}
data = ox.cache.read_url(url, unicode=True)
title = re.compile('<title>(.*?)</title>').findall(data)
if title:
meta['title'] = title[0]
author = re.compile('<meta name="author" content="(.*?)"').findall(data)
if author:
meta['author'] = author
upload = False
fd, pdf = tempfile.mkstemp('.pdf')
if url2pdf(url, pdf):
if url.endswith('.pdf'):
data = ox.cache.read_url(url)
with open(pdf, 'wb') as fd:
fd.write(data)
upload = True
info = pdfinfo(pdf)
for key in ('Title', 'Author'):
if key in info:
meta[key.lower()] = info[key]
else:
data = ox.cache.read_url(url, unicode=True)
title = re.compile('<title>(.*?)</title>').findall(data)
if title:
meta['title'] = title[0]
author = re.compile('<meta name="author" content="(.*?)"').findall(data)
if author:
meta['author'] = author
if url2pdf(url, pdf):
upload = True
if upload:
url = api.url + 'upload/document/'
did = upload_chunks(api, url, pdf, {
'filename': meta['title'] + '.pdf'