upload pdfs
This commit is contained in:
parent
03a91e755e
commit
e1ffaada2b
1 changed files with 34 additions and 8 deletions
|
@@ -92,21 +92,47 @@ def upload_chunks(api, url, filename, data=None):
|
|||
def upload_document(pdf):
    """Upload the file at ``pdf`` through ``upload_chunks``.

    The ``api`` and ``url`` arguments passed to ``upload_chunks`` are not
    parameters of this function; they are looked up in the surrounding
    scope. The value returned by ``upload_chunks`` is discarded, so this
    function always returns ``None``.
    """
    upload_chunks(api, url, pdf)
|
||||
|
||||
def pdfinfo(pdf):
    """Return the metadata that the ``pdfinfo`` command reports for a PDF.

    Runs ``pdfinfo <pdf>`` and turns every ``Key:   value`` line of its
    output into one dictionary entry. Lines that are blank, that end with a
    colon (a key without a value), or that contain no ``": "`` separator
    are ignored.

    Args:
        pdf: Path of the PDF file to inspect.

    Returns:
        A dict mapping field names to their string values. An empty dict is
        returned when the command cannot be started, exits with a non-zero
        status, or is given an argument it cannot be invoked with.
    """
    cmd = ['pdfinfo', pdf]
    try:
        output = subprocess.check_output(cmd)
    except (OSError, ValueError, subprocess.SubprocessError):
        # Best effort: a missing binary or an unreadable file simply means
        # there is no metadata to report.
        return {}

    # Replace undecodable bytes instead of failing, so one odd byte in a
    # field does not discard every other field.
    text = output.decode('utf-8', errors='replace')

    info = {}
    for raw_line in text.strip().split('\n'):
        line = raw_line.strip()
        if not line or line.endswith(':'):
            continue
        # Split on the first "key: value" separator only; values may
        # themselves contain colons.
        parts = re.split(r':\s+', line, maxsplit=1)
        if len(parts) == 2:
            key, value = parts
            info[key] = value
    return info
|
||||
|
||||
def import_url(url):
|
||||
meta = {
|
||||
'description': 'PDF snapshot of %s from %s' % (url, datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
|
||||
'title': 'Untitled'
|
||||
}
|
||||
data = ox.cache.read_url(url, unicode=True)
|
||||
title = re.compile('<title>(.*?)</title>').findall(data)
|
||||
if title:
|
||||
meta['title'] = title[0]
|
||||
author = re.compile('<meta name="author" content="(.*?)"').findall(data)
|
||||
if author:
|
||||
meta['author'] = author
|
||||
upload = False
|
||||
fd, pdf = tempfile.mkstemp('.pdf')
|
||||
if url2pdf(url, pdf):
|
||||
if url.endswith('.pdf'):
|
||||
data = ox.cache.read_url(url)
|
||||
with open(pdf, 'wb') as fd:
|
||||
fd.write(data)
|
||||
upload = True
|
||||
info = pdfinfo(pdf)
|
||||
for key in ('Title', 'Author'):
|
||||
if key in info:
|
||||
meta[key.lower()] = info[key]
|
||||
else:
|
||||
data = ox.cache.read_url(url, unicode=True)
|
||||
title = re.compile('<title>(.*?)</title>').findall(data)
|
||||
if title:
|
||||
meta['title'] = title[0]
|
||||
author = re.compile('<meta name="author" content="(.*?)"').findall(data)
|
||||
if author:
|
||||
meta['author'] = author
|
||||
if url2pdf(url, pdf):
|
||||
upload = True
|
||||
|
||||
if upload:
|
||||
url = api.url + 'upload/document/'
|
||||
did = upload_chunks(api, url, pdf, {
|
||||
'filename': meta['title'] + '.pdf'
|
||||
|
|
Loading…
Reference in a new issue