upload pdfs
This commit is contained in:
parent
03a91e755e
commit
e1ffaada2b
1 changed files with 34 additions and 8 deletions
|
@ -92,21 +92,47 @@ def upload_chunks(api, url, filename, data=None):
|
||||||
def upload_document(pdf):
    """Upload the PDF file at *pdf* and return the resulting document id.

    The upload itself is delegated to ``upload_chunks``; whatever that
    helper returns is handed back to the caller (previously the value was
    computed and then silently discarded, so callers always got ``None``).
    """
    # NOTE(review): `api` and `url` are not parameters or locals here, so
    # they are presumably module-level globals -- confirm they are defined
    # before this function is called.
    document_id = upload_chunks(api, url, pdf)
    return document_id
|
||||||
|
|
||||||
|
def pdfinfo(pdf):
    """Return the metadata printed by the ``pdfinfo`` command for *pdf*.

    The command's output is parsed as ``Key:   value`` lines and returned
    as a ``{key: value}`` dict of strings. Lines that are empty, that end
    in a bare ``:`` (a key without a value), or that contain no
    ``": "`` separator at all are ignored.

    This is best-effort: if the command cannot be run, exits with a
    non-zero status, or produces output that cannot be decoded, an empty
    dict is returned instead of raising.
    """
    cmd = ['pdfinfo', pdf]
    try:
        output = subprocess.check_output(cmd).decode()
    except (OSError, subprocess.SubprocessError, ValueError):
        # OSError: binary missing / not executable.
        # SubprocessError: non-zero exit status (CalledProcessError).
        # ValueError: undecodable output (UnicodeDecodeError) or an
        # invalid path argument such as one with an embedded NUL byte.
        return {}

    info = {}
    for raw_line in output.strip().split('\n'):
        line = raw_line.strip()
        if not line or line.endswith(':'):
            continue
        # Split only on the first separator so values containing ': '
        # (e.g. timestamps or titles) stay intact.
        parts = re.split(r':\s+', line, maxsplit=1)
        if len(parts) != 2:
            # No "key: value" separator on this line; skipping it avoids
            # the ValueError that dict() would raise on a 1-element pair.
            continue
        key, value = parts
        info[key] = value
    return info
|
||||||
|
|
||||||
def import_url(url):
|
def import_url(url):
|
||||||
meta = {
|
meta = {
|
||||||
'description': 'PDF snapshot of %s from %s' % (url, datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
|
'description': 'PDF snapshot of %s from %s' % (url, datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
|
||||||
'title': 'Untitled'
|
'title': 'Untitled'
|
||||||
}
|
}
|
||||||
data = ox.cache.read_url(url, unicode=True)
|
upload = False
|
||||||
title = re.compile('<title>(.*?)</title>').findall(data)
|
|
||||||
if title:
|
|
||||||
meta['title'] = title[0]
|
|
||||||
author = re.compile('<meta name="author" content="(.*?)"').findall(data)
|
|
||||||
if author:
|
|
||||||
meta['author'] = author
|
|
||||||
fd, pdf = tempfile.mkstemp('.pdf')
|
fd, pdf = tempfile.mkstemp('.pdf')
|
||||||
if url2pdf(url, pdf):
|
if url.endswith('.pdf'):
|
||||||
|
data = ox.cache.read_url(url)
|
||||||
|
with open(pdf, 'wb') as fd:
|
||||||
|
fd.write(data)
|
||||||
|
upload = True
|
||||||
|
info = pdfinfo(pdf)
|
||||||
|
for key in ('Title', 'Author'):
|
||||||
|
if key in info:
|
||||||
|
meta[key.lower()] = info[key]
|
||||||
|
else:
|
||||||
|
data = ox.cache.read_url(url, unicode=True)
|
||||||
|
title = re.compile('<title>(.*?)</title>').findall(data)
|
||||||
|
if title:
|
||||||
|
meta['title'] = title[0]
|
||||||
|
author = re.compile('<meta name="author" content="(.*?)"').findall(data)
|
||||||
|
if author:
|
||||||
|
meta['author'] = author
|
||||||
|
if url2pdf(url, pdf):
|
||||||
|
upload = True
|
||||||
|
|
||||||
|
if upload:
|
||||||
url = api.url + 'upload/document/'
|
url = api.url + 'upload/document/'
|
||||||
did = upload_chunks(api, url, pdf, {
|
did = upload_chunks(api, url, pdf, {
|
||||||
'filename': meta['title'] + '.pdf'
|
'filename': meta['title'] + '.pdf'
|
||||||
|
|
Loading…
Reference in a new issue