148 lines
4.5 KiB
Python
148 lines
4.5 KiB
Python
#!/usr/bin/python3
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
import time
|
|
from urllib.parse import urlparse
|
|
from datetime import datetime
|
|
|
|
import ox
|
|
import ox.api
|
|
import ox.cache
|
|
|
|
# Number of bytes read from the file and sent per upload request (5 MiB).
CHUNK_SIZE = 5 * 1024 * 1024
|
|
|
|
# Module-level API client shared by the upload functions in this file.
# It is created when the module is loaded, i.e. also on plain import.
# NOTE(review): presumably ox.api.signin reads stored credentials and contacts
# the server - confirm against the ox library before importing this module
# from other code.
api = ox.api.signin('https://amp.0x2620.org/api/')
|
|
|
|
def url2pdf(url, pdf):
    """Print the page at *url* to the file *pdf* with headless Chromium.

    Runs ``chromium-browser`` and waits for it to finish.

    Args:
        url: address of the page to render.
        pdf: path of the PDF file to write.

    Returns:
        True if the browser process exited with status 0, False otherwise.
    """
    command = [
        'chromium-browser',
        '--headless',
        '--disable-gpu',
        '--print-to-pdf=' + pdf,
        url,
        '--timeout=10000',
    ]
    exit_status = subprocess.call(command)
    return exit_status == 0
|
|
|
|
def upload_chunks(api, url, filename, data=None):
    """Upload the file *filename* to *url* in pieces of CHUNK_SIZE bytes.

    An initial request carrying the fields of *data* is sent to *url*. The
    response is expected to contain an ``uploadUrl`` to which the chunks are
    then posted; a host-relative ``uploadUrl`` is resolved against *url*.
    If that response has an ``offset`` smaller than the file size, the upload
    resumes from that offset.

    A chunk whose request raises, or whose response does not report
    ``result == 1``, is sent again (after a 5 second pause for exceptions and
    for non-200 status codes). There is no retry limit. Ctrl-C during a
    request exits the process with status 1.

    Args:
        api: object providing ``_json_request(url, form)``.
        url: URL of the initial upload request.
        filename: path of the local file to upload.
        data: optional mapping of extra form fields for the initial request.

    Returns:
        ``data.get('id', True)`` of the last response if it reports
        ``result == 1``. ``False`` if no ``uploadUrl`` was returned, the
        server answered with status code 403, or the last response did not
        report success.
    """
    form = ox.MultiPartForm()
    if data:
        for key in data:
            form.add_field(key, data[key])
    data = api._json_request(url, form)

    def full_url(path):
        # Resolve a host-relative path against the scheme and host of the
        # initial request URL.
        if path.startswith('/'):
            u = urlparse(url)
            path = '%s://%s%s' % (u.scheme, u.netloc, path)
        return path

    # Guard against an empty/None response instead of failing on `in`.
    if not data or 'uploadUrl' not in data:
        return False

    upload_url = full_url(data['uploadUrl'])
    fname = os.path.basename(filename)
    if not isinstance(fname, bytes):
        fname = fname.encode('utf-8')
    # DEBUG is not defined in the visible part of this file; look it up
    # defensively so an error response cannot raise NameError.
    debug = globals().get('DEBUG', False)

    # The context manager closes the file on every exit path, including the
    # early return on 403 and sys.exit() on Ctrl-C.
    with open(filename, 'rb') as f:
        fsize = os.stat(filename).st_size
        done = 0
        if 'offset' in data and data['offset'] < fsize:
            # Server already holds part of the file: resume from there.
            done = data['offset']
            f.seek(done)
        chunk = f.read(CHUNK_SIZE)
        while chunk:
            sys.stdout.flush()
            form = ox.MultiPartForm()
            form.add_file('chunk', fname, chunk)
            # Mark the final chunk so the server knows the upload is complete.
            if len(chunk) < CHUNK_SIZE or f.tell() == fsize:
                form.add_field('done', '1')
            form.add_field('offset', str(done))
            try:
                data = api._json_request(upload_url, form)
            except KeyboardInterrupt:
                print("\ninterrupted by user.")
                sys.exit(1)
            except Exception:
                print("uploading chunk failed, will try again in 5 seconds\r", end='')
                sys.stdout.flush()
                # result != 1 keeps the current chunk so it is sent again.
                data = {'result': -1}
                time.sleep(5)
            if data and 'status' in data:
                if data['status']['code'] == 403:
                    print("login required")
                    return False
                if data['status']['code'] != 200:
                    print("request returned error, will try again in 5 seconds")
                    if debug:
                        print(data)
                    time.sleep(5)
            if data and data.get('result') == 1:
                done += len(chunk)
                if data.get('offset') not in (None, done):
                    # Trust the server's idea of how much it has received.
                    print('server offset out of sync, continue from', data['offset'])
                    done = data['offset']
                    f.seek(done)
                chunk = f.read(CHUNK_SIZE)

    if data and data.get('result') == 1:
        return data.get('id', True)
    return False
|
|
|
|
def upload_document(pdf):
    """Upload the PDF file at path *pdf* to the API's document upload endpoint.

    The endpoint is ``api.url + 'upload/document/'``, the same one
    ``import_url`` uses. Previously this function read a global ``url`` that
    only exists when the file runs as a script, and discarded the result.

    Args:
        pdf: path of the local PDF file.

    Returns:
        Whatever ``upload_chunks`` returns: the document id (or True) on
        success, False on failure.
    """
    upload_url = api.url + 'upload/document/'
    return upload_chunks(api, upload_url, pdf)
|
|
|
|
def pdfinfo(pdf):
    """Return the metadata the ``pdfinfo`` command prints for *pdf*.

    Each output line of the form ``Key:   value`` becomes one dict entry.
    Empty lines, lines ending in ``:`` (keys without a value) and lines
    without a ``: `` separator are skipped.

    Args:
        pdf: path of the PDF file to inspect.

    Returns:
        dict mapping field names to string values; an empty dict if the
        command cannot be started or exits with an error.
    """
    cmd = ['pdfinfo', pdf]
    try:
        output = subprocess.check_output(cmd)
    except (OSError, ValueError, subprocess.SubprocessError):
        # Tool missing, unusable path, or non-zero exit: best effort, no info.
        return {}
    info = {}
    # Undecodable bytes are replaced rather than discarding all metadata.
    for line in output.decode('utf-8', 'replace').splitlines():
        line = line.strip()
        if not line or line.endswith(':'):
            continue
        parts = re.split(r':\s+', line, maxsplit=1)
        # A line without a separator yields one element; it cannot form a
        # key/value pair and would previously make dict() raise ValueError.
        if len(parts) == 2:
            info[parts[0]] = parts[1]
    return info
|
|
|
|
def import_url(url):
    """Snapshot *url* as a PDF, upload it as a document and set its metadata.

    A URL ending in ``.pdf`` is downloaded as-is, with title and author taken
    from ``pdfinfo``. Any other URL is rendered to PDF with ``url2pdf``, with
    title and author scraped from the page's ``<title>`` and
    ``<meta name="author">`` tags. The PDF is uploaded with ``upload_chunks``
    and the metadata is applied with ``api.editDocument``. The temporary PDF
    file is always removed.

    Args:
        url: address of the page or PDF to import.

    Returns:
        The value returned by ``upload_chunks`` on success; None if no PDF
        could be produced or the upload failed.
    """
    meta = {
        'description': 'PDF snapshot of %s from %s' % (
            url, datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
        'title': 'Untitled'
    }
    upload = False
    fd, pdf = tempfile.mkstemp('.pdf')
    # mkstemp hands back an open OS-level descriptor; the file is written
    # through its path below, so close the descriptor right away.
    os.close(fd)
    try:
        if url.endswith('.pdf'):
            data = ox.cache.read_url(url)
            with open(pdf, 'wb') as out:
                out.write(data)
            upload = True
            info = pdfinfo(pdf)
            for key in ('Title', 'Author'):
                if key in info:
                    meta[key.lower()] = info[key]
        else:
            data = ox.cache.read_url(url, unicode=True)
            title = re.compile('<title>(.*?)</title>').findall(data)
            if title:
                meta['title'] = title[0]
            author = re.compile('<meta name="author" content="(.*?)"').findall(data)
            if author:
                # NOTE(review): this stores the whole list of matches, while
                # the PDF branch stores a single string - confirm which shape
                # the server expects for 'author'.
                meta['author'] = author
            if url2pdf(url, pdf):
                upload = True

        if not upload:
            return None

        upload_url = api.url + 'upload/document/'
        did = upload_chunks(api, upload_url, pdf, {
            'filename': meta['title'] + '.pdf'
        })
        if not did:
            # upload_chunks returns False on failure; do not try to edit a
            # document that does not exist.
            print('upload failed: %s' % url)
            return None
        meta['id'] = did
        api.editDocument(meta)
        return did
    finally:
        # Remove the temporary PDF even when download/render/upload raised.
        os.unlink(pdf)
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: import the single URL given on the command line.
    if len(sys.argv) < 2:
        # Report usage instead of failing with an IndexError traceback.
        print('usage: %s <url>' % sys.argv[0])
        sys.exit(1)
    url = sys.argv[1]
    import_url(url)
|