# pandora_amp/add_website_as_pdf.py
# (header reconstructed from web-scraped page chrome; last commit 2018-11-05 19:45:42 +00:00)
#!/usr/bin/python3
import os
import re
import subprocess
import sys
import tempfile
import time
from urllib.parse import urlparse
from datetime import datetime
import ox
import ox.api
import ox.cache
CHUNK_SIZE = 1024*1024*5
api = ox.api.signin('https://amp.0x2620.org/api/')
def url2pdf(url, pdf):
cmd = ['chromium-browser', '--headless', '--disable-gpu', '--print-to-pdf=' + pdf, url]
cmd += ['--timeout=10000']
return subprocess.call(cmd) == 0
def upload_chunks(api, url, filename, data=None):
form = ox.MultiPartForm()
if data:
for key in data:
form.add_field(key, data[key])
data = api._json_request(url, form)
def full_url(path):
if path.startswith('/'):
u = urlparse(url)
path = '%s://%s%s' % (u.scheme, u.netloc, path)
return path
if 'uploadUrl' in data:
uploadUrl = full_url(data['uploadUrl'])
f = open(filename, 'rb')
fsize = os.stat(filename).st_size
done = 0
start = time.mktime(time.localtime())
if 'offset' in data and data['offset'] < fsize:
done = data['offset']
f.seek(done)
resume_offset = done
else:
resume_offset = 0
chunk = f.read(CHUNK_SIZE)
fname = os.path.basename(filename)
if not isinstance(fname, bytes):
fname = fname.encode('utf-8')
while chunk:
sys.stdout.flush()
form = ox.MultiPartForm()
form.add_file('chunk', fname, chunk)
if len(chunk) < CHUNK_SIZE or f.tell() == fsize:
form.add_field('done', '1')
form.add_field('offset', str(done))
try:
data = api._json_request(uploadUrl, form)
except KeyboardInterrupt:
print("\ninterrupted by user.")
sys.exit(1)
except:
print("uploading chunk failed, will try again in 5 seconds\r", end='')
sys.stdout.flush()
data = {'result': -1}
time.sleep(5)
if data and 'status' in data:
if data['status']['code'] == 403:
print("login required")
return False
if data['status']['code'] != 200:
print("request returned error, will try again in 5 seconds")
if DEBUG:
print(data)
time.sleep(5)
if data and data.get('result') == 1:
done += len(chunk)
if data.get('offset') not in (None, done):
print('server offset out of sync, continue from', data['offset'])
done = data['offset']
f.seek(done)
chunk = f.read(CHUNK_SIZE)
if data and 'result' in data and data.get('result') == 1:
return data.get('id', True)
else:
return False
return False
def upload_document(pdf):
document_id = upload_chunks(api, url, pdf)
# (commit 2018-11-16 13:30:12 +00:00)
def pdfinfo(pdf):
cmd = ['pdfinfo', pdf]
try:
r = subprocess.check_output(cmd).decode().strip().split('\n')
except:
return {}
info = [
re.split(':\s+', l.strip(), 1)
for l in r
if l.strip() and not l.strip().endswith(':')
]
return dict(info)
# (commit 2018-11-05 19:45:42 +00:00)
def import_url(url):
meta = {
'description': 'PDF snapshot of %s from %s' % (url, datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
'title': 'Untitled'
}
2018-11-16 13:30:12 +00:00
upload = False
2018-11-05 19:45:42 +00:00
fd, pdf = tempfile.mkstemp('.pdf')
2018-11-16 13:30:12 +00:00
if url.endswith('.pdf'):
data = ox.cache.read_url(url)
with open(pdf, 'wb') as fd:
fd.write(data)
upload = True
info = pdfinfo(pdf)
for key in ('Title', 'Author'):
if key in info:
meta[key.lower()] = info[key]
else:
data = ox.cache.read_url(url, unicode=True)
title = re.compile('<title>(.*?)</title>').findall(data)
if title:
meta['title'] = title[0]
author = re.compile('<meta name="author" content="(.*?)"').findall(data)
if author:
meta['author'] = author
if url2pdf(url, pdf):
upload = True
if upload:
2018-11-05 19:45:42 +00:00
url = api.url + 'upload/document/'
did = upload_chunks(api, url, pdf, {
'filename': meta['title'] + '.pdf'
})
meta['id'] = did
r = api.editDocument(meta)
os.unlink(pdf)
if __name__ == '__main__':
import sys
url = sys.argv[1]
import_url(url)