#!/usr/bin/python3
"""Render a web page to PDF with headless Chromium and upload it in chunks.

Reconstructed from a git patch that adds ``add_website_as_pdf.py`` (the same
patch also adds an empty ``__init__.py``).
"""
import os
import re
import subprocess
import sys
import tempfile
import time
from urllib.parse import urlparse
from datetime import datetime

import ox
import ox.api
import ox.cache

# Size in bytes of each piece sent by upload_chunks (5 MiB).
CHUNK_SIZE = 1024*1024*5

# When true, upload_chunks prints the full server response on error statuses.
DEBUG = False

# Signed-in API client, created at import time.
api = ox.api.signin('https://amp.0x2620.org/api/')


def url2pdf(url, pdf):
    """Print the page at *url* to the file *pdf* using headless Chromium.

    Returns True when the ``chromium-browser`` process exits with status 0,
    False otherwise.
    """
    cmd = [
        'chromium-browser',
        '--headless',
        '--disable-gpu',
        '--print-to-pdf=' + pdf,
        url,
        '--timeout=10000',
    ]
    return subprocess.call(cmd) == 0


def upload_chunks(api, url, filename, data=None):
    """Upload *filename* to *url* in pieces of CHUNK_SIZE bytes.

    A first request carrying the optional *data* fields is sent to *url*; the
    response is expected to contain an ``uploadUrl`` (absolute, or a path that
    is resolved against *url*) and may contain an ``offset`` to resume from.
    Each chunk is then posted to the upload URL and re-sent every 5 seconds
    until the server answers with ``result == 1``.

    Args:
        api: client object providing ``_json_request(url, form)``.
        url: endpoint for the initial request.
        filename: path of the local file to upload.
        data: optional mapping of extra form fields for the initial request.

    Returns:
        The ``id`` from the final server response (or True if it has none) on
        success; False if no upload URL was offered, a 403 status was
        returned, or the final response does not report ``result == 1``.

    Exits the process with status 1 if interrupted by the user mid-request.
    """
    form = ox.MultiPartForm()
    if data:
        for key, value in data.items():
            form.add_field(key, value)
    response = api._json_request(url, form)

    def full_url(path):
        # Server may hand back a host-relative path; anchor it to *url*.
        if path.startswith('/'):
            u = urlparse(url)
            path = '%s://%s%s' % (u.scheme, u.netloc, path)
        return path

    if 'uploadUrl' not in response:
        return False

    upload_url = full_url(response['uploadUrl'])
    fsize = os.stat(filename).st_size
    fname = os.path.basename(filename)
    if not isinstance(fname, bytes):
        fname = fname.encode('utf-8')

    # The context manager guarantees the file is closed on every exit path
    # (normal return, 403 early return, or sys.exit on interrupt).
    with open(filename, 'rb') as f:
        done = 0
        if 'offset' in response and response['offset'] < fsize:
            # Resume a partial upload where the server says it stopped.
            done = response['offset']
            f.seek(done)
        chunk = f.read(CHUNK_SIZE)
        while chunk:
            sys.stdout.flush()
            form = ox.MultiPartForm()
            form.add_file('chunk', fname, chunk)
            if len(chunk) < CHUNK_SIZE or f.tell() == fsize:
                # Last chunk: tell the server the upload is complete.
                form.add_field('done', '1')
            form.add_field('offset', str(done))
            try:
                response = api._json_request(upload_url, form)
            except KeyboardInterrupt:
                print("\ninterrupted by user.")
                sys.exit(1)
            except Exception:
                print("uploading chunk failed, will try again in 5 seconds\r", end='')
                sys.stdout.flush()
                # Marker response so the same chunk is retried below.
                response = {'result': -1}
                time.sleep(5)
            if response and 'status' in response:
                code = response['status']['code']
                if code == 403:
                    print("login required")
                    return False
                if code != 200:
                    print("request returned error, will try again in 5 seconds")
                    if DEBUG:
                        print(response)
                    time.sleep(5)
            if response and response.get('result') == 1:
                done += len(chunk)
                if response.get('offset') not in (None, done):
                    print('server offset out of sync, continue from', response['offset'])
                    done = response['offset']
                    f.seek(done)
                # Only advance after success; otherwise the chunk is re-sent.
                chunk = f.read(CHUNK_SIZE)

    if response and 'result' in response and response.get('result') == 1:
        return response.get('id', True)
    return False


def upload_document(pdf):
    """Upload the file *pdf* via upload_chunks and return its result."""
    # NOTE(review): `url` is not defined in this function or in the visible
    # part of the module; presumably a module-level upload endpoint is meant.
    # Confirm against the rest of the file.
    document_id = upload_chunks(api, url, pdf)
    return document_id


def import_url(url):
    # NOTE(review): this function continues beyond the visible source; only
    # its leading statements are reproduced here, unchanged.
    meta = {
        'description': 'PDF snapshot of %s from %s' % (url, datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
        'title': 'Untitled'
    }
    data = ox.cache.read_url(url, unicode=True)
    # NOTE(review): the pattern below looks like it may have lost surrounding
    # markup during extraction; verify against the original file.
    title = re.compile('(.*?)').findall(data)
    if title:
        meta['title'] = title[0]
    # The visible source ends here, mid-statement, with: author= re.compile('