#!/usr/bin/python3
"""Helpers for rendering URLs to PDF files and uploading files in chunks."""
import os
import re
import subprocess
import sys
import tempfile
import time
from urllib.parse import urlparse
from datetime import datetime

import ox
import ox.api
import ox.cache

# Size in bytes of each piece sent by upload_chunks() (5 MiB).
CHUNK_SIZE = 1024*1024*5

# upload_chunks() reads DEBUG to decide whether to dump error responses; it is
# not defined anywhere in the visible part of this file, so give it a default.
# A later module-level assignment still overrides this value.
DEBUG = False

# Module-level API client; note that this signs in at import time.
api = ox.api.signin('https://amp.0x2620.org/api/')


def url2pdf(url, pdf):
    """Render *url* into the file *pdf* using headless ``chromium-browser``.

    Returns True when the browser process exits with status 0.
    """
    cmd = [
        'chromium-browser',
        '--headless',
        '--disable-gpu',
        '--print-to-pdf=' + pdf,
        url,
        '--timeout=10000',
    ]
    return subprocess.call(cmd) == 0


def upload_chunks(api, url, filename, data=None):
    """Upload *filename* through *api* in pieces of CHUNK_SIZE bytes.

    The fields of *data* (if any) are posted to *url* first. When the response
    contains an ``uploadUrl``, the file is sent chunk by chunk to that address,
    starting at the ``offset`` from the response if it is smaller than the
    file size. A chunk is sent again (after a 5 second pause on errors) until
    a response with ``result == 1`` is received.

    Returns the ``id`` of the last response (True if it has none) when that
    response reports ``result == 1``. Returns False when no ``uploadUrl`` was
    offered, when a response carries status code 403, or when the last
    response does not report success. Calls ``sys.exit(1)`` if interrupted
    with Ctrl-C while a chunk request is in flight.
    """
    def full_url(path):
        # An upload path starting with '/' is resolved against scheme and
        # host of the original request URL.
        if path.startswith('/'):
            u = urlparse(url)
            path = '%s://%s%s' % (u.scheme, u.netloc, path)
        return path

    form = ox.MultiPartForm()
    if data:
        for key, value in data.items():
            form.add_field(key, value)
    data = api._json_request(url, form)
    if 'uploadUrl' not in data:
        return False

    upload_url = full_url(data['uploadUrl'])
    fname = os.path.basename(filename)
    if not isinstance(fname, bytes):
        fname = fname.encode('utf-8')

    # The context manager closes the file on every exit path (403 return,
    # sys.exit, unexpected exception), which the original never did.
    with open(filename, 'rb') as f:
        fsize = os.stat(filename).st_size
        done = 0
        if 'offset' in data and data['offset'] < fsize:
            # Resume where the server says it already has data.
            done = data['offset']
            f.seek(done)
        chunk = f.read(CHUNK_SIZE)
        while chunk:
            sys.stdout.flush()
            form = ox.MultiPartForm()
            form.add_file('chunk', fname, chunk)
            if len(chunk) < CHUNK_SIZE or f.tell() == fsize:
                # Last piece of the file: tell the server we are finished.
                form.add_field('done', '1')
            form.add_field('offset', str(done))
            try:
                data = api._json_request(upload_url, form)
            except KeyboardInterrupt:
                print("\ninterrupted by user.")
                sys.exit(1)
            except Exception:
                # Best effort: any request failure is retried with the same
                # chunk after a pause (result -1 keeps the chunk unchanged).
                print("uploading chunk failed, will try again in 5 seconds\r", end='')
                sys.stdout.flush()
                data = {'result': -1}
                time.sleep(5)
            if data and 'status' in data:
                if data['status']['code'] == 403:
                    print("login required")
                    return False
                if data['status']['code'] != 200:
                    print("request returned error, will try again in 5 seconds")
                    if DEBUG:
                        print(data)
                    time.sleep(5)
            if data and data.get('result') == 1:
                done += len(chunk)
                if data.get('offset') not in (None, done):
                    # Server disagrees about how much it has; follow it.
                    print('server offset out of sync, continue from', data['offset'])
                    done = data['offset']
                    f.seek(done)
                # Only advance to the next chunk after an acknowledged upload.
                chunk = f.read(CHUNK_SIZE)

    if data and data.get('result') == 1:
        return data.get('id', True)
    return False


def upload_document(pdf):
    """Pass *pdf* to upload_chunks() using the module-level ``api`` client.

    NOTE(review): ``url`` is not defined in this function nor in the visible
    module scope, so this call raises NameError unless a global ``url`` is
    set elsewhere, and the value returned by upload_chunks() is discarded.
    This looks unfinished -- confirm the intended upload endpoint.
    """
    document_id = upload_chunks(api, url, pdf)


def pdfinfo(pdf):
    """Return the ``key: value`` pairs printed by the ``pdfinfo`` tool for *pdf*.

    Lines that are empty, end with ':' or contain no ``': '`` separator are
    skipped. Returns an empty dict when the tool cannot be started, exits with
    an error, or its output cannot be decoded.
    """
    cmd = ['pdfinfo', pdf]
    try:
        output = subprocess.check_output(cmd).decode().strip()
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError):
        return {}
    info = {}
    for line in output.split('\n'):
        line = line.strip()
        if not line or line.endswith(':'):
            continue
        parts = re.split(r':\s+', line, maxsplit=1)
        # A line without a separator yields a single element; building the
        # dict from it would raise ValueError, so skip it instead.
        if len(parts) == 2:
            info[parts[0]] = parts[1]
    return info


# NOTE(review): the original continues here with ``def import_url(url):``, but
# that definition is cut off mid-statement in the reviewed excerpt (its regex
# literals also appear to have lost their markup), so it is deliberately not
# rewritten. The visible fragment is kept verbatim for reference:
#
#   def import_url(url): meta = { 'description': 'PDF snapshot of %s from %s' %
#   (url, datetime.now().strftime('%Y-%m-%d %H:%M:%S')), 'title': 'Untitled' }
#   upload = False fd, pdf = tempfile.mkstemp('.pdf') if url.endswith('.pdf'):
#   data = ox.cache.read_url(url) with open(pdf, 'wb') as fd: fd.write(data)
#   upload = True info = pdfinfo(pdf) for key in ('Title', 'Author'): if key in
#   info: meta[key.lower()] = info[key] else: data = ox.cache.read_url(url,
#   unicode=True) title = re.compile('(.*?)').findall(data) if title:
#   meta['title'] = title[0] author = re.compile('