pandora_amp/add_website_as_pdf.py

#!/usr/bin/python3
import os
import re
import subprocess
import sys
import tempfile
import time
from urllib.parse import urlparse
from datetime import datetime

import ox
import ox.api
import ox.cache

# Number of bytes sent per upload request (5 MiB).
CHUNK_SIZE = 1024*1024*5

# When true, upload_chunks() prints the server response after a failed request.
DEBUG = False

# Shared API client, created by signing in when the module is loaded.
api = ox.api.signin('https://amp.0x2620.org/api/')

def url2pdf(url, pdf):
    """Print the page at ``url`` to the file ``pdf`` with headless Chromium.

    Returns True if the ``chromium-browser`` process exits with status 0,
    False otherwise.
    """
    command = [
        'chromium-browser',
        '--headless',
        '--disable-gpu',
        '--print-to-pdf=' + pdf,
        url,
        '--timeout=10000',
    ]
    exit_status = subprocess.call(command)
    return exit_status == 0

def upload_chunks(api, url, filename, data=None):
    """Upload a local file to the server in CHUNK_SIZE pieces.

    An initial request carrying the optional ``data`` fields is posted to
    ``url``. If the response contains an ``uploadUrl``, the file is sent to
    that address chunk by chunk; a chunk whose request fails is retried
    after a 5 second pause until it is accepted.

    Args:
        api: client object providing ``_json_request(url, form)``.
        url: endpoint that receives the initial request.
        filename: path of the local file to upload.
        data: optional dict of extra form fields for the initial request.

    Returns:
        The ``id`` of the final response (``True`` if it has none) when the
        server reports ``result == 1``. ``False`` if no ``uploadUrl`` was
        returned, if the server answers with status code 403, or if the
        last response does not report success.
    """
    form = ox.MultiPartForm()
    if data:
        for key in data:
            form.add_field(key, data[key])
    data = api._json_request(url, form)

    def full_url(path):
        # A path starting with '/' is resolved against scheme and host of url.
        if path.startswith('/'):
            u = urlparse(url)
            path = '%s://%s%s' % (u.scheme, u.netloc, path)
        return path

    if 'uploadUrl' not in data:
        return False

    uploadUrl = full_url(data['uploadUrl'])
    fname = os.path.basename(filename)
    if not isinstance(fname, bytes):
        fname = fname.encode('utf-8')
    # DEBUG is an optional module-level switch; treat it as off when absent.
    debug = globals().get('DEBUG', False)

    # The context manager closes the file on every exit path, including the
    # early return on 403 and sys.exit() on Ctrl-C.
    with open(filename, 'rb') as f:
        fsize = os.stat(filename).st_size
        done = 0
        # Resume from the offset reported by the server, if it is usable.
        if 'offset' in data and data['offset'] < fsize:
            done = data['offset']
            f.seek(done)
        chunk = f.read(CHUNK_SIZE)
        while chunk:
            sys.stdout.flush()
            form = ox.MultiPartForm()
            form.add_file('chunk', fname, chunk)
            # Mark the last piece of the file with a 'done' field.
            if len(chunk) < CHUNK_SIZE or f.tell() == fsize:
                form.add_field('done', '1')
            form.add_field('offset', str(done))
            try:
                data = api._json_request(uploadUrl, form)
            except KeyboardInterrupt:
                print("\ninterrupted by user.")
                sys.exit(1)
            except Exception:
                print("uploading chunk failed, will try again in 5 seconds\r", end='')
                sys.stdout.flush()
                # Placeholder response so the same chunk is sent again.
                data = {'result': -1}
                time.sleep(5)
            if data and 'status' in data:
                if data['status']['code'] == 403:
                    print("login required")
                    return False
                if data['status']['code'] != 200:
                    print("request returned error, will try again in 5 seconds")
                    if debug:
                        print(data)
                    time.sleep(5)
            # Only advance once the server has accepted the current chunk.
            if data and data.get('result') == 1:
                done += len(chunk)
                if data.get('offset') not in (None, done):
                    print('server offset out of sync, continue from', data['offset'])
                    done = data['offset']
                    f.seek(done)
                chunk = f.read(CHUNK_SIZE)

    if data and data.get('result') == 1:
        return data.get('id', True)
    return False

def upload_document(pdf):
    """Upload the PDF at path ``pdf`` and return the result of upload_chunks().

    Args:
        pdf: path of the local PDF file to upload.

    Returns:
        Whatever upload_chunks() returns: the document id (or ``True``) on
        success, ``False`` on failure.
    """
    # Same endpoint that import_url() uploads to.
    upload_url = api.url + 'upload/document/'
    # NOTE(review): import_url() also sends a 'filename' field with the
    # initial request; confirm whether the server requires it here too.
    return upload_chunks(api, upload_url, pdf)

def pdfinfo(pdf):
    """Return the fields printed by the ``pdfinfo`` tool as a dict.

    Every ``Key:   value`` line of the output becomes one entry. Lines that
    end in ``:`` (no value) and lines that do not split into a key and a
    value are skipped.

    Args:
        pdf: path of the PDF file to inspect.

    Returns:
        dict mapping field names to their string values, or ``{}`` if
        ``pdfinfo`` cannot be started, exits with a non-zero status, or
        produces output that cannot be decoded.
    """
    cmd = ['pdfinfo', pdf]
    try:
        output = subprocess.check_output(cmd).decode()
    except (OSError, ValueError, subprocess.CalledProcessError):
        # OSError: tool not runnable; CalledProcessError: non-zero exit;
        # ValueError: undecodable output (UnicodeDecodeError) or bad argument.
        return {}
    info = {}
    for line in output.strip().split('\n'):
        line = line.strip()
        if not line or line.endswith(':'):
            continue
        # Split on the first "colon + whitespace" only, so values may
        # themselves contain ": ".
        parts = re.split(r':\s+', line, maxsplit=1)
        if len(parts) == 2:
            info[parts[0]] = parts[1]
    return info

def import_url(url):
    """Snapshot ``url`` as a PDF and upload it as a document.

    A URL ending in ``.pdf`` is downloaded unchanged and its title/author
    are read with pdfinfo(). Any other URL is fetched as HTML to extract the
    title and author, then rendered to PDF with url2pdf(). If a PDF was
    obtained, it is uploaded with upload_chunks() and the collected metadata
    is stored with ``api.editDocument``. The temporary PDF file is always
    removed before returning.

    Args:
        url: address of the web page or PDF to import.
    """
    meta = {
        'description': 'PDF snapshot of %s from %s' % (url, datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
        'title': 'Untitled'
    }
    # mkstemp() returns an open descriptor; the file is only used by path
    # below, so close the descriptor immediately instead of leaking it.
    handle, pdf = tempfile.mkstemp('.pdf')
    os.close(handle)
    try:
        upload = False
        if url.endswith('.pdf'):
            data = ox.cache.read_url(url)
            with open(pdf, 'wb') as out:
                out.write(data)
            upload = True
            info = pdfinfo(pdf)
            for key in ('Title', 'Author'):
                if key in info:
                    meta[key.lower()] = info[key]
        else:
            data = ox.cache.read_url(url, unicode=True)
            title = re.compile('<title>(.*?)</title>').findall(data)
            if title:
                meta['title'] = title[0]
            author = re.compile('<meta name="author" content="(.*?)"').findall(data)
            if author:
                # NOTE(review): findall() returns a list, so the author is a
                # list here but a string in the PDF branch; confirm which
                # form the server expects.
                meta['author'] = author
            if url2pdf(url, pdf):
                upload = True

        if not upload:
            return

        upload_url = api.url + 'upload/document/'
        did = upload_chunks(api, upload_url, pdf, {
            'filename': meta['title'] + '.pdf'
        })
        if not did:
            # upload_chunks() returns False on failure; there is no document
            # to attach the metadata to.
            print('upload failed')
            return
        meta['id'] = did
        api.editDocument(meta)
    finally:
        # Remove the temporary file on success, failure and exceptions alike.
        os.unlink(pdf)


if __name__ == '__main__':
    # Script entry point: the first command line argument is the URL to import.
    if len(sys.argv) < 2:
        print('usage: %s <url>' % sys.argv[0], file=sys.stderr)
        sys.exit(1)
    url = sys.argv[1]
    import_url(url)