# pandora_amp/add_website_as_pdf.py
# (header reconstructed from web-scraped page chrome; last commit 2018-11-05 19:45:42 +00:00)
#!/usr/bin/python3
import os
import re
import subprocess
import sys
import tempfile
import time
from urllib.parse import urlparse
from datetime import datetime
import ox
import ox.api
import ox.cache
CHUNK_SIZE = 1024*1024*5
api = ox.api.signin('https://amp.0x2620.org/api/')
def url2pdf(url, pdf):
cmd = ['chromium-browser', '--headless', '--disable-gpu', '--print-to-pdf=' + pdf, url]
cmd += ['--timeout=10000']
return subprocess.call(cmd) == 0
def upload_chunks(api, url, filename, data=None):
form = ox.MultiPartForm()
if data:
for key in data:
form.add_field(key, data[key])
data = api._json_request(url, form)
def full_url(path):
if path.startswith('/'):
u = urlparse(url)
path = '%s://%s%s' % (u.scheme, u.netloc, path)
return path
if 'uploadUrl' in data:
uploadUrl = full_url(data['uploadUrl'])
f = open(filename, 'rb')
fsize = os.stat(filename).st_size
done = 0
start = time.mktime(time.localtime())
if 'offset' in data and data['offset'] < fsize:
done = data['offset']
f.seek(done)
resume_offset = done
else:
resume_offset = 0
chunk = f.read(CHUNK_SIZE)
fname = os.path.basename(filename)
if not isinstance(fname, bytes):
fname = fname.encode('utf-8')
while chunk:
sys.stdout.flush()
form = ox.MultiPartForm()
form.add_file('chunk', fname, chunk)
if len(chunk) < CHUNK_SIZE or f.tell() == fsize:
form.add_field('done', '1')
form.add_field('offset', str(done))
try:
data = api._json_request(uploadUrl, form)
except KeyboardInterrupt:
print("\ninterrupted by user.")
sys.exit(1)
except:
print("uploading chunk failed, will try again in 5 seconds\r", end='')
sys.stdout.flush()
data = {'result': -1}
time.sleep(5)
if data and 'status' in data:
if data['status']['code'] == 403:
print("login required")
return False
if data['status']['code'] != 200:
print("request returned error, will try again in 5 seconds")
if DEBUG:
print(data)
time.sleep(5)
if data and data.get('result') == 1:
done += len(chunk)
if data.get('offset') not in (None, done):
print('server offset out of sync, continue from', data['offset'])
done = data['offset']
f.seek(done)
chunk = f.read(CHUNK_SIZE)
if data and 'result' in data and data.get('result') == 1:
return data.get('id', True)
else:
return False
return False
def upload_document(pdf):
document_id = upload_chunks(api, url, pdf)
# (commit 2018-11-16 13:30:12 +00:00)
def pdfinfo(pdf):
cmd = ['pdfinfo', pdf]
try:
r = subprocess.check_output(cmd).decode().strip().split('\n')
except:
return {}
info = [
re.split(':\s+', l.strip(), 1)
for l in r
if l.strip() and not l.strip().endswith(':')
]
return dict(info)
# (commit 2018-11-05 19:45:42 +00:00)
def import_url(url):
meta = {
'description': 'PDF snapshot of %s from %s' % (url, datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
'title': 'Untitled'
}
2018-11-16 13:30:12 +00:00
upload = False
2018-11-05 19:45:42 +00:00
fd, pdf = tempfile.mkstemp('.pdf')
2018-11-16 13:30:12 +00:00
if url.endswith('.pdf'):
data = ox.cache.read_url(url)
with open(pdf, 'wb') as fd:
fd.write(data)
upload = True
info = pdfinfo(pdf)
for key in ('Title', 'Author'):
if key in info:
meta[key.lower()] = info[key]
else:
data = ox.cache.read_url(url, unicode=True)
title = re.compile('<title>(.*?)</title>').findall(data)
if title:
meta['title'] = title[0]
author = re.compile('<meta name="author" content="(.*?)"').findall(data)
if author:
meta['author'] = author
if url2pdf(url, pdf):
upload = True
if upload:
2018-11-05 19:45:42 +00:00
url = api.url + 'upload/document/'
did = upload_chunks(api, url, pdf, {
'filename': meta['title'] + '.pdf'
})
meta['id'] = did
r = api.editDocument(meta)
os.unlink(pdf)
if __name__ == '__main__':
import sys
url = sys.argv[1]
import_url(url)