local app
This commit is contained in:
parent
d255ed6a03
commit
a775ed3055
7 changed files with 314 additions and 3 deletions
123
add_website_as_pdf.py
Normal file
123
add_website_as_pdf.py
Normal file
|
|
@ -0,0 +1,123 @@
|
|||
#!/usr/bin/python3
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
from urllib.parse import urlparse
|
||||
from datetime import datetime
|
||||
|
||||
import ox
|
||||
import ox.api
|
||||
import ox.cache
|
||||
|
||||
# Size of each upload chunk: 5 MB per request.
CHUNK_SIZE = 5 * 1024 * 1024

# Module-level API client session for the amp.0x2620.org backend.
api = ox.api.signin('https://amp.0x2620.org/api/')
|
||||
|
||||
def url2pdf(url, pdf):
    """Render *url* to a PDF file at path *pdf* with headless Chromium.

    Returns True when the chromium process exits with status 0.
    """
    command = [
        'chromium-browser',
        '--headless',
        '--disable-gpu',
        '--print-to-pdf=' + pdf,
        url,
        '--timeout=10000',
    ]
    return subprocess.call(command) == 0
|
||||
|
||||
def upload_chunks(api, url, filename, data=None):
    """Upload *filename* to *url* in CHUNK_SIZE pieces.

    First posts the optional *data* fields to *url* to obtain an upload
    URL (and, for resumable uploads, a byte offset already received),
    then streams the file chunk by chunk, retrying a failed chunk every
    5 seconds.

    Returns the server-side id on success (or True when the final
    response carries no 'id'), False on failure or when no uploadUrl is
    returned.
    """
    form = ox.MultiPartForm()
    if data:
        for key in data:
            form.add_field(key, data[key])
    # Initial request: server is expected to answer with 'uploadUrl'
    # and possibly an 'offset' to resume from.
    data = api._json_request(url, form)

    def full_url(path):
        # The server may return a relative uploadUrl; resolve it against
        # the scheme and host of the original request url.
        if path.startswith('/'):
            u = urlparse(url)
            path = '%s://%s%s' % (u.scheme, u.netloc, path)
        return path

    if 'uploadUrl' not in data:
        return False

    uploadUrl = full_url(data['uploadUrl'])
    fsize = os.stat(filename).st_size
    done = 0
    # Context manager guarantees the file handle is closed even when we
    # bail out early (login failure, KeyboardInterrupt via sys.exit).
    with open(filename, 'rb') as f:
        if 'offset' in data and data['offset'] < fsize:
            # Resume a partial upload where the server left off.
            done = data['offset']
            f.seek(done)
        chunk = f.read(CHUNK_SIZE)
        fname = os.path.basename(filename)
        if not isinstance(fname, bytes):
            fname = fname.encode('utf-8')
        while chunk:
            sys.stdout.flush()
            form = ox.MultiPartForm()
            form.add_file('chunk', fname, chunk)
            # Last chunk: tell the server the upload is complete.
            if len(chunk) < CHUNK_SIZE or f.tell() == fsize:
                form.add_field('done', '1')
            form.add_field('offset', str(done))
            try:
                data = api._json_request(uploadUrl, form)
            except KeyboardInterrupt:
                print("\ninterrupted by user.")
                sys.exit(1)
            except Exception:
                # Transient failure: mark the chunk failed and retry it
                # after a short pause (done is not advanced below).
                print("uploading chunk failed, will try again in 5 seconds\r", end='')
                sys.stdout.flush()
                data = {'result': -1}
                time.sleep(5)
            if data and 'status' in data:
                if data['status']['code'] == 403:
                    print("login required")
                    return False
                if data['status']['code'] != 200:
                    print("request returned error, will try again in 5 seconds")
                    # DEBUG is not defined in this module; the original
                    # `if DEBUG:` raised NameError here. Fall back to a
                    # module-global lookup so an error response no longer
                    # crashes the upload loop.
                    if globals().get('DEBUG'):
                        print(data)
                    time.sleep(5)
            if data and data.get('result') == 1:
                done += len(chunk)
                if data.get('offset') not in (None, done):
                    # Server and client disagree on progress; trust the
                    # server and reposition the file pointer.
                    print('server offset out of sync, continue from', data['offset'])
                    done = data['offset']
                    f.seek(done)
            chunk = f.read(CHUNK_SIZE)

    if data and data.get('result') == 1:
        return data.get('id', True)
    return False
|
||||
|
||||
def upload_document(pdf):
    """Upload *pdf* via upload_chunks and return the resulting document id.

    NOTE(review): relies on a module-global ``url`` that is only assigned
    in the __main__ block, and this function has no caller in this file —
    looks dead; confirm before relying on it. The original discarded the
    upload result; it is now returned so callers can use the id.
    """
    return upload_chunks(api, url, pdf)
|
||||
|
||||
|
||||
def import_url(url):
    """Snapshot *url* as a PDF and upload it as a document.

    Scrapes the page's <title> and author meta tag for metadata, renders
    the page with headless chromium, uploads the PDF in chunks, attaches
    the metadata and prints the resulting document id.
    """
    meta = {
        'description': 'PDF snapshot of %s from %s' % (
            url, datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
        'title': 'Untitled'
    }
    data = ox.cache.read_url(url, unicode=True)
    title = re.compile('<title>(.*?)</title>').findall(data)
    if title:
        meta['title'] = title[0]
    author = re.compile('<meta name="author" content="(.*?)"').findall(data)
    if author:
        # findall returns a list; store the first matched string, not the
        # list itself (the original stored the whole list).
        meta['author'] = author[0]
    fd, pdf = tempfile.mkstemp('.pdf')
    # mkstemp leaves the descriptor open; close it here since chromium
    # writes to the file by path (fixes a descriptor leak).
    os.close(fd)
    try:
        if url2pdf(url, pdf):
            upload_url = api.url + 'upload/document/'
            did = upload_chunks(api, upload_url, pdf, {
                'filename': meta['title'] + '.pdf'
            })
            meta['id'] = did
            r = api.editDocument(meta)
            print(r['data']['id'])
    finally:
        # Remove the temp file even when rendering fails (the original
        # leaked it on the failure path).
        os.unlink(pdf)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # sys is already imported at the top of the file; the original
    # re-imported it here redundantly.
    # Fail with a usage message instead of an IndexError when the url
    # argument is missing.
    if len(sys.argv) != 2:
        print('usage: %s url' % sys.argv[0])
        sys.exit(1)
    url = sys.argv[1]
    import_url(url)
|
||||
Loading…
Add table
Add a link
Reference in a new issue