43 lines
1.1 KiB
Python
43 lines
1.1 KiB
Python
# -*- coding: utf-8 -*-
|
|
import os
|
|
import re
|
|
import subprocess
|
|
|
|
import ox
|
|
|
|
from item.utils import sort_title, sort_string, get_by_id
|
|
|
|
def pdfpages(pdf):
    """Return the number of pages of *pdf* as an int.

    The value is taken from the ``pages`` entry of the dict produced by
    ``pdfinfo(pdf)``; if that entry is missing, ``0`` is returned.
    """
    info = pdfinfo(pdf)
    page_count = info.get('pages', '0')
    return int(page_count)
|
|
|
|
def pdfinfo(pdf):
    """Run the ``pdfinfo`` command on *pdf* and return its output as a dict.

    Standard output is decoded as UTF-8 (undecodable bytes are replaced) and
    processed line by line. Each line is cut at its first colon: the part
    before it, lower-cased and stripped, becomes the key, and the stripped
    remainder becomes the value. Lines with an empty key are skipped.
    Standard error is captured but not used.
    """
    process = subprocess.Popen(
        ['pdfinfo', pdf],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        close_fds=True
    )
    output, _errors = process.communicate()
    text = output.decode('utf-8', 'replace').strip()

    info = {}
    for row in text.split('\n'):
        # Only the first colon separates key from value; later colons
        # (e.g. inside timestamps) stay part of the value.
        name, _, value = row.partition(':')
        name = name.lower().strip()
        if not name:
            continue
        info[name] = value.strip()
    return info
|
|
|
|
def extract_pdfpage(pdf, image, page):
    """Render a single page of *pdf* to a JPEG by running ``pdftocairo``.

    Parameters:
        pdf: path of the source PDF file.
        image: path of the target image. Its extension is stripped before
            it is handed to pdftocairo as the output name (presumably
            because pdftocairo appends the extension itself in
            ``-singlefile`` mode; confirm against the pdftocairo manual).
        page: page number to render; converted to ``str`` and used as both
            the first (``-f``) and last (``-l``) page.

    Returns:
        *image*, unchanged. The exit status of pdftocairo is not checked,
        so the return value does not indicate that the file was written.
    """
    page = str(page)
    cmd = [
        'pdftocairo',
        '-cropbox',
        '-jpeg',
        '-f', page, '-l', page,
        '-singlefile',
        '-scale-to', '2048',
        pdf,
        os.path.splitext(image)[0]
    ]
    # Discard the command's stdout. The null device is opened in a ``with``
    # block so its handle is closed after every call instead of leaking, and
    # os.devnull is used rather than a hard-coded POSIX path.
    with open(os.devnull, 'wb') as devnull:
        subprocess.call(cmd, stdout=devnull)
    return image
|
|
|
|
def get_documents(text):
    """Return the ids of all documents referenced in *text*.

    A reference is ``/documents/<ID>`` or ``/document/<ID>``, where ``<ID>``
    is a run of upper-case ASCII letters. Each distinct ``<ID>`` is converted
    with ``ox.fromAZ`` and appears once in the result.

    The result is ordered by first appearance in *text*, so the same input
    always yields the same list (iterating a ``set`` of strings does not
    guarantee a stable order between interpreter runs).
    """
    seen = set()
    documents = []
    # One pattern covers both the plural and the singular URL form.
    for az_id in re.findall(r'/documents?/([A-Z]+)', text):
        if az_id in seen:
            continue
        seen.add(az_id)
        documents.append(ox.fromAZ(az_id))
    return documents
|