openmedialibrary/oml/media/pdf.py

192 lines
5.5 KiB
Python

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import sys
import tempfile
import subprocess
import os
import shutil
from glob import glob
from PyPDF2 import PdfFileReader
import stdnum.isbn
import settings
from utils import normalize_isbn, find_isbns
import logging
logger = logging.getLogger(__name__)
def cover(pdf):
if sys.platform == 'darwin':
return ql_cover(pdf)
else:
return page(pdf, 1)
def ql_cover(pdf):
tmp = tempfile.mkdtemp()
cmd = [
'qlmanage',
'-t',
'-s',
'1024',
'-o',
tmp,
pdf
]
p = subprocess.Popen(cmd, close_fds=True)
p.wait()
image = glob('%s/*' % tmp)
if image:
image = image[0]
with open(image, 'rb') as fd:
data = fd.read()
else:
logger.debug('qlmanage did not create cover for %s', pdf)
data = None
shutil.rmtree(tmp)
return data
def page(pdf, page):
tmp = tempfile.mkdtemp()
cmd = [
'pdftocairo',
pdf,
'-jpeg',
'-f', str(page), '-l', str(page),
'-scale-to', '1024', '-cropbox',
os.path.join(tmp, 'page')
]
p = subprocess.Popen(cmd, close_fds=True)
p.wait()
image = glob('%s/*' % tmp)
if image:
image = image[0]
with open(image, 'rb') as fd:
data = fd.read()
else:
logger.debug('pdftocairo %s %s', pdf, ' '.join(cmd))
data = None
shutil.rmtree(tmp)
return data
'''
def page(pdf, page):
image = tempfile.mkstemp('.jpg')[1]
cmd = [
'gs', '-q',
'-dBATCH', '-dSAFER', '-dNOPAUSE', '-dNOPROMPT',
'-dMaxBitmap=500000000',
'-dAlignToPixels=0', '-dGridFitTT=2',
'-sDEVICE=jpeg', '-dTextAlphaBits=4', '-dGraphicsAlphaBits=4',
'-r72',
'-dUseCropBox',
'-dFirstPage=%d' % page,
'-dLastPage=%d' % page,
'-sOutputFile=%s' % image,
pdf
]
p = subprocess.Popen(cmd, close_fds=True)
p.wait()
with open(image, 'rb') as fd:
data = fd.read()
os.unlink(image)
return data
'''
def info(pdf):
data = {}
with open(pdf, 'rb') as fd:
try:
pdfreader = PdfFileReader(fd)
data['pages'] = pdfreader.numPages
if pdfreader.getIsEncrypted():
pdfreader.decrypt('')
info = pdfreader.getDocumentInfo()
if info:
for key in info:
if info[key]:
try:
if isinstance(info[key], bytes):
info[key] = info[key].decode('utf-16')
data[key[1:].lower()] = info[key]
except:
pass
xmp = pdfreader.getXmpMetadata()
if xmp:
for key in dir(xmp):
if key.startswith('dc_'):
value = getattr(xmp, key)
if isinstance(value, dict) and 'x-default' in value:
value = value['x-default']
elif isinstance(value, list):
value = [v.strip() if isinstance(v, str) else v for v in value if v.strip()]
_key = key[3:]
if value and _key not in data:
data[_key] = value
except:
logger.debug('FAILED TO PARSE %s', pdf, exc_info=1)
'''
cmd = ['pdfinfo', pdf]
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
stdout, stderr = p.communicate()
for line in stdout.strip().split('\n'):
parts = line.split(':')
key = parts[0].lower().strip()
if key:
data[key] = ':'.join(parts[1:]).strip()
for key in data.keys():
if not data[key]:
del data[key]
'''
if 'identifier' in data:
value = normalize_isbn(data['identifier'])
if stdnum.isbn.is_valid(value):
data['isbn'] = [value]
del data['identifier']
for key, value in data.items():
if isinstance(value, dict):
value = ' '.join(list(value.values()))
data[key] = value
text = extract_text(pdf)
data['textsize'] = len(text)
if settings.server['extract_text']:
if not 'isbn' in data:
isbn = extract_isbn(text)
if isbn:
data['isbn'] = [isbn]
if 'isbn' in data and isinstance(data['isbn'], str):
data['isbn'] = [data['isbn']]
if 'date' in data and len(data['date']) == 8 and data['date'].isdigit():
d = data['date']
data['date'] = '%s-%s-%s' % (d[:4], d[4:6], d[6:])
return data
'''
#possbile alternative with gs
tmp = tempfile.mkstemp('.txt')[1]
cmd = ['gs', '-dBATCH', '-dNOPAUSE', '-sDEVICE=txtwrite', '-dFirstPage=3', '-dLastPage=5', '-sOutputFile=%s'%tmp, pdf]
'''
def extract_text(pdf):
if sys.platform == 'darwin':
cmd = ['/usr/bin/mdimport', '-d2', pdf]
else:
cmd = ['pdftotext', pdf, '-']
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
stdout, stderr = p.communicate()
stdout = stdout.decode()
stderr = stderr.decode()
if sys.platform == 'darwin':
if 'kMDItemTextContent' in stderr:
stdout = stderr.split('kMDItemTextContent = "')[-1].split('\n')[0][:-2]
else:
stdout = ''
return stdout.strip()
def extract_isbn(text):
isbns = find_isbns(text)
if isbns:
return isbns[0]