openmedialibrary/oml/media/pdf.py
2024-06-09 14:47:36 +01:00

326 lines
10 KiB
Python

# -*- coding: utf-8 -*-
import sys
import tempfile
import subprocess
import os
import shutil
from glob import glob
from datetime import datetime
from PyPDF2 import PdfFileReader
from PIL import Image
import ox
import settings
from utils import get_language, to_isbn13, find_isbns, get_short_path_name
import logging
logger = logging.getLogger(__name__)
def cover(pdf):
    """Return cover image data for the PDF at path *pdf*.

    On macOS (``sys.platform == 'darwin'``) the work is delegated to
    ``ql_cover``; on every other platform the first page is rendered
    through ``page``.  The result is whatever the delegate returns.
    """
    if sys.platform != 'darwin':
        return page(pdf, 1)
    return ql_cover(pdf)
def ql_cover(pdf, size=1024):
    """Create a cover thumbnail for *pdf* with the ``qlmanage`` tool.

    Runs ``qlmanage -t -s <size> -o <tmpdir> <pdf>`` and waits for it to
    finish.

    :param pdf: path of the PDF file handed to ``qlmanage``.
    :param size: value passed to the ``-s`` option (default 1024).
    :returns: bytes of the first file found in the temporary output
        directory, or ``None`` when no file was produced.

    The temporary directory is removed in all cases, including when
    launching the tool or reading its output raises.
    """
    tmp = tempfile.mkdtemp()
    try:
        cmd = ['qlmanage', '-t', '-s', str(size), '-o', tmp, pdf]
        subprocess.Popen(cmd, close_fds=True).wait()
        images = glob('%s/*' % tmp)
        if not images:
            logger.debug('qlmanage did not create cover for %s', pdf)
            return None
        with open(images[0], 'rb') as fd:
            return fd.read()
    finally:
        # mkdtemp() never cleans up after itself, so do it on every exit path.
        shutil.rmtree(tmp)
def page(pdf, page, size=1024):
    """Render a single page of *pdf* to JPEG with ``pdftocairo``.

    :param pdf: path of the PDF file; on Windows it is first passed
        through ``get_short_path_name``.
    :param page: page number, used for both ``-f`` and ``-l``.
    :param size: value passed to ``-scale-to`` (default 1024).
    :returns: bytes of the first file found in the temporary output
        directory, or ``None`` when ``pdftocairo`` produced nothing.

    The temporary directory is removed in all cases, including when
    launching the tool or reading its output raises.
    """
    tmp = tempfile.mkdtemp()
    try:
        startupinfo = None
        if sys.platform == 'win32':
            pdf = get_short_path_name(pdf)
            # Keep the child process from opening a console window.
            startupinfo = subprocess.STARTUPINFO()
            startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
            startupinfo.wShowWindow = subprocess.SW_HIDE
        cmd = [
            'pdftocairo',
            pdf,
            '-jpeg',
            '-f', str(page), '-l', str(page),
            '-scale-to', str(size), '-cropbox',
            os.path.join(tmp, 'page'),
        ]
        # startupinfo=None is Popen's default, so one call serves all platforms.
        subprocess.Popen(cmd, close_fds=True, startupinfo=startupinfo).wait()
        images = glob('%s/*' % tmp)
        if not images:
            logger.debug('pdftocairo %s %s', pdf, ' '.join(cmd))
            return None
        with open(images[0], 'rb') as fd:
            return fd.read()
    finally:
        # mkdtemp() never cleans up after itself, so do it on every exit path.
        shutil.rmtree(tmp)
def crop(pdf, page, left, top, right, bottom):
    """Render one page of *pdf* with ``pdftocairo`` and return a cropped part.

    The page is rendered as JPEG with ``-scale-to 2048``; the crop box is
    applied to that rendered image.

    :param pdf: path of the PDF file; on Windows it is first passed
        through ``get_short_path_name``.
    :param page: page number, used for both ``-f`` and ``-l``.
    :param left, top, right, bottom: crop box edges, converted with
        ``int()`` and handed to ``Image.crop``.
    :returns: bytes of the cropped image file, or ``None`` when
        ``pdftocairo`` produced nothing.

    The temporary directory is removed in all cases, including when
    rendering or cropping raises.
    """
    size = 2048
    tmp = tempfile.mkdtemp()
    try:
        startupinfo = None
        if sys.platform == 'win32':
            pdf = get_short_path_name(pdf)
            # Keep the child process from opening a console window.
            startupinfo = subprocess.STARTUPINFO()
            startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
            startupinfo.wShowWindow = subprocess.SW_HIDE
        cmd = [
            'pdftocairo',
            pdf,
            '-jpeg',
            '-f', str(page), '-l', str(page),
            '-scale-to', str(size), '-cropbox',
            os.path.join(tmp, 'page'),
        ]
        # startupinfo=None is Popen's default, so one call serves all platforms.
        subprocess.Popen(cmd, close_fds=True, startupinfo=startupinfo).wait()
        images = glob('%s/*' % tmp)
        if not images:
            logger.debug('pdftocairo %s %s', pdf, ' '.join(cmd))
            return None
        image = images[0]
        box = [int(edge) for edge in (left, top, right, bottom)]
        logger.debug('crop %s', box)
        with Image.open(image) as rendered:
            cropped = rendered.crop(box)
            # Make sure pixel data is in memory before the source file is
            # closed and then overwritten below.
            cropped.load()
        cropped.save(image)
        with open(image, 'rb') as fd:
            return fd.read()
    finally:
        # mkdtemp() never cleans up after itself, so do it on every exit path.
        shutil.rmtree(tmp)
'''
def page(pdf, page):
image = tempfile.mkstemp('.jpg')[1]
cmd = [
'gs', '-q',
'-dBATCH', '-dSAFER', '-dNOPAUSE', '-dNOPROMPT',
'-dMaxBitmap=500000000',
'-dAlignToPixels=0', '-dGridFitTT=2',
'-sDEVICE=jpeg', '-dTextAlphaBits=4', '-dGraphicsAlphaBits=4',
'-r72',
'-dUseCropBox',
'-dFirstPage=%d' % page,
'-dLastPage=%d' % page,
'-sOutputFile=%s' % image,
pdf
]
p = subprocess.Popen(cmd, close_fds=True)
p.wait()
with open(image, 'rb') as fd:
data = fd.read()
os.unlink(image)
return data
'''
def parse_tableofcontents(reader):
    """Return the top-level outline titles of a PDF as one string.

    Starting at ``reader.trailer['/Root']['/Outlines']['/First']`` the
    ``/Next`` links are followed and every ``/Title`` found is collected.
    ``bytes`` titles are decoded as UTF-8 (undecodable bytes ignored) and
    stripped; the titles are then joined with newlines.

    :param reader: object exposing a mapping-like ``trailer`` attribute.
    :returns: the joined titles, or ``''`` when there is no outline or the
        titles cannot be joined.  Failures are logged at debug level and
        never raised.
    """
    titles = []
    try:
        trailer = reader.trailer
        if '/Root' in trailer and '/Outlines' in trailer['/Root']:
            outlines = trailer['/Root']['/Outlines']
            node = outlines['/First'] if '/First' in outlines else None
            # Maps id(node) -> node; keeping the reference alive guarantees
            # ids stay unique, so a cyclic /Next chain ends the walk instead
            # of looping forever.
            visited = {}
            while node and id(node) not in visited:
                visited[id(node)] = node
                if '/Title' in node:
                    titles.append(node['/Title'])
                node = node['/Next'] if '/Next' in node else None
    except Exception:
        logger.debug('failed to parse pdf outline', exc_info=True)
    # Bound before the try so the failure path returns '' rather than
    # raising UnboundLocalError.
    toc = ''
    try:
        decoded = [
            title.decode('utf-8', 'ignore').strip() if isinstance(title, bytes) else title
            for title in titles
        ]
        toc = '\n'.join(decoded).strip()
    except Exception:
        logger.debug('failed to decode outline', exc_info=True)
    return toc
def info(pdf):
    """Collect metadata for the PDF file at path *pdf*.

    Sources, in order:

    1. ``PdfFileReader``: page count (``pages``), outline titles
       (``tableofcontents``), the document info dictionary (keys lowercased
       with their first character dropped) and XMP ``dc_*`` attributes
       (stored without the ``dc_`` prefix, never overriding existing keys).
       Any failure here is logged at debug level and parsing stops, keeping
       what was gathered so far.
    2. Post-processing of the gathered values: ``identifier`` is converted
       with ``to_isbn13`` into ``isbn`` and removed, dict values are joined
       into strings, ``'Unknown'`` values are dropped, ``language`` is passed
       through ``get_language``.
    3. When ``settings.server['extract_text']`` is set, the text is extracted
       to record ``textsize`` and to look for an ISBN if none is known yet.

    :param pdf: path of the PDF file.
    :returns: dict of metadata; string values that are empty after
        stripping are removed.
    """
    data = {}
    with open(pdf, 'rb') as fd:
        try:
            pdfreader = PdfFileReader(fd)
            data['pages'] = pdfreader.numPages
            if pdfreader.getIsEncrypted():
                pdfreader.decrypt('')
            toc = parse_tableofcontents(pdfreader)
            if toc:
                data['tableofcontents'] = toc
            try:
                docinfo = pdfreader.getDocumentInfo()
            except Exception:
                docinfo = None
            if docinfo:
                for key in docinfo:
                    try:
                        value = docinfo[key]
                        if not value:
                            continue
                        if len(value) == 1:
                            value = value[0]
                        if isinstance(value, bytes):
                            value = value.decode('utf-16')
                        data[key[1:].lower()] = value
                    except Exception:
                        # Best effort: one unreadable entry must not discard
                        # the remaining ones.
                        logger.debug('failed to read %s from %s', key, pdf, exc_info=True)
            try:
                xmp = pdfreader.getXmpMetadata()
            except Exception:
                xmp = None
            if xmp:
                for key in dir(xmp):
                    if not key.startswith('dc_'):
                        continue
                    try:
                        value = getattr(xmp, key)
                    except Exception:
                        continue
                    if isinstance(value, dict) and 'x-default' in value:
                        value = value['x-default']
                    elif isinstance(value, list):
                        value = [v.strip() if isinstance(v, str) else v for v in value if v]
                        value = [v.strftime('%Y-%m-%d') if isinstance(v, datetime) else v for v in value]
                        if len(value) == 1:
                            value = value[0]
                    _key = key[3:]
                    if value and _key not in data:
                        data[_key] = value
        except Exception:
            logger.debug('FAILED TO PARSE %s', pdf, exc_info=True)

    # NOTE: an earlier variant that parsed the "key: value" lines printed by
    # the `pdfinfo` command used to live here; it is not executed.

    if 'identifier' in data:
        isbn = to_isbn13(data['identifier'])
        if isbn:
            data['isbn'] = isbn
        del data['identifier']

    # Only values are replaced here (no keys added or removed), so iterating
    # the live dict is safe.
    for key, value in data.items():
        if isinstance(value, dict):
            data[key] = ' '.join(list(value.values())).strip()

    for key in list(data):
        if data[key] in ('Unknown',):
            del data[key]
        elif key == 'language':
            # elif: an 'Unknown' language was just deleted; looking it up
            # again would raise KeyError.
            data[key] = get_language(data[key])

    if settings.server['extract_text']:
        text = extract_text(pdf)
        data['textsize'] = len(text)
        if 'isbn' not in data:
            isbn = extract_isbn(text)
            if isbn:
                data['isbn'] = isbn

    # Turn a compact YYYYMMDD date into YYYY-MM-DD; values that are not
    # strings (e.g. a list of several dates) are left alone.
    date = data.get('date')
    if isinstance(date, str) and len(date) == 8 and date.isdigit():
        data['date'] = '%s-%s-%s' % (date[:4], date[4:6], date[6:])

    if 'author' in data and isinstance(data['author'], str):
        data['author'] = [ox.normalize_name(data['author'])]
    if 'description' in data:
        data['description'] = ox.strip_tags(ox.decode_html(data['description'])).strip()

    # A title starting with 'Microsoft Word' causes title, author, producer
    # and creator to be dropped together.
    title = data.get('title', '')
    if isinstance(title, str) and title.startswith('Microsoft Word'):
        for key in ('title', 'author', 'producer', 'creator'):
            if key in data:
                del data[key]

    for key in list(data):
        if isinstance(data[key], str) and not data[key].strip():
            del data[key]
    return data
'''
#possible alternative with gs
tmp = tempfile.mkstemp('.txt')[1]
cmd = ['gs', '-dBATCH', '-dNOPAUSE', '-sDEVICE=txtwrite', '-dFirstPage=3', '-dLastPage=5', '-sOutputFile=%s'%tmp, pdf]
'''
def extract_text(pdf):
    """Return the text of *pdf* as produced by an external command.

    Command selection:

    * default: ``pdftotext <pdf> -`` (text is read from stdout);
    * macOS: ``/usr/local/bin/pdftotext`` when that file exists, otherwise
      ``/usr/bin/mdimport -d2 <pdf>``, whose stderr is searched for the
      ``kMDItemTextContent`` value;
    * Windows: the path is first passed through ``get_short_path_name`` and
      the child process is started with a hidden window.

    Output is decoded as UTF-8 with undecodable bytes replaced, so malformed
    tool output cannot raise ``UnicodeDecodeError``.

    :param pdf: path of the PDF file.
    :returns: the extracted text, stripped; ``''`` when ``mdimport`` reports
        no ``kMDItemTextContent``.
    """
    startupinfo = None
    if sys.platform == 'win32':
        pdf = get_short_path_name(pdf)
        # Keep the child process from opening a console window.
        startupinfo = subprocess.STARTUPINFO()
        startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        startupinfo.wShowWindow = subprocess.SW_HIDE

    cmd = ['pdftotext', pdf, '-']
    if sys.platform == 'darwin':
        pdftotext = ['/usr/local/bin/pdftotext', pdf, '-']
        if os.path.exists(pdftotext[0]):
            cmd = pdftotext
        else:
            cmd = ['/usr/bin/mdimport', '-d2', pdf]

    # startupinfo=None is Popen's default, so one call serves all platforms.
    p = subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, startupinfo=startupinfo
    )
    stdout, stderr = p.communicate()
    stdout = stdout.decode('utf-8', 'replace')
    stderr = stderr.decode('utf-8', 'replace')

    if sys.platform == 'darwin' and cmd[0] == '/usr/bin/mdimport':
        if 'kMDItemTextContent' in stderr:
            # Take the rest of the line after the opening quote and drop its
            # last two characters.
            stdout = stderr.split('kMDItemTextContent = "')[-1].split('\n')[0][:-2]
        else:
            stdout = ''
    return stdout.strip()
def pypdf_extract_text(path):
    """Extract the text of every page of *path* with ``PdfFileReader``.

    Slow and bad results.

    Pages whose extraction raises are skipped (logged at debug level).

    :param path: value handed to ``PdfFileReader``.
    :returns: the page texts joined with newlines, with ``\\xa0`` replaced
        by a plain space, stripped.
    """
    pdf = PdfFileReader(path)
    content = []
    for i in range(pdf.getNumPages()):
        try:
            content.append(pdf.getPage(i).extractText())
        except Exception:
            # Best effort: a single broken page must not lose the others.
            logger.debug('failed to extract text from page %s of %s', i, path, exc_info=True)
    return "\n".join(content).replace("\xa0", " ").strip()
def extract_isbn(text):
    """Return the first entry ``find_isbns`` yields for *text*.

    Returns ``None`` when ``find_isbns`` gives back an empty result.
    """
    found = find_isbns(text)
    return found[0] if found else None