openmedialibrary/oml/media/pdf.py

281 lines
8.8 KiB
Python
Raw Normal View History

2014-05-04 17:26:43 +00:00
# -*- coding: utf-8 -*-
2014-09-02 22:32:44 +00:00
2014-05-04 17:26:43 +00:00
import sys
import tempfile
import subprocess
import os
import shutil
from glob import glob
2015-12-24 15:00:14 +00:00
from datetime import datetime
2014-05-04 17:26:43 +00:00
2014-09-08 18:46:09 +00:00
from PyPDF2 import PdfFileReader
2016-01-08 10:44:09 +00:00
import ox
2014-05-04 17:26:43 +00:00
import settings
2016-01-31 17:28:53 +00:00
from utils import get_language, to_isbn13, find_isbns, get_short_path_name
2014-05-04 17:26:43 +00:00
2014-05-18 03:01:24 +00:00
import logging
2015-11-29 14:56:38 +00:00
logger = logging.getLogger(__name__)
2014-05-18 03:01:24 +00:00
2014-05-04 17:26:43 +00:00
def cover(pdf):
    '''
    Return cover image data for the given pdf.

    On macOS (sys.platform == 'darwin') the result of ql_cover(pdf) is
    returned; on every other platform the first page is rendered via
    page(pdf, 1).
    '''
    if sys.platform != 'darwin':
        return page(pdf, 1)
    return ql_cover(pdf)
def ql_cover(pdf):
    '''
    Create a cover image for pdf with the qlmanage command line tool.

    qlmanage is run with '-t -s 1024 -o <temporary directory>' and the
    content of the first file found in that directory is returned as bytes.
    Returns None (and logs a debug message) if qlmanage created no file.

    The temporary directory is removed in all cases, including when
    launching qlmanage or reading its output raises; such exceptions still
    propagate to the caller.
    '''
    tmp = tempfile.mkdtemp()
    try:
        cmd = [
            'qlmanage',
            '-t',
            '-s',
            '1024',
            '-o',
            tmp,
            pdf
        ]
        p = subprocess.Popen(cmd, close_fds=True)
        p.wait()
        images = glob('%s/*' % tmp)
        if not images:
            logger.debug('qlmanage did not create cover for %s', pdf)
            return None
        with open(images[0], 'rb') as fd:
            return fd.read()
    finally:
        # runs on success and on failure so the directory is never leaked
        shutil.rmtree(tmp)
def page(pdf, page):
    '''
    Render a single page of pdf to JPEG with pdftocairo.

    pdf is the path of the document, page the page number passed to
    pdftocairo as both '-f' and '-l'. The content of the first file
    pdftocairo wrote to a temporary directory is returned as bytes, or
    None (with a debug log entry) if no file was created.

    The temporary directory is removed in all cases, including when
    launching pdftocairo or reading its output raises; such exceptions
    still propagate to the caller.
    '''
    tmp = tempfile.mkdtemp()
    try:
        popen_kwargs = {'close_fds': True}
        if sys.platform == 'win32':
            pdf = get_short_path_name(pdf)
            # keep the child process from opening a visible window
            startupinfo = subprocess.STARTUPINFO()
            startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
            startupinfo.wShowWindow = subprocess.SW_HIDE
            popen_kwargs['startupinfo'] = startupinfo
        cmd = [
            'pdftocairo',
            pdf,
            '-jpeg',
            '-f', str(page), '-l', str(page),
            '-scale-to', '1024', '-cropbox',
            os.path.join(tmp, 'page')
        ]
        p = subprocess.Popen(cmd, **popen_kwargs)
        p.wait()
        images = glob('%s/*' % tmp)
        if not images:
            logger.debug('pdftocairo %s %s', pdf, ' '.join(cmd))
            return None
        with open(images[0], 'rb') as fd:
            return fd.read()
    finally:
        # runs on success and on failure so the directory is never leaked
        shutil.rmtree(tmp)
2014-05-04 17:26:43 +00:00
2014-05-25 12:44:07 +00:00
'''
2014-05-04 17:26:43 +00:00
def page(pdf, page):
image = tempfile.mkstemp('.jpg')[1]
cmd = [
'gs', '-q',
'-dBATCH', '-dSAFER', '-dNOPAUSE', '-dNOPROMPT',
'-dMaxBitmap=500000000',
'-dAlignToPixels=0', '-dGridFitTT=2',
'-sDEVICE=jpeg', '-dTextAlphaBits=4', '-dGraphicsAlphaBits=4',
'-r72',
'-dUseCropBox',
'-dFirstPage=%d' % page,
'-dLastPage=%d' % page,
'-sOutputFile=%s' % image,
pdf
]
2014-08-22 16:49:11 +00:00
p = subprocess.Popen(cmd, close_fds=True)
2014-05-04 17:26:43 +00:00
p.wait()
with open(image, 'rb') as fd:
data = fd.read()
os.unlink(image)
return data
2014-05-25 12:44:07 +00:00
'''
2014-05-04 17:26:43 +00:00
2016-01-12 09:27:08 +00:00
def parse_tableofcontents(reader):
    '''
    Return the top-level outline titles of a pdf as newline separated text.

    reader must provide a mapping-like trailer attribute. The chain
    trailer['/Root']['/Outlines']['/First'] -> '/Next' -> ... is followed
    and every '/Title' found is collected. bytes titles are decoded as
    utf-8 (ignoring errors) and stripped; titles that are neither str nor
    bytes are skipped.

    Always returns a str; it is empty if there is no outline or if
    walking the outline failed (the failure is logged at debug level).
    '''
    titles = []
    try:
        trailer = reader.trailer
        if '/Root' in trailer and '/Outlines' in trailer['/Root']:
            outlines = trailer['/Root']['/Outlines']
            if '/First' in outlines:
                # Maps id(entry) -> entry. Keeping the entries referenced
                # guarantees ids stay unique while we walk the chain, so a
                # repeated id really means the '/Next' links form a cycle.
                visited = {}
                entry = outlines['/First']
                while entry and id(entry) not in visited:
                    visited[id(entry)] = entry
                    if '/Title' in entry:
                        titles.append(entry['/Title'])
                    entry = entry['/Next'] if '/Next' in entry else None
    except Exception:
        logger.debug('failed to parse pdf outline', exc_info=True)
    lines = []
    for title in titles:
        if isinstance(title, bytes):
            title = title.decode('utf-8', 'ignore').strip()
        if isinstance(title, str):
            lines.append(title)
    return '\n'.join(lines).strip()
2014-05-04 17:26:43 +00:00
def _read_document_info(pdfreader, data):
    '''
    Copy the non-empty entries of pdfreader.getDocumentInfo() into data.

    Keys are stored without their first character and lowercased
    (e.g. '/Title' -> 'title'). Entries that fail to convert are skipped.
    '''
    try:
        docinfo = pdfreader.getDocumentInfo()
    except Exception:
        docinfo = None
    if not docinfo:
        return
    for key in docinfo:
        try:
            value = docinfo[key]
            if not value:
                continue
            if len(value) == 1:
                value = value[0]
            if isinstance(value, bytes):
                value = value.decode('utf-16')
            data[key[1:].lower()] = value
        except Exception:
            logger.debug('failed to read pdf info key %s', key, exc_info=True)


def _read_xmp_metadata(pdfreader, data):
    '''
    Add the dc_* attributes of pdfreader.getXmpMetadata() to data.

    The 'dc_' prefix is removed from the key. Values already present in
    data are never overwritten.
    '''
    try:
        xmp = pdfreader.getXmpMetadata()
    except Exception:
        xmp = None
    if not xmp:
        return
    for attr in dir(xmp):
        if not attr.startswith('dc_'):
            continue
        try:
            value = getattr(xmp, attr)
        except Exception:
            continue
        if isinstance(value, dict) and 'x-default' in value:
            value = value['x-default']
        elif isinstance(value, list):
            value = [v.strip() if isinstance(v, str) else v for v in value if v]
            value = [v.strftime('%Y-%m-%d') if isinstance(v, datetime) else v for v in value]
            if len(value) == 1:
                value = value[0]
        key = attr[3:]
        if value and key not in data:
            data[key] = value


def info(pdf):
    '''
    Collect metadata for the pdf file at path pdf and return it as a dict.

    The dict is built from the page count, the outline
    ('tableofcontents'), the document info dictionary and the dc_* XMP
    attributes. Failures while parsing the pdf are logged at debug level
    and whatever was collected so far is kept.

    Afterwards the values are normalized:
    - 'identifier' is replaced by 'isbn' if to_isbn13() accepts it
    - dict values are joined into a single string
    - values equal to 'Unknown' are dropped, 'language' goes through
      get_language()
    - if settings.server['extract_text'] is set, 'textsize' is added and
      a missing 'isbn' is looked up in the extracted text
    - an 8 digit 'date' is reformatted as YYYY-MM-DD
    - a str 'author' becomes a one element list with the normalized name
    - 'description' is stripped of html
    - title/author/producer/creator are dropped if the title starts with
      'Microsoft Word'
    - empty or whitespace-only strings are dropped
    '''
    data = {}
    with open(pdf, 'rb') as fd:
        try:
            pdfreader = PdfFileReader(fd)
            data['pages'] = pdfreader.numPages
            if pdfreader.getIsEncrypted():
                pdfreader.decrypt('')
            toc = parse_tableofcontents(pdfreader)
            if toc:
                data['tableofcontents'] = toc
            # document info first: XMP values only fill in missing keys
            _read_document_info(pdfreader, data)
            _read_xmp_metadata(pdfreader, data)
        except Exception:
            logger.debug('FAILED TO PARSE %s', pdf, exc_info=True)

    # NOTE: an alternative based on parsing the output of `pdfinfo` was
    # disabled here previously.

    if 'identifier' in data:
        isbn = to_isbn13(data['identifier'])
        if isbn:
            data['isbn'] = isbn
        del data['identifier']
    for key, value in list(data.items()):
        if isinstance(value, dict):
            data[key] = ' '.join(list(value.values())).strip()
    for key in list(data):
        if data[key] in ('Unknown',):
            del data[key]
        elif key == 'language':
            # elif: a deleted 'Unknown' language must not be looked up again
            data[key] = get_language(data[key])
    if settings.server['extract_text']:
        # text only exists in this branch, so the isbn fallback lives here
        text = extract_text(pdf)
        data['textsize'] = len(text)
        if 'isbn' not in data:
            isbn = extract_isbn(text)
            if isbn:
                data['isbn'] = isbn
    # metadata values can be lists; only touch real strings below
    date = data.get('date')
    if isinstance(date, str) and len(date) == 8 and date.isdigit():
        data['date'] = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    if 'author' in data and isinstance(data['author'], str):
        data['author'] = [ox.normalize_name(data['author'])]
    if isinstance(data.get('description'), str):
        data['description'] = ox.strip_tags(ox.decode_html(data['description'])).strip()
    title = data.get('title', '')
    if isinstance(title, str) and title.startswith('Microsoft Word'):
        for key in ('title', 'author', 'producer', 'creator'):
            if key in data:
                del data[key]
    for key in list(data):
        if isinstance(data[key], str) and not data[key].strip():
            del data[key]
    return data
'''
#possible alternative with gs
tmp = tempfile.mkstemp('.txt')[1]
cmd = ['gs', '-dBATCH', '-dNOPAUSE', '-sDEVICE=txtwrite', '-dFirstPage=3', '-dLastPage=5', '-sOutputFile=%s'%tmp, pdf]
'''
def extract_text(pdf):
    '''
    Return the text content of the pdf at path pdf as a stripped str.

    On macOS '/usr/bin/mdimport -d2' is run and the text is taken from
    the 'kMDItemTextContent = "..."' line of its stderr output (empty
    string if that marker is missing). On all other platforms the stdout
    of 'pdftotext <pdf> -' is used.

    Output is decoded with errors='replace' so that bytes that are not
    valid in the default encoding cannot raise UnicodeDecodeError; only
    the stream that is actually used gets decoded.
    '''
    popen_kwargs = {'stdout': subprocess.PIPE, 'stderr': subprocess.PIPE}
    if sys.platform == 'win32':
        pdf = get_short_path_name(pdf)
        # keep the child process from opening a visible window
        startupinfo = subprocess.STARTUPINFO()
        startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        startupinfo.wShowWindow = subprocess.SW_HIDE
        popen_kwargs['startupinfo'] = startupinfo
    on_darwin = sys.platform == 'darwin'
    if on_darwin:
        cmd = ['/usr/bin/mdimport', '-d2', pdf]
    else:
        cmd = ['pdftotext', pdf, '-']
    p = subprocess.Popen(cmd, **popen_kwargs)
    stdout, stderr = p.communicate()
    if on_darwin:
        stderr = stderr.decode(errors='replace')
        if 'kMDItemTextContent' in stderr:
            text = stderr.split('kMDItemTextContent = "')[-1].split('\n')[0][:-2]
        else:
            text = ''
    else:
        text = stdout.decode(errors='replace')
    return text.strip()
2016-01-19 15:18:25 +00:00
def pypdf_extract_text(path):
    '''
    Extract the text of all pages with PdfFileReader and return it as one
    newline-joined, stripped string with "\\xa0" replaced by spaces.
    Pages whose extraction raises are skipped.

    slow and bad results
    '''
    reader = PdfFileReader(path)
    pages = []
    page_count = reader.getNumPages()
    index = 0
    while index < page_count:
        try:
            pages.append(reader.getPage(index).extractText())
        except:
            pass
        index += 1
    joined = "\n".join(pages)
    return joined.replace("\xa0", " ").strip()
2014-05-04 17:26:43 +00:00
def extract_isbn(text):
    '''
    Return the first entry of find_isbns(text), or None if it found nothing.
    '''
    found = find_isbns(text)
    return found[0] if found else None