258 lines
7.8 KiB
Python
258 lines
7.8 KiB
Python
# -*- coding: utf-8 -*-
|
|
# vi:si:et:sw=4:sts=4:ts=4
|
|
|
|
|
|
import sys
|
|
import tempfile
|
|
import subprocess
|
|
import os
|
|
import shutil
|
|
from glob import glob
|
|
from datetime import datetime
|
|
|
|
from PyPDF2 import PdfFileReader
|
|
import ox
|
|
|
|
import settings
|
|
from utils import get_language, to_isbn13, find_isbns
|
|
|
|
import logging
|
|
logger = logging.getLogger(__name__)
|
|
|
|
def cover(pdf):
    """Return cover image data for the PDF at path *pdf*.

    On macOS (``sys.platform == 'darwin'``) the work is delegated to
    ``ql_cover``; on every other platform page 1 is rendered via ``page``.
    The return value is whatever the chosen helper returns.
    """
    if sys.platform != 'darwin':
        return page(pdf, 1)
    return ql_cover(pdf)
|
|
|
|
def ql_cover(pdf):
    """Create a thumbnail of *pdf* with ``qlmanage`` and return it as bytes.

    ``qlmanage -t -s 1024 -o <tmpdir> <pdf>`` is run with a fresh temporary
    directory as its output location; afterwards the first file found in
    that directory is read back.

    Returns the raw bytes of that file, or ``None`` (after a debug log
    entry) when the directory is still empty once the command has finished.

    The temporary directory is removed on every exit path, including when
    starting the command or reading the result raises.
    """
    # TemporaryDirectory guarantees cleanup even if Popen raises (for
    # example because qlmanage is not installed).
    with tempfile.TemporaryDirectory() as tmp:
        cmd = ['qlmanage', '-t', '-s', '1024', '-o', tmp, pdf]
        subprocess.call(cmd, close_fds=True)
        images = glob(os.path.join(tmp, '*'))
        if not images:
            logger.debug('qlmanage did not create cover for %s', pdf)
            return None
        with open(images[0], 'rb') as fd:
            return fd.read()
|
|
|
|
def page(pdf, page):
    """Render one page of *pdf* to JPEG with ``pdftocairo`` and return the bytes.

    *page* is the page number handed to ``pdftocairo`` as both ``-f`` and
    ``-l``, so exactly that page is rendered (scaled with ``-scale-to 1024``
    and ``-cropbox``). Output goes to a fresh temporary directory and the
    first file found there afterwards is read back.

    Returns the raw bytes of that file, or ``None`` (after a debug log entry
    containing the command line) when no file was produced.

    The temporary directory is removed on every exit path, including when
    starting the command or reading the result raises.
    """
    page_number = str(page)
    # TemporaryDirectory guarantees cleanup even if Popen raises (for
    # example because pdftocairo is not installed).
    with tempfile.TemporaryDirectory() as tmp:
        cmd = [
            'pdftocairo',
            pdf,
            '-jpeg',
            '-f', page_number, '-l', page_number,
            '-scale-to', '1024', '-cropbox',
            os.path.join(tmp, 'page'),
        ]
        subprocess.call(cmd, close_fds=True)
        images = glob(os.path.join(tmp, '*'))
        if not images:
            logger.debug('pdftocairo %s %s', pdf, ' '.join(cmd))
            return None
        with open(images[0], 'rb') as fd:
            return fd.read()
|
|
|
|
'''
|
|
def page(pdf, page):
|
|
image = tempfile.mkstemp('.jpg')[1]
|
|
cmd = [
|
|
'gs', '-q',
|
|
'-dBATCH', '-dSAFER', '-dNOPAUSE', '-dNOPROMPT',
|
|
'-dMaxBitmap=500000000',
|
|
'-dAlignToPixels=0', '-dGridFitTT=2',
|
|
'-sDEVICE=jpeg', '-dTextAlphaBits=4', '-dGraphicsAlphaBits=4',
|
|
'-r72',
|
|
'-dUseCropBox',
|
|
'-dFirstPage=%d' % page,
|
|
'-dLastPage=%d' % page,
|
|
'-sOutputFile=%s' % image,
|
|
pdf
|
|
]
|
|
p = subprocess.Popen(cmd, close_fds=True)
|
|
p.wait()
|
|
with open(image, 'rb') as fd:
|
|
data = fd.read()
|
|
os.unlink(image)
|
|
return data
|
|
'''
|
|
|
|
def parse_tableofcontents(reader):
    """Return the top-level outline titles of a PDF as newline-separated text.

    *reader* must expose a ``trailer`` mapping. Starting at
    ``trailer['/Root']['/Outlines']['/First']`` the chain of ``/Next``
    entries is followed and every ``/Title`` encountered is collected;
    child entries of an outline item are not visited.

    ``bytes`` titles are decoded as UTF-8 (undecodable bytes ignored) and
    stripped; ``str`` titles are used as they are; titles of any other type
    are skipped. The joined result is stripped as a whole.

    Returns ``''`` when there is no outline. Errors raised while walking the
    outline are logged at debug level and whatever was collected up to that
    point is still returned.
    """
    titles = []
    try:
        trailer = reader.trailer
        root = trailer['/Root'] if '/Root' in trailer else {}
        outlines = root['/Outlines'] if '/Outlines' in root else {}
        node = outlines['/First'] if '/First' in outlines else None
        # Maps id(node) -> node. Keeping the node referenced prevents its id
        # from being reused, so a repeated id really is a cycle in the
        # /Next chain rather than a recycled address.
        visited = {}
        while node:
            if id(node) in visited:
                break
            visited[id(node)] = node
            if '/Title' in node:
                titles.append(node['/Title'])
            node = node['/Next'] if '/Next' in node else None
    except Exception:
        logger.debug('failed to parse pdf outline', exc_info=True)

    lines = []
    for title in titles:
        if isinstance(title, bytes):
            title = title.decode('utf-8', 'ignore').strip()
        # Anything that is not text at this point cannot be joined; skip it
        # instead of failing the whole table of contents.
        if isinstance(title, str):
            lines.append(title)
    return '\n'.join(lines).strip()
|
|
|
|
def info(pdf):
    """Collect metadata for the PDF file at path *pdf* and return it as a dict.

    Steps, in order:

    1. Open the file with ``PdfFileReader`` and record ``pages``; if the
       document is encrypted, try to decrypt it with an empty password.
    2. Store the outline from ``parse_tableofcontents`` as
       ``tableofcontents`` when it is non-empty.
    3. Copy every non-empty document-info entry, using the key without its
       first character and lower-cased. Single-element values are unwrapped
       and ``bytes`` values are decoded as UTF-16.
    4. Copy every ``dc_*`` attribute of the XMP metadata (key without the
       ``dc_`` prefix) unless the key was already set by step 3.
    5. Replace ``identifier`` by ``isbn`` when ``to_isbn13`` accepts it,
       flatten dict values into a space-joined string, drop values equal to
       ``'Unknown'`` and map ``language`` through ``get_language``.
    6. Extract the text, store its length as ``textsize`` and, when
       ``settings.server['extract_text']`` is set and no ISBN is known yet,
       look for one in the text.
    7. Normalise ``date`` (``YYYYMMDD`` -> ``YYYY-MM-DD``), ``author``
       (string -> one-element list of the normalised name) and
       ``description`` (HTML entities decoded, tags stripped).

    Any exception raised while reading the PDF structure (steps 1-4) is
    logged at debug level and whatever was gathered so far is kept.
    """
    data = {}
    with open(pdf, 'rb') as fd:
        try:
            pdfreader = PdfFileReader(fd)
            data['pages'] = pdfreader.numPages
            if pdfreader.getIsEncrypted():
                pdfreader.decrypt('')
            toc = parse_tableofcontents(pdfreader)
            if toc:
                data['tableofcontents'] = toc

            try:
                docinfo = pdfreader.getDocumentInfo()
            except Exception:
                docinfo = None
            if docinfo:
                for key in docinfo:
                    if docinfo[key]:
                        try:
                            value = docinfo[key]
                            if len(value) == 1:
                                value = value[0]
                            if isinstance(value, bytes):
                                value = value.decode('utf-16')
                            data[key[1:].lower()] = value
                        except Exception:
                            # Best effort: one bad entry must not discard
                            # the remaining ones.
                            logger.debug('failed to read info key %s of %s',
                                         key, pdf, exc_info=True)

            try:
                xmp = pdfreader.getXmpMetadata()
            except Exception:
                xmp = None
            if xmp:
                for key in dir(xmp):
                    if not key.startswith('dc_'):
                        continue
                    try:
                        value = getattr(xmp, key)
                    except Exception:
                        continue
                    if isinstance(value, dict) and 'x-default' in value:
                        value = value['x-default']
                    elif isinstance(value, list):
                        value = [v.strip() if isinstance(v, str) else v
                                 for v in value if v]
                        value = [v.strftime('%Y-%m-%d') if isinstance(v, datetime) else v
                                 for v in value]
                        if len(value) == 1:
                            value = value[0]
                    _key = key[3:]
                    # Document-info values take precedence over XMP values.
                    if value and _key not in data:
                        data[_key] = value
        except Exception:
            logger.debug('FAILED TO PARSE %s', pdf, exc_info=True)

    # A disabled alternative that filled `data` from the "key: value" lines
    # printed by the `pdfinfo` command used to sit here inside a string
    # literal.

    if 'identifier' in data:
        isbn = to_isbn13(data['identifier'])
        if isbn:
            data['isbn'] = isbn
        del data['identifier']

    # Only values of existing keys are replaced, so mutating while iterating
    # over items() is safe.
    for key, value in data.items():
        if isinstance(value, dict):
            data[key] = ' '.join(list(value.values())).strip()

    for key in list(data):
        if data[key] in ('Unknown',):
            del data[key]
        elif key == 'language':
            # `elif`: the key must not be read again after it was deleted.
            data[key] = get_language(data[key])

    text = extract_text(pdf)
    data['textsize'] = len(text)
    if settings.server['extract_text'] and 'isbn' not in data:
        isbn = extract_isbn(text)
        if isbn:
            data['isbn'] = isbn

    date = data.get('date')
    if isinstance(date, str) and len(date) == 8 and date.isdigit():
        data['date'] = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    if 'author' in data and isinstance(data['author'], str):
        data['author'] = [ox.normalize_name(data['author'])]
    if 'description' in data:
        data['description'] = ox.strip_tags(ox.decode_html(data['description'])).strip()
    return data
|
|
|
|
'''
|
|
# possible alternative with gs
|
|
tmp = tempfile.mkstemp('.txt')[1]
|
|
cmd = ['gs', '-dBATCH', '-dNOPAUSE', '-sDEVICE=txtwrite', '-dFirstPage=3', '-dLastPage=5', '-sOutputFile=%s'%tmp, pdf]
|
|
|
|
'''
|
|
def extract_text(pdf):
    """Return the text content of the PDF at path *pdf* as a stripped string.

    On macOS (``sys.platform == 'darwin'``) ``/usr/bin/mdimport -d2`` is run
    and the text is cut out of the ``kMDItemTextContent = "..."`` line of
    its stderr output; ``''`` is returned when stderr does not mention
    ``kMDItemTextContent``. On every other platform the stdout of
    ``pdftotext <pdf> -`` is returned.

    Tool output is decoded as UTF-8 with undecodable bytes replaced, so
    stray bytes in the output cannot raise ``UnicodeDecodeError``.
    """
    on_darwin = sys.platform == 'darwin'
    if on_darwin:
        cmd = ['/usr/bin/mdimport', '-d2', pdf]
    else:
        cmd = ['pdftotext', pdf, '-']
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
    stdout, stderr = p.communicate()

    if not on_darwin:
        return stdout.decode('utf-8', 'replace').strip()

    # mdimport reports the indexed attributes on stderr.
    stderr = stderr.decode('utf-8', 'replace')
    if 'kMDItemTextContent' not in stderr:
        return ''
    # Take the rest of the line after the opening quote and drop its last
    # two characters.
    text = stderr.split('kMDItemTextContent = "')[-1].split('\n')[0][:-2]
    return text.strip()
|
|
|
|
|
|
def pypdf_extract_text(path):
    """Extract the text of the PDF at *path* page by page with ``PdfFileReader``.

    Author's note on this approach: slow and bad results.

    The text of all pages is joined with newlines, non-breaking spaces
    (``\\xa0``) are replaced by regular spaces and the result is stripped.
    Pages whose extraction raises are skipped and logged at debug level.
    """
    pdf = PdfFileReader(path)
    content = []
    for i in range(pdf.getNumPages()):
        try:
            content.append(pdf.getPage(i).extractText())
        except Exception:
            # Best effort: keep going with the remaining pages.
            logger.debug('failed to extract text from page %s of %s',
                         i, path, exc_info=True)
    return "\n".join(content).replace("\xa0", " ").strip()
|
|
|
|
def extract_isbn(text):
    """Return the first entry ``find_isbns`` yields for *text*, or ``None`` if it yields nothing."""
    found = find_isbns(text)
    return found[0] if found else None
|