2014-05-04 17:26:43 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
# vi:si:et:sw=4:sts=4:ts=4
|
2014-09-02 22:32:44 +00:00
|
|
|
|
2014-05-04 17:26:43 +00:00
|
|
|
|
|
|
|
import sys
|
|
|
|
import tempfile
|
|
|
|
import subprocess
|
|
|
|
import os
|
|
|
|
import shutil
|
|
|
|
from glob import glob
|
|
|
|
|
2014-09-08 18:46:09 +00:00
|
|
|
from PyPDF2 import PdfFileReader
|
2014-05-04 17:26:43 +00:00
|
|
|
import stdnum.isbn
|
|
|
|
|
|
|
|
import settings
|
|
|
|
from utils import normalize_isbn, find_isbns
|
|
|
|
|
2014-05-18 03:01:24 +00:00
|
|
|
import logging
|
|
|
|
logger = logging.getLogger('oml.meta.pdf')
|
|
|
|
|
2014-05-04 17:26:43 +00:00
|
|
|
def cover(pdf):
    """Return cover image data for the PDF at path *pdf*.

    On macOS (``sys.platform == 'darwin'``) the work is delegated to
    ``ql_cover``; on every other platform the first page is rendered via
    ``page(pdf, 1)``. The result is whatever the chosen helper returns.
    """
    # Guard clause for the generic case; macOS falls through below.
    if sys.platform != 'darwin':
        return page(pdf, 1)
    return ql_cover(pdf)
|
|
|
|
|
|
|
|
def ql_cover(pdf):
    """Create a cover image for *pdf* by running ``qlmanage``.

    ``qlmanage -t -s 1024 -o <scratch dir> <pdf>`` is executed and the
    first file found in the scratch directory afterwards is read back.

    Args:
        pdf: Path of the PDF file passed to ``qlmanage``.

    Returns:
        The bytes of the file ``qlmanage`` wrote, or ``None`` if the scratch
        directory is empty after the command finished.

    Raises:
        OSError: If ``qlmanage`` cannot be started or the output file cannot
            be read. The scratch directory is removed in that case as well.
    """
    # TemporaryDirectory removes the scratch directory on every exit path,
    # including exceptions from spawning the process or reading its output.
    with tempfile.TemporaryDirectory() as tmp:
        cmd = ['qlmanage', '-t', '-s', '1024', '-o', tmp, pdf]
        # The exit status is intentionally ignored; success is judged by
        # whether an output file exists.
        subprocess.call(cmd, close_fds=True)
        # Sorted so the chosen file does not depend on directory order.
        images = sorted(glob('%s/*' % tmp))
        if not images:
            logger.debug('qlmanage did not create cover for %s', pdf)
            return None
        with open(images[0], 'rb') as fd:
            return fd.read()
|
|
|
|
|
|
|
|
def page(pdf, page):
    """Render a single page of *pdf* to a JPEG image with ``pdftocairo``.

    Args:
        pdf: Path of the PDF file passed to ``pdftocairo``.
        page: Page number, used for both ``-f`` (first) and ``-l`` (last).
            Note that inside this function the name shadows the function
            itself; the parameter name is kept for caller compatibility.

    Returns:
        The bytes of the file ``pdftocairo`` wrote, or ``None`` if no file
        exists in the scratch directory after the command finished.

    Raises:
        OSError: If ``pdftocairo`` cannot be started or the output file
            cannot be read. The scratch directory is removed in that case
            as well.
    """
    # TemporaryDirectory removes the scratch directory on every exit path,
    # including exceptions from spawning the process or reading its output.
    with tempfile.TemporaryDirectory() as tmp:
        cmd = [
            'pdftocairo',
            pdf,
            '-jpeg',
            '-f', str(page), '-l', str(page),
            '-scale-to', '1024', '-cropbox',
            # Output prefix; pdftocairo appends its own suffix to this name.
            os.path.join(tmp, 'page'),
        ]
        # The exit status is intentionally ignored; success is judged by
        # whether an output file exists.
        subprocess.call(cmd, close_fds=True)
        # Sorted so the chosen file does not depend on directory order.
        images = sorted(glob('%s/*' % tmp))
        if not images:
            logger.debug('pdftocairo %s %s', pdf, ' '.join(cmd))
            return None
        with open(images[0], 'rb') as fd:
            return fd.read()
|
2014-05-04 17:26:43 +00:00
|
|
|
|
2014-05-25 12:44:07 +00:00
|
|
|
'''
|
2014-05-04 17:26:43 +00:00
|
|
|
def page(pdf, page):
|
|
|
|
image = tempfile.mkstemp('.jpg')[1]
|
|
|
|
cmd = [
|
|
|
|
'gs', '-q',
|
|
|
|
'-dBATCH', '-dSAFER', '-dNOPAUSE', '-dNOPROMPT',
|
|
|
|
'-dMaxBitmap=500000000',
|
|
|
|
'-dAlignToPixels=0', '-dGridFitTT=2',
|
|
|
|
'-sDEVICE=jpeg', '-dTextAlphaBits=4', '-dGraphicsAlphaBits=4',
|
|
|
|
'-r72',
|
|
|
|
'-dUseCropBox',
|
|
|
|
'-dFirstPage=%d' % page,
|
|
|
|
'-dLastPage=%d' % page,
|
|
|
|
'-sOutputFile=%s' % image,
|
|
|
|
pdf
|
|
|
|
]
|
2014-08-22 16:49:11 +00:00
|
|
|
p = subprocess.Popen(cmd, close_fds=True)
|
2014-05-04 17:26:43 +00:00
|
|
|
p.wait()
|
|
|
|
with open(image, 'rb') as fd:
|
|
|
|
data = fd.read()
|
|
|
|
os.unlink(image)
|
|
|
|
return data
|
2014-05-25 12:44:07 +00:00
|
|
|
'''
|
2014-05-04 17:26:43 +00:00
|
|
|
|
|
|
|
def info(pdf):
    """Collect metadata for the PDF file at path *pdf*.

    The document-info dictionary and the ``dc_*`` attributes of the XMP
    metadata are read through ``PdfFileReader``; parse failures are logged
    at debug level and leave ``data`` with whatever was gathered so far.
    The extracted text is then used for ``textsize`` and, if enabled in
    ``settings.server['extract_text']``, as a fallback source for an ISBN.

    Args:
        pdf: Path of the PDF file.

    Returns:
        A dict of metadata. Keys set explicitly here are ``pages``,
        ``textsize`` and, when available, ``isbn`` (always a list); the
        remaining keys come from the document info and XMP metadata.
    """
    data = {}
    with open(pdf, 'rb') as fd:
        try:
            pdfreader = PdfFileReader(fd)
            data['pages'] = pdfreader.numPages
            if pdfreader.getIsEncrypted():
                # Attempt to unlock with an empty password.
                pdfreader.decrypt('')
            docinfo = pdfreader.getDocumentInfo()
            if docinfo:
                for key in docinfo:
                    if docinfo[key]:
                        # Drop the first character of the key and lower-case
                        # the rest.
                        data[key[1:].lower()] = docinfo[key]
            xmp = pdfreader.getXmpMetadata()
            if xmp:
                for key in dir(xmp):
                    if not key.startswith('dc_'):
                        continue
                    value = getattr(xmp, key)
                    if isinstance(value, dict) and 'x-default' in value:
                        value = value['x-default']
                    elif isinstance(value, list):
                        value = [v.strip() for v in value if v.strip()]
                    _key = key[3:]
                    # Document-info values take precedence over XMP values.
                    if value and _key not in data:
                        data[_key] = value
        except Exception:
            # Best effort: a broken PDF must not abort metadata collection,
            # but KeyboardInterrupt/SystemExit are allowed to propagate.
            logger.debug('FAILED TO PARSE %s', pdf, exc_info=1)

    if 'identifier' in data:
        value = normalize_isbn(data['identifier'])
        if stdnum.isbn.is_valid(value):
            data['isbn'] = [value]
            # NOTE(review): the identifier is dropped only when it was a
            # valid ISBN -- nesting reconstructed from context, confirm.
            del data['identifier']

    # Flatten any remaining dict values into one space-separated string.
    # Re-assigning an existing key while iterating is safe (no resize).
    for key, value in data.items():
        if isinstance(value, dict):
            data[key] = ' '.join(value.values())

    text = extract_text(pdf)
    data['textsize'] = len(text)
    if settings.server['extract_text'] and 'isbn' not in data:
        isbn = extract_isbn(text)
        if isbn:
            data['isbn'] = [isbn]

    # An ISBN taken directly from the metadata may be a bare string.
    if isinstance(data.get('isbn'), str):
        data['isbn'] = [data['isbn']]

    # Rewrite an 8-digit date string as three dash-separated groups (4-2-2).
    # The isinstance check keeps list values (possible from XMP) from
    # reaching .isdigit().
    date = data.get('date')
    if isinstance(date, str) and len(date) == 8 and date.isdigit():
        data['date'] = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    return data
|
|
|
|
|
|
|
|
'''
|
|
|
|
#possible alternative with gs
|
|
|
|
tmp = tempfile.mkstemp('.txt')[1]
|
|
|
|
cmd = ['gs', '-dBATCH', '-dNOPAUSE', '-sDEVICE=txtwrite', '-dFirstPage=3', '-dLastPage=5', '-sOutputFile=%s'%tmp, pdf]
|
|
|
|
|
|
|
|
'''
|
|
|
|
def extract_text(pdf):
    """Extract the text content of the PDF at path *pdf*.

    On macOS ``/usr/bin/mdimport -d2`` is run and the text is taken from the
    ``kMDItemTextContent`` entry it prints on stderr; elsewhere
    ``pdftotext <pdf> -`` is run and its stdout is used.

    Args:
        pdf: Path of the PDF file.

    Returns:
        The extracted text with surrounding whitespace stripped; an empty
        string on macOS when no ``kMDItemTextContent`` entry is present.

    Raises:
        OSError: If the external command cannot be started.
    """
    on_darwin = sys.platform == 'darwin'
    if on_darwin:
        cmd = ['/usr/bin/mdimport', '-d2', pdf]
    else:
        cmd = ['pdftotext', pdf, '-']
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
    stdout, stderr = p.communicate()

    # Decode leniently: tool output is not guaranteed to be valid UTF-8 and
    # a stray byte should not abort extraction.
    if not on_darwin:
        return stdout.decode('utf-8', 'replace').strip()

    stderr = stderr.decode('utf-8', 'replace')
    if 'kMDItemTextContent' not in stderr:
        return ''
    # Take the rest of the line after the last marker and drop its final two
    # characters (presumably the closing quote and terminator -- confirm
    # against real mdimport output).
    content = stderr.split('kMDItemTextContent = "')[-1].split('\n')[0][:-2]
    return content.strip()
|
|
|
|
|
|
|
|
def extract_isbn(text):
    """Return the first ISBN that ``find_isbns`` reports for *text*.

    Returns ``None`` when ``find_isbns`` yields nothing.
    """
    found = find_isbns(text)
    return found[0] if found else None
|