# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4

import sys
import tempfile
import subprocess
import os
import shutil
from glob import glob
from datetime import datetime

from PyPDF2 import PdfFileReader
import stdnum.isbn
import ox

import settings
from utils import normalize_isbn, find_isbns, get_language

import logging
logger = logging.getLogger(__name__)


def cover(pdf):
    """Return image bytes for the cover of *pdf*, or None if none was made.

    On macOS (``sys.platform == 'darwin'``) this delegates to ql_cover();
    on every other platform it renders the first page via page().
    """
    if sys.platform == 'darwin':
        return ql_cover(pdf)
    return page(pdf, 1)


def _run_and_read_output(cmd, tmp):
    """Run *cmd*, wait for it, and return the bytes of the first file in *tmp*.

    Returns None when the command left no file in *tmp*. The glob result is
    sorted so the choice is deterministic should a tool ever write more than
    one file.
    """
    p = subprocess.Popen(cmd, close_fds=True)
    p.wait()
    images = sorted(glob('%s/*' % tmp))
    if not images:
        return None
    with open(images[0], 'rb') as fd:
        return fd.read()


def ql_cover(pdf):
    """Create a cover image for *pdf* with the ``qlmanage`` command.

    qlmanage is asked for a thumbnail (``-t``) of size 1024 (``-s``) written
    into a temporary directory (``-o``). Returns the image bytes, or None if
    qlmanage did not create a file.
    """
    # The context manager removes the directory even if running the command
    # or reading its output raises.
    with tempfile.TemporaryDirectory() as tmp:
        cmd = [
            'qlmanage',
            '-t',
            '-s',
            '1024',
            '-o',
            tmp,
            pdf
        ]
        data = _run_and_read_output(cmd, tmp)
    if data is None:
        logger.debug('qlmanage did not create cover for %s', pdf)
    return data


def page(pdf, page):
    """Render page number *page* of *pdf* to JPEG with ``pdftocairo``.

    The page is rendered using the crop box and scaled to 1024. Returns the
    image bytes, or None if pdftocairo did not create a file.
    """
    # The context manager removes the directory even if running the command
    # or reading its output raises.
    with tempfile.TemporaryDirectory() as tmp:
        cmd = [
            'pdftocairo',
            pdf,
            '-jpeg',
            '-f', str(page), '-l', str(page),
            '-scale-to', '1024', '-cropbox',
            os.path.join(tmp, 'page')
        ]
        data = _run_and_read_output(cmd, tmp)
    if data is None:
        logger.debug('pdftocairo %s %s', pdf, ' '.join(cmd))
    return data


'''
def page(pdf, page):
    image = tempfile.mkstemp('.jpg')[1]
    cmd = [
        'gs', '-q',
        '-dBATCH', '-dSAFER', '-dNOPAUSE', '-dNOPROMPT',
        '-dMaxBitmap=500000000',
        '-dAlignToPixels=0', '-dGridFitTT=2',
        '-sDEVICE=jpeg', '-dTextAlphaBits=4', '-dGraphicsAlphaBits=4',
        '-r72',
        '-dUseCropBox',
        '-dFirstPage=%d' % page,
        '-dLastPage=%d' % page,
        '-sOutputFile=%s' % image,
        pdf
    ]
    p = subprocess.Popen(cmd, close_fds=True)
    p.wait()
    with open(image, 'rb') as fd:
        data = fd.read()
    os.unlink(image)
    return data
'''


def _collect_document_info(pdfreader, data):
    """Copy the document-info entries of *pdfreader* into *data*.

    Keys are stored without their first character and lower-cased. Empty
    values are skipped; a value that cannot be converted is logged and
    skipped so the remaining keys are still read.
    """
    docinfo = pdfreader.getDocumentInfo()
    if not docinfo:
        return
    for key in docinfo:
        if not docinfo[key]:
            continue
        try:
            value = docinfo[key]
            if len(value) == 1:
                value = value[0]
            if isinstance(value, bytes):
                value = value.decode('utf-16')
            data[key[1:].lower()] = value
        except Exception:
            # Best effort: one unreadable entry must not lose the others.
            logger.debug('failed to read info key %s of %s', key, pdfreader,
                         exc_info=True)


def _collect_xmp(pdfreader, data):
    """Copy ``dc_*`` XMP attributes of *pdfreader* into *data*.

    The ``dc_`` prefix is stripped from the key. Values already present in
    *data* are never overwritten.
    """
    xmp = pdfreader.getXmpMetadata()
    if not xmp:
        return
    for key in dir(xmp):
        if not key.startswith('dc_'):
            continue
        value = getattr(xmp, key)
        if isinstance(value, dict) and 'x-default' in value:
            value = value['x-default']
        elif isinstance(value, list):
            # Drop empty entries, strip strings, format datetimes as dates.
            value = [v.strip() if isinstance(v, str) else v for v in value if v]
            value = [v.strftime('%Y-%m-%d') if isinstance(v, datetime) else v for v in value]
            if len(value) == 1:
                value = value[0]
        _key = key[3:]
        if value and _key not in data:
            data[_key] = value


def info(pdf):
    """Return a dict of metadata for the PDF file at path *pdf*.

    Metadata is read with PyPDF2 (page count, document info, XMP ``dc_*``
    fields); a parse failure is logged and whatever was collected so far is
    kept. The result is then normalized:

    - a valid ISBN in ``identifier`` is moved to ``isbn`` (as a list)
    - dict values are joined into a single string
    - values equal to ``'Unknown'`` are removed
    - ``language`` is passed through get_language()
    - ``textsize`` is set to the length of the extracted text
    - if ``settings.server['extract_text']`` is set and no ISBN is known,
      the extracted text is searched for one
    - an 8-digit ``date`` string is rewritten as ``YYYY-MM-DD``
    - a string ``author`` is normalized and wrapped in a list
    """
    data = {}
    with open(pdf, 'rb') as fd:
        try:
            pdfreader = PdfFileReader(fd)
            data['pages'] = pdfreader.numPages
            if pdfreader.getIsEncrypted():
                pdfreader.decrypt('')
            _collect_document_info(pdfreader, data)
            _collect_xmp(pdfreader, data)
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt/SystemExit
            # still propagate.
            logger.debug('FAILED TO PARSE %s', pdf, exc_info=1)

    '''
    cmd = ['pdfinfo', pdf]
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
    stdout, stderr = p.communicate()
    for line in stdout.strip().split('\n'):
        parts = line.split(':')
        key = parts[0].lower().strip()
        if key:
            data[key] = ':'.join(parts[1:]).strip()
    for key in data.keys():
        if not data[key]:
            del data[key]
    '''

    if 'identifier' in data:
        value = normalize_isbn(data['identifier'])
        if stdnum.isbn.is_valid(value):
            data['isbn'] = [value]
            del data['identifier']
    # Only values are reassigned here, so iterating items() is safe.
    for key, value in data.items():
        if isinstance(value, dict):
            value = ' '.join(value.values())
            data[key] = value.strip()
    for key in list(data):
        if data[key] in ('Unknown',):
            del data[key]
        elif key == 'language':
            # elif: a deleted 'Unknown' language must not be looked up again.
            data[key] = get_language(data[key])
    text = extract_text(pdf)
    data['textsize'] = len(text)
    if settings.server['extract_text'] and 'isbn' not in data:
        isbn = extract_isbn(text)
        if isbn:
            data['isbn'] = [isbn]
    if 'isbn' in data and isinstance(data['isbn'], str):
        data['isbn'] = [data['isbn']]
    date = data.get('date')
    # The date may be a list when XMP carried several values; only rewrite
    # plain 8-digit strings.
    if isinstance(date, str) and len(date) == 8 and date.isdigit():
        data['date'] = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    if 'author' in data and isinstance(data['author'], str):
        data['author'] = [ox.normalize_name(data['author'])]
    return data


'''
#possbile alternative with gs
tmp = tempfile.mkstemp('.txt')[1]
cmd = ['gs', '-dBATCH', '-dNOPAUSE', '-sDEVICE=txtwrite', '-dFirstPage=3', '-dLastPage=5', '-sOutputFile=%s'%tmp, pdf]
'''


def extract_text(pdf):
    """Return the text content of *pdf* as a stripped string.

    On macOS ``/usr/bin/mdimport -d2`` is run and the value of
    ``kMDItemTextContent`` is taken from its stderr output (empty string if
    that key is absent); elsewhere ``pdftotext`` writes the text to stdout.
    Undecodable bytes are replaced rather than raising.
    """
    if sys.platform == 'darwin':
        cmd = ['/usr/bin/mdimport', '-d2', pdf]
    else:
        cmd = ['pdftotext', pdf, '-']
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
    stdout, stderr = p.communicate()
    # errors='replace': malformed output from the tool must not abort the
    # caller with UnicodeDecodeError.
    stdout = stdout.decode('utf-8', errors='replace')
    stderr = stderr.decode('utf-8', errors='replace')
    if sys.platform == 'darwin':
        if 'kMDItemTextContent' in stderr:
            # Take the rest of the line after the opening quote and drop the
            # last two characters of that line.
            stdout = stderr.split('kMDItemTextContent = "')[-1].split('\n')[0][:-2]
        else:
            stdout = ''
    return stdout.strip()


def extract_isbn(text):
    """Return the first ISBN that find_isbns() reports for *text*, else None."""
    isbns = find_isbns(text)
    if isbns:
        return isbns[0]
    return None