# -*- coding: utf-8 -*-

import sys
import tempfile
import subprocess
import os
import shutil
from glob import glob
from datetime import datetime
import logging

from PyPDF2 import PdfFileReader
import ox

import settings
from utils import get_language, to_isbn13, find_isbns, get_short_path_name

logger = logging.getLogger(__name__)


def _startupinfo():
    '''
    return a STARTUPINFO that hides the console window of a child process
    on Windows, None (the Popen default) on every other platform
    '''
    if sys.platform != 'win32':
        return None
    startupinfo = subprocess.STARTUPINFO()
    startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    startupinfo.wShowWindow = subprocess.SW_HIDE
    return startupinfo


def _read_first_file(directory):
    '''
    return the content (bytes) of the first file glob finds in directory,
    or None if the directory is empty
    '''
    images = glob('%s/*' % directory)
    if not images:
        return None
    with open(images[0], 'rb') as fd:
        return fd.read()


def cover(pdf):
    '''
    return image data (bytes) for the cover of pdf, or None if no image
    could be created; uses qlmanage on macOS, the first page elsewhere
    '''
    if sys.platform == 'darwin':
        return ql_cover(pdf)
    return page(pdf, 1)


def ql_cover(pdf):
    '''
    run qlmanage (-t -s 1024) on pdf, writing into a temporary directory,
    and return the bytes of the file it created, None if it created none
    '''
    tmp = tempfile.mkdtemp()
    try:
        cmd = [
            'qlmanage',
            '-t',
            '-s', '1024',
            '-o', tmp,
            pdf
        ]
        p = subprocess.Popen(cmd, close_fds=True)
        p.wait()
        data = _read_first_file(tmp)
        if data is None:
            logger.debug('qlmanage did not create cover for %s', pdf)
    finally:
        # always remove the temporary directory, even if qlmanage is
        # missing or reading the image fails
        shutil.rmtree(tmp, ignore_errors=True)
    return data


def page(pdf, page):
    '''
    render page number `page` of pdf as jpeg with pdftocairo
    (-scale-to 1024, -cropbox) and return the image bytes,
    None if pdftocairo did not write an image
    '''
    tmp = tempfile.mkdtemp()
    try:
        if sys.platform == 'win32':
            pdf = get_short_path_name(pdf)
        cmd = [
            'pdftocairo',
            pdf,
            '-jpeg',
            '-f', str(page), '-l', str(page),
            '-scale-to', '1024', '-cropbox',
            os.path.join(tmp, 'page')
        ]
        p = subprocess.Popen(cmd, close_fds=True, startupinfo=_startupinfo())
        p.wait()
        data = _read_first_file(tmp)
        if data is None:
            logger.debug('pdftocairo %s %s', pdf, ' '.join(cmd))
    finally:
        # always remove the temporary directory, even on failure
        shutil.rmtree(tmp, ignore_errors=True)
    return data


'''
def page(pdf, page):
    image = tempfile.mkstemp('.jpg')[1]
    cmd = [
        'gs', '-q',
        '-dBATCH', '-dSAFER', '-dNOPAUSE', '-dNOPROMPT',
        '-dMaxBitmap=500000000',
        '-dAlignToPixels=0', '-dGridFitTT=2',
        '-sDEVICE=jpeg', '-dTextAlphaBits=4', '-dGraphicsAlphaBits=4',
        '-r72',
        '-dUseCropBox',
        '-dFirstPage=%d' % page,
        '-dLastPage=%d' % page,
        '-sOutputFile=%s' % image,
        pdf
    ]
    p = subprocess.Popen(cmd, close_fds=True)
    p.wait()
    with open(image, 'rb') as fd:
        data = fd.read()
    os.unlink(image)
    return data
'''


def parse_tableofcontents(reader):
    '''
    return the top-level outline titles of reader (a PdfFileReader),
    one per line, as a single string

    follows the /First -> /Next chain of /Root/Outlines; returns an empty
    string if there is no outline or it can not be parsed or decoded
    '''
    titles = []
    # default result, so a failure in the decode step below can no longer
    # leave the return value unbound
    toc = ''
    try:
        root = reader.trailer['/Root'] if '/Root' in reader.trailer else None
        if root is not None and '/Outlines' in root:
            outlines = root['/Outlines']
            if '/First' in outlines:
                title = outlines['/First']
                while title:
                    if '/Title' in title:
                        titles.append(title['/Title'])
                    title = title['/Next'] if '/Next' in title else None
    except Exception:
        # keep whatever titles were collected before the failure
        logger.debug('failed to parse pdf outline', exc_info=True)
    try:
        titles = [
            title.decode('utf-8', 'ignore').strip() if isinstance(title, bytes) else title
            for title in titles
        ]
        toc = '\n'.join(titles).strip()
    except Exception:
        logger.debug('failed to decode outline', exc_info=True)
        toc = ''
    return toc


def _collect_document_info(reader, data):
    '''
    copy the non-empty entries of the document info of reader into data,
    keyed by the lowercased name without its first character
    (presumably the "/" of PDF names like "/Title")
    '''
    try:
        docinfo = reader.getDocumentInfo()
    except Exception:
        docinfo = None
    if not docinfo:
        return
    for key in docinfo:
        if not docinfo[key]:
            continue
        try:
            value = docinfo[key]
            if len(value) == 1:
                value = value[0]
            if isinstance(value, bytes):
                value = value.decode('utf-16')
            data[key[1:].lower()] = value
        except Exception:
            # best effort: one unreadable entry must not drop the others
            logger.debug('failed to read pdf info %s', key, exc_info=True)


def _collect_xmp(reader, data):
    '''
    copy the dc_* attributes of the XMP metadata of reader into data
    (without the "dc_" prefix), never overwriting keys already present
    '''
    try:
        xmp = reader.getXmpMetadata()
    except Exception:
        xmp = None
    if not xmp:
        return
    for key in dir(xmp):
        if not key.startswith('dc_'):
            continue
        try:
            value = getattr(xmp, key)
        except Exception:
            continue
        if isinstance(value, dict) and 'x-default' in value:
            value = value['x-default']
        elif isinstance(value, list):
            value = [v.strip() if isinstance(v, str) else v for v in value if v]
            value = [v.strftime('%Y-%m-%d') if isinstance(v, datetime) else v for v in value]
            if len(value) == 1:
                value = value[0]
        _key = key[3:]
        if value and _key not in data:
            data[_key] = value


def info(pdf):
    '''
    return a dict with metadata for the pdf file at path `pdf`

    collects page count, table of contents, document info and XMP dc_*
    fields via PyPDF2 (failures are logged, not raised), then normalizes
    the result: identifier -> isbn, language via get_language, YYYYMMDD
    dates -> YYYY-MM-DD, author as list of normalized names, description
    without html. If settings.server['extract_text'] is set, the text is
    extracted to add 'textsize' and, if still missing, 'isbn'.
    '''
    data = {}
    with open(pdf, 'rb') as fd:
        try:
            pdfreader = PdfFileReader(fd)
            data['pages'] = pdfreader.numPages
            if pdfreader.getIsEncrypted():
                pdfreader.decrypt('')
            toc = parse_tableofcontents(pdfreader)
            if toc:
                data['tableofcontents'] = toc
            _collect_document_info(pdfreader, data)
            _collect_xmp(pdfreader, data)
        except Exception:
            # keep whatever was collected so far
            logger.debug('FAILED TO PARSE %s', pdf, exc_info=True)

    '''
    cmd = ['pdfinfo', pdf]
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    for line in stdout.strip().split('\n'):
        parts = line.split(':')
        key = parts[0].lower().strip()
        if key:
            data[key] = ':'.join(parts[1:]).strip()
    for key in data.keys():
        if not data[key]:
            del data[key]
    '''

    if 'identifier' in data:
        value = to_isbn13(data['identifier'])
        if value:
            data['isbn'] = value
        del data['identifier']
    # flatten dict values into one string; replacing the value of an
    # existing key is safe while iterating
    for key, value in data.items():
        if isinstance(value, dict):
            data[key] = ' '.join(list(value.values())).strip()
    for key in list(data):
        if data[key] in ('Unknown',):
            del data[key]
        elif key == 'language':
            # elif: a deleted 'Unknown' language must not be looked up again
            data[key] = get_language(data[key])
    if settings.server['extract_text']:
        text = extract_text(pdf)
        data['textsize'] = len(text)
        if 'isbn' not in data:
            isbn = extract_isbn(text)
            if isbn:
                data['isbn'] = isbn
    # the checks below only apply to plain strings; metadata with several
    # values is stored as a list and left untouched
    date = data.get('date')
    if isinstance(date, str) and len(date) == 8 and date.isdigit():
        data['date'] = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    if 'author' in data and isinstance(data['author'], str):
        data['author'] = [ox.normalize_name(data['author'])]
    if isinstance(data.get('description'), str):
        data['description'] = ox.strip_tags(ox.decode_html(data['description'])).strip()
    title = data.get('title', '')
    if isinstance(title, str) and title.startswith('Microsoft Word'):
        # such titles are treated as not being real metadata; drop them
        # together with the related fields
        for key in ('title', 'author', 'producer', 'creator'):
            if key in data:
                del data[key]
    for key in list(data):
        if isinstance(data[key], str) and not data[key].strip():
            del data[key]
    return data


'''
#possbile alternative with gs
tmp = tempfile.mkstemp('.txt')[1]
cmd = ['gs', '-dBATCH', '-dNOPAUSE', '-sDEVICE=txtwrite', '-dFirstPage=3', '-dLastPage=5', '-sOutputFile=%s'%tmp, pdf]
'''


def extract_text(pdf):
    '''
    return the text of pdf as a stripped string

    on macOS the text is parsed out of the kMDItemTextContent line that
    `mdimport -d2` writes to stderr (empty string if there is none);
    elsewhere it is the stdout of `pdftotext pdf -`
    '''
    if sys.platform == 'win32':
        pdf = get_short_path_name(pdf)
    if sys.platform == 'darwin':
        cmd = ['/usr/bin/mdimport', '-d2', pdf]
    else:
        cmd = ['pdftotext', pdf, '-']
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         startupinfo=_startupinfo())
    stdout, stderr = p.communicate()
    # replace undecodable bytes instead of raising, one bad byte should
    # not discard the text of the whole document
    stdout = stdout.decode('utf-8', 'replace')
    stderr = stderr.decode('utf-8', 'replace')
    if sys.platform == 'darwin':
        if 'kMDItemTextContent' in stderr:
            stdout = stderr.split('kMDItemTextContent = "')[-1].split('\n')[0][:-2]
        else:
            stdout = ''
    return stdout.strip()


def pypdf_extract_text(path):
    '''
    slow and bad results

    extract text with PyPDF2, pages joined by newline; pages that fail
    to extract are skipped
    '''
    pdf = PdfFileReader(path)
    content = []
    for i in range(0, pdf.getNumPages()):
        try:
            content.append(pdf.getPage(i).extractText())
        except Exception:
            logger.debug('failed to extract text from page %s of %s', i, path, exc_info=True)
    return "\n".join(content).replace("\xa0", " ").strip()


def extract_isbn(text):
    '''
    return the first isbn find_isbns finds in text, None if there is none
    '''
    isbns = find_isbns(text)
    if isbns:
        return isbns[0]
    return None