# -*- coding: utf-8 -*-

import sys
import tempfile
import subprocess
import os
import shutil
from glob import glob
from datetime import datetime

from PyPDF2 import PdfFileReader
from PIL import Image
import ox

import settings
from utils import get_language, to_isbn13, find_isbns, get_short_path_name

import logging
logger = logging.getLogger(__name__)


def _popen_kwargs():
    '''
    Return extra keyword arguments for subprocess.Popen.

    On Windows the returned STARTUPINFO asks for the child's window to be
    hidden (SW_HIDE); on every other platform no extra arguments are used.
    '''
    if sys.platform != 'win32':
        return {}
    startupinfo = subprocess.STARTUPINFO()
    startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    startupinfo.wShowWindow = subprocess.SW_HIDE
    return {'startupinfo': startupinfo}


def _render_page(pdf, page, size, tmp):
    '''
    Render one page of pdf as JPEG into the directory tmp with pdftocairo.

    Returns the path of the first file found in tmp afterwards, or None
    (after logging the command) if pdftocairo did not create anything.
    '''
    if sys.platform == 'win32':
        # NOTE(review): presumably yields a Windows short (8.3) path so the
        # external tool receives a safe argument - confirm in utils.
        pdf = get_short_path_name(pdf)
    cmd = [
        'pdftocairo', pdf, '-jpeg',
        '-f', str(page), '-l', str(page),
        '-scale-to', str(size), '-cropbox',
        os.path.join(tmp, 'page')
    ]
    p = subprocess.Popen(cmd, close_fds=True, **_popen_kwargs())
    p.wait()
    images = glob('%s/*' % tmp)
    if not images:
        logger.debug('pdftocairo %s %s', pdf, ' '.join(cmd))
        return None
    return images[0]


def cover(pdf):
    '''
    Return cover image data (bytes) for pdf, or None if none was created.

    Uses qlmanage on macOS and a rendering of page 1 everywhere else.
    '''
    if sys.platform == 'darwin':
        return ql_cover(pdf)
    else:
        return page(pdf, 1)


def ql_cover(pdf, size=1024):
    '''
    Create a thumbnail of pdf with qlmanage and return the image bytes.

    size is passed to qlmanage as the thumbnail size (-s).
    Returns None if qlmanage did not write a file.
    '''
    # the context manager removes the directory even if a step below raises
    with tempfile.TemporaryDirectory() as tmp:
        cmd = [
            'qlmanage',
            '-t',
            '-s',
            str(size),
            '-o',
            tmp,
            pdf
        ]
        p = subprocess.Popen(cmd, close_fds=True)
        p.wait()
        images = glob('%s/*' % tmp)
        if not images:
            logger.debug('qlmanage did not create cover for %s', pdf)
            return None
        with open(images[0], 'rb') as fd:
            return fd.read()


def page(pdf, page, size=1024):
    '''
    Render page number page of pdf and return it as JPEG bytes.

    size is handed to pdftocairo as -scale-to.
    Returns None if pdftocairo did not produce an image.
    '''
    with tempfile.TemporaryDirectory() as tmp:
        image = _render_page(pdf, page, size, tmp)
        if image is None:
            return None
        with open(image, 'rb') as fd:
            return fd.read()


def crop(pdf, page, left, top, right, bottom):
    '''
    Render page number page of pdf and return a cropped region as bytes.

    The page is rendered with -scale-to 2048; left, top, right and bottom
    are converted to int and used as the PIL crop box on that rendering.
    Returns None if pdftocairo did not produce an image.
    '''
    size = 2048
    with tempfile.TemporaryDirectory() as tmp:
        image = _render_page(pdf, page, size, tmp)
        if image is None:
            return None
        box = [int(value) for value in (left, top, right, bottom)]
        img = Image.open(image).crop(box)
        # overwrite the rendering so the format is derived from its name
        img.save(image)
        with open(image, 'rb') as fd:
            return fd.read()


# disabled alternative implementation of page() based on ghostscript
'''
def page(pdf, page):
    image = tempfile.mkstemp('.jpg')[1]
    cmd = [
        'gs', '-q',
        '-dBATCH', '-dSAFER', '-dNOPAUSE', '-dNOPROMPT',
        '-dMaxBitmap=500000000',
        '-dAlignToPixels=0', '-dGridFitTT=2',
        '-sDEVICE=jpeg', '-dTextAlphaBits=4', '-dGraphicsAlphaBits=4',
        '-r72',
        '-dUseCropBox',
        '-dFirstPage=%d' % page,
        '-dLastPage=%d' % page,
        '-sOutputFile=%s' % image,
        pdf
    ]
    p = subprocess.Popen(cmd, close_fds=True)
    p.wait()
    with open(image, 'rb') as fd:
        data = fd.read()
    os.unlink(image)
    return data
'''


def parse_tableofcontents(reader):
    '''
    Return the top-level outline titles of reader, one per line.

    Walks the /First -> /Next chain below /Root /Outlines in the trailer.
    Parsing is best effort: failures are logged at debug level and
    whatever could be collected so far is used. Returns '' if there is no
    outline or the collected titles cannot be joined.
    '''
    titles = []
    try:
        if '/Root' in reader.trailer and '/Outlines' in reader.trailer['/Root']:
            outlines = reader.trailer['/Root']['/Outlines']
            if '/First' in outlines:
                title = outlines['/First']
                while title:
                    if '/Title' in title:
                        titles.append(title['/Title'])
                    if '/Next' in title:
                        title = title['/Next']
                    else:
                        title = None
    except Exception:
        logger.debug('failed to parse pdf outline', exc_info=True)
    # default result; must exist even if decoding below fails
    toc = ''
    try:
        titles = [
            title.decode('utf-8', 'ignore').strip() if isinstance(title, bytes) else title
            for title in titles
        ]
        toc = '\n'.join(titles).strip()
    except Exception:
        logger.debug('failed to decode outline', exc_info=True)
    return toc


def _add_document_info(reader, data):
    '''Copy non-empty document info entries of reader into data.'''
    try:
        docinfo = reader.getDocumentInfo()
    except Exception:
        docinfo = None
    if not docinfo:
        return
    for key in docinfo:
        try:
            value = docinfo[key]
            if not value:
                continue
            if len(value) == 1:
                value = value[0]
            if isinstance(value, bytes):
                value = value.decode('utf-16')
            # keys look like '/Title'; drop the first character, lower-case
            data[key[1:].lower()] = value
        except Exception:
            # best effort: one broken entry must not discard the others
            logger.debug('failed to read pdf info %s', key, exc_info=True)


def _add_xmp_metadata(reader, data):
    '''Copy dc_* XMP values of reader into data without overwriting keys.'''
    try:
        xmp = reader.getXmpMetadata()
    except Exception:
        xmp = None
    if not xmp:
        return
    for key in dir(xmp):
        if not key.startswith('dc_'):
            continue
        try:
            value = getattr(xmp, key)
        except Exception:
            continue
        if isinstance(value, dict) and 'x-default' in value:
            value = value['x-default']
        elif isinstance(value, list):
            value = [v.strip() if isinstance(v, str) else v for v in value if v]
            value = [v.strftime('%Y-%m-%d') if isinstance(v, datetime) else v for v in value]
            if len(value) == 1:
                value = value[0]
        _key = key[3:]
        if value and _key not in data:
            data[_key] = value


def info(pdf):
    '''
    Collect metadata for the PDF file at path pdf and return it as a dict.

    Reads the page count, outline ('tableofcontents'), document info and
    dc_* XMP values via PyPDF2; a failure while reading is logged at debug
    level and the values gathered so far are kept. The result is then
    normalized: 'identifier' is replaced by 'isbn' if to_isbn13 accepts
    it, dict values are joined into one string, 'Unknown' values are
    dropped, 'language' goes through get_language, 8 digit dates become
    YYYY-MM-DD, a string 'author' becomes a one element list, HTML is
    stripped from 'description', titles starting with 'Microsoft Word'
    are discarded together with author/producer/creator, and empty
    strings are removed. If settings.server['extract_text'] is set the
    text is extracted to fill 'textsize' and, if still missing, 'isbn'.
    '''
    data = {}
    with open(pdf, 'rb') as fd:
        try:
            pdfreader = PdfFileReader(fd)
            data['pages'] = pdfreader.numPages
            if pdfreader.getIsEncrypted():
                pdfreader.decrypt('')
            toc = parse_tableofcontents(pdfreader)
            if toc:
                data['tableofcontents'] = toc
            _add_document_info(pdfreader, data)
            _add_xmp_metadata(pdfreader, data)
        except Exception:
            logger.debug('FAILED TO PARSE %s', pdf, exc_info=True)

    # disabled alternative based on the pdfinfo command line tool
    '''
    cmd = ['pdfinfo', pdf]
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    for line in stdout.strip().split('\n'):
        parts = line.split(':')
        key = parts[0].lower().strip()
        if key:
            data[key] = ':'.join(parts[1:]).strip()
    for key in data.keys():
        if not data[key]:
            del data[key]
    '''

    if 'identifier' in data:
        value = to_isbn13(data['identifier'])
        if value:
            data['isbn'] = value
        del data['identifier']
    for key, value in data.items():
        if isinstance(value, dict):
            value = ' '.join(list(value.values()))
            data[key] = value.strip()
    for key in list(data):
        if data[key] in ('Unknown',):
            del data[key]
        elif key == 'language':
            # elif: a deleted 'Unknown' language must not be looked up again
            data[key] = get_language(data[key])
    if settings.server['extract_text']:
        text = extract_text(pdf)
        data['textsize'] = len(text)
        if 'isbn' not in data:
            isbn = extract_isbn(text)
            if isbn:
                data['isbn'] = isbn
    # metadata values can be lists, only strings are reformatted below
    date = data.get('date')
    if isinstance(date, str) and len(date) == 8 and date.isdigit():
        data['date'] = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    if 'author' in data and isinstance(data['author'], str):
        data['author'] = [ox.normalize_name(data['author'])]
    if isinstance(data.get('description'), str):
        data['description'] = ox.strip_tags(ox.decode_html(data['description'])).strip()
    title = data.get('title', '')
    if isinstance(title, str) and title.startswith('Microsoft Word'):
        for key in ('title', 'author', 'producer', 'creator'):
            data.pop(key, None)
    for key in list(data):
        if isinstance(data[key], str) and not data[key].strip():
            del data[key]
    return data


'''
#possbile alternative with gs
tmp = tempfile.mkstemp('.txt')[1]
cmd = ['gs', '-dBATCH', '-dNOPAUSE', '-sDEVICE=txtwrite', '-dFirstPage=3', '-dLastPage=5', '-sOutputFile=%s'%tmp, pdf]
'''


def extract_text(pdf):
    '''
    Return the text of pdf as a stripped string.

    Runs pdftotext writing to stdout. On macOS /usr/local/bin/pdftotext
    is used if it exists; otherwise mdimport -d2 is run and the text is
    cut out of the kMDItemTextContent entry it prints on stderr ('' if
    that entry is missing). Bytes that are not valid UTF-8 are replaced
    instead of raising.
    '''
    if sys.platform == 'win32':
        pdf = get_short_path_name(pdf)
    cmd = ['pdftotext', pdf, '-']
    if sys.platform == 'darwin':
        pdftotext = ['/usr/local/bin/pdftotext', pdf, '-']
        if os.path.exists(pdftotext[0]):
            cmd = pdftotext
        else:
            cmd = ['/usr/bin/mdimport', '-d2', pdf]
    p = subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **_popen_kwargs()
    )
    stdout, stderr = p.communicate()
    stdout = stdout.decode('utf-8', 'replace')
    stderr = stderr.decode('utf-8', 'replace')
    if sys.platform == 'darwin' and cmd[0] == '/usr/bin/mdimport':
        if 'kMDItemTextContent' in stderr:
            stdout = stderr.split('kMDItemTextContent = "')[-1].split('\n')[0][:-2]
        else:
            stdout = ''
    return stdout.strip()


def pypdf_extract_text(path):
    '''
    Return the text of all pages of path extracted with PyPDF2.

    slow and bad results

    Pages whose extraction fails are skipped (logged at debug level);
    non-breaking spaces are replaced by plain spaces.
    '''
    pdf = PdfFileReader(path)
    content = []
    for i in range(pdf.getNumPages()):
        try:
            content.append(pdf.getPage(i).extractText())
        except Exception:
            logger.debug('failed to extract text from page %s of %s', i, path, exc_info=True)
    return "\n".join(content).replace("\xa0", " ").strip()


def extract_isbn(text):
    '''Return the first ISBN find_isbns reports for text, or None.'''
    isbns = find_isbns(text)
    if isbns:
        return isbns[0]
    return None