From a24061518a034cf475754a351279f502db5ecfe7 Mon Sep 17 00:00:00 2001 From: j Date: Mon, 16 Nov 2015 16:02:45 +0100 Subject: [PATCH] better epub parsing - don't fail if epubs are invalid zip files - handle URL-quoted filenames - don't fail if file is missing --- oml/media/epub.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/oml/media/epub.py b/oml/media/epub.py index fa4361a..a2ed310 100644 --- a/oml/media/epub.py +++ b/oml/media/epub.py @@ -7,6 +7,7 @@ import xml.etree.ElementTree as ET import zipfile from io import BytesIO import re +from urllib.parse import unquote from PIL import Image import stdnum.isbn @@ -18,34 +19,42 @@ logger = logging.getLogger('oml.media.epub') def cover(path): logger.debug('cover %s', path) - z = zipfile.ZipFile(path) data = None + try: + z = zipfile.ZipFile(path) + except zipfile.BadZipFile: + logger.debug('invalid epub file %s', path) + return data for f in z.filelist: if 'cover' in f.filename.lower() and f.filename.split('.')[-1] in ('jpg', 'jpeg', 'png'): logger.debug('using %s', f.filename) data = z.read(f.filename) break if not data: - opf = [f.filename for f in z.filelist if f.filename.endswith('opf')] + files = [f.filename for f in z.filelist] + opf = [f for f in files if f.endswith('opf')] if opf: info = ET.fromstring(z.read(opf[0])) manifest = info.findall('{http://www.idpf.org/2007/opf}manifest')[0] for e in manifest.getchildren(): if 'image' in e.attrib['media-type']: - filename = e.attrib['href'] + filename = unquote(e.attrib['href']) filename = os.path.normpath(os.path.join(os.path.dirname(opf[0]), filename)) - data = z.read(filename) - break + if filename in files: + data = z.read(filename) + break elif 'html' in e.attrib['media-type']: - filename = e.attrib['href'] + filename = unquote(e.attrib['href']) filename = os.path.normpath(os.path.join(os.path.dirname(opf[0]), filename)) html = z.read(filename).decode('utf-8') img = re.compile('