From d866b4de9171d4a356bf3711ea38d02ead8429d5 Mon Sep 17 00:00:00 2001 From: j Date: Wed, 6 Jan 2016 18:40:23 +0530 Subject: [PATCH] parse epubs without manifest --- oml/media/epub.py | 114 +++++++++++++++++++++++++--------------------- 1 file changed, 61 insertions(+), 53 deletions(-) diff --git a/oml/media/epub.py b/oml/media/epub.py index 6d8b6fb..d8ddea8 100644 --- a/oml/media/epub.py +++ b/oml/media/epub.py @@ -39,43 +39,49 @@ def cover(path): if opf: #logger.debug('opf: %s', z.read(opf[0]).decode()) info = ET.fromstring(z.read(opf[0])) - metadata = info.findall('{http://www.idpf.org/2007/opf}metadata')[0] - manifest = info.findall('{http://www.idpf.org/2007/opf}manifest')[0] - for e in metadata.getchildren(): - if e.tag == '{http://www.idpf.org/2007/opf}meta' and e.attrib.get('name') == 'cover': - cover_id = e.attrib['content'] - for e in manifest.getchildren(): - if e.attrib['id'] == cover_id: - filename = unquote(e.attrib['href']) - filename = os.path.normpath(os.path.join(os.path.dirname(opf[0]), filename)) - if filename in files: - return use(filename) - images = [e for e in manifest.getchildren() if 'image' in e.attrib['media-type']] - if images: - image_data = [] - for e in images: - filename = unquote(e.attrib['href']) - filename = os.path.normpath(os.path.join(os.path.dirname(opf[0]), filename)) - if filename in files: - image_data.append((filename, z.read(filename))) - if image_data: - image_data.sort(key=lambda i: len(i[1])) - data = image_data[-1][1] - logger.debug('using %s', image_data[-1][0]) - return data - for e in manifest.getchildren(): - if 'html' in e.attrib['media-type']: - filename = unquote(e.attrib['href']) - filename = os.path.normpath(os.path.join(os.path.dirname(opf[0]), filename)) - html = z.read(filename).decode('utf-8', 'ignore') - img = re.compile('