From 18a72db81108e5adccc359cd630a4cf183a24bf0 Mon Sep 17 00:00:00 2001 From: j Date: Tue, 12 Jan 2016 00:23:11 +0530 Subject: [PATCH] cleanup toc and extract for all epubs --- oml/media/epub.py | 42 ++++++++++++++++++++++++++++++------------ oml/setup.py | 14 ++++++++++++++ 2 files changed, 44 insertions(+), 12 deletions(-) diff --git a/oml/media/epub.py b/oml/media/epub.py index ca4a6b2..ed5c2da 100644 --- a/oml/media/epub.py +++ b/oml/media/epub.py @@ -111,18 +111,36 @@ def info(epub): data[key] = value.split(', ') else: data[key] = value - guide = info.findall('{http://www.idpf.org/2007/opf}guide') - if guide: - for ref in guide[0].findall('{http://www.idpf.org/2007/opf}reference'): - if ref.attrib.get('type') == 'toc': - filename = unquote(ref.attrib['href']).split('#')[0] - filename = os.path.normpath(os.path.join(os.path.dirname(opf[0]), filename)) - if filename in files: - toc = z.read(filename) - if toc: - doc = lxml.html.document_fromstring(toc) - data['tableofcontents'] = '\n'.join([a.text_content() for a in doc.xpath('//a')]) - + toc = [f for f in files if 'toc.ncx' in f] + if toc: + try: + _toc = ET.fromstring(z.read(toc[0])) + nav_map = _toc.find('{http://www.daisy.org/z3986/2005/ncx/}navMap') + except: + logger.debug('failed to parse toc', exc_info=True) + nav_map = None + if nav_map: + contents = [] + for point in nav_map.findall('{http://www.daisy.org/z3986/2005/ncx/}navPoint'): + label = point.find('{http://www.daisy.org/z3986/2005/ncx/}navLabel') + if label: + txt = label.getchildren()[0].text + if txt: + contents.append(txt) + if contents: + data['tableofcontents'] = '\n'.join(contents).strip() + if not 'tableofcontents' in data: + guide = info.find('{http://www.idpf.org/2007/opf}guide') + if guide: + for ref in guide.findall('{http://www.idpf.org/2007/opf}reference'): + if ref.attrib.get('type') == 'toc': + filename = unquote(ref.attrib['href']).split('#')[0] + filename = os.path.normpath(os.path.join(os.path.dirname(opf[0]), filename)) + if filename in files: + toc = z.read(filename) + if toc: + doc = lxml.html.document_fromstring(toc) + data['tableofcontents'] = '\n'.join([a.text_content() for a in doc.xpath('//a')]).strip() if 'description' in data: data['description'] = strip_tags(decode_html(data['description'])) text = extract_text(epub) diff --git a/oml/setup.py b/oml/setup.py index a205519..549e5d2 100644 --- a/oml/setup.py +++ b/oml/setup.py @@ -330,6 +330,20 @@ def upgrade_db(old, new=None): if u.id != settings.USER_ID: Metadata.get_or_create(u.id, i.id, i.meta, False) session.commit() + if old <= '20160111-617-206e39c' and new > '20160111-617-206e39c': + from item.models import File + import media + with db.session() as session: + for f in File.query: + if f.info.get('extension') == 'epub': + if not 'tableofcontents' in f.item.meta: + f.info = media.metadata(f.fullpath()) + if 'tableofcontents' in f.info: + f.item.meta['tableofcontents'] = f.info['tableofcontents'] + f.item.update() + session.add(f.item) + session.add(f) + session.commit() if old <= '20140527-120-3cb9819': run_sql('CREATE INDEX ix_find_findvalue ON find (findvalue)')