Clean up TOC handling and text extraction for all EPUBs

This commit is contained in:
j 2016-01-12 00:23:11 +05:30
commit 18a72db811
2 changed files with 44 additions and 12 deletions

View file

@@ -111,18 +111,36 @@ def info(epub):
data[key] = value.split(', ')
else:
data[key] = value
guide = info.findall('{http://www.idpf.org/2007/opf}guide')
if guide:
for ref in guide[0].findall('{http://www.idpf.org/2007/opf}reference'):
if ref.attrib.get('type') == 'toc':
filename = unquote(ref.attrib['href']).split('#')[0]
filename = os.path.normpath(os.path.join(os.path.dirname(opf[0]), filename))
if filename in files:
toc = z.read(filename)
if toc:
doc = lxml.html.document_fromstring(toc)
data['tableofcontents'] = '\n'.join([a.text_content() for a in doc.xpath('//a')])
toc = [f for f in files if 'toc.ncx' in f]
if toc:
try:
_toc = ET.fromstring(z.read(toc[0]))
nav_map = _toc.find('{http://www.daisy.org/z3986/2005/ncx/}navMap')
except:
logger.debug('failed to parse toc', exc_info=True)
nav_map = None
if nav_map:
contents = []
for point in nav_map.findall('{http://www.daisy.org/z3986/2005/ncx/}navPoint'):
label = point.find('{http://www.daisy.org/z3986/2005/ncx/}navLabel')
if label:
txt = label.getchildren()[0].text
if txt:
contents.append(txt)
if contents:
data['tableofcontents'] = '\n'.join(contents).strip()
if not 'tableofcontents' in data:
guide = info.find('{http://www.idpf.org/2007/opf}guide')
if guide:
for ref in guide.findall('{http://www.idpf.org/2007/opf}reference'):
if ref.attrib.get('type') == 'toc':
filename = unquote(ref.attrib['href']).split('#')[0]
filename = os.path.normpath(os.path.join(os.path.dirname(opf[0]), filename))
if filename in files:
toc = z.read(filename)
if toc:
doc = lxml.html.document_fromstring(toc)
data['tableofcontents'] = '\n'.join([a.text_content() for a in doc.xpath('//a')]).strip()
if 'description' in data:
data['description'] = strip_tags(decode_html(data['description']))
text = extract_text(epub)