TOC href can contain a '#' fragment

This commit is contained in:
j 2016-01-11 19:25:33 +05:30
parent 02e040d9f5
commit bb09596566

View file

@ -89,7 +89,8 @@ def info(epub):
except zipfile.BadZipFile: except zipfile.BadZipFile:
logger.debug('invalid epub file %s', epub) logger.debug('invalid epub file %s', epub)
return data return data
opf = [f.filename for f in z.filelist if f.filename.endswith('opf')] files = [f.filename for f in z.filelist]
opf = [f for f in files if f.endswith('opf')]
if opf: if opf:
info = ET.fromstring(z.read(opf[0])) info = ET.fromstring(z.read(opf[0]))
metadata = info.findall('{http://www.idpf.org/2007/opf}metadata') metadata = info.findall('{http://www.idpf.org/2007/opf}metadata')
@ -114,12 +115,13 @@ def info(epub):
if guide: if guide:
for ref in guide[0].findall('{http://www.idpf.org/2007/opf}reference'): for ref in guide[0].findall('{http://www.idpf.org/2007/opf}reference'):
if ref.attrib.get('type') == 'toc': if ref.attrib.get('type') == 'toc':
filename = unquote(ref.attrib['href']) filename = unquote(ref.attrib['href']).split('#')[0]
filename = os.path.normpath(os.path.join(os.path.dirname(opf[0]), filename)) filename = os.path.normpath(os.path.join(os.path.dirname(opf[0]), filename))
toc = z.read(filename) if filename in files:
if toc: toc = z.read(filename)
doc = lxml.html.document_fromstring(toc) if toc:
data['tableofcontents'] = '\n'.join([a.text_content() for a in doc.xpath('//a')]) doc = lxml.html.document_fromstring(toc)
data['tableofcontents'] = '\n'.join([a.text_content() for a in doc.xpath('//a')])
if 'description' in data: if 'description' in data:
data['description'] = strip_tags(decode_html(data['description'])) data['description'] = strip_tags(decode_html(data['description']))