TOC href can contain a '#' fragment: strip it and only read the TOC file if it exists in the archive
This commit is contained in:
parent
02e040d9f5
commit
bb09596566
1 changed file with 8 additions and 6 deletions
|
@ -89,7 +89,8 @@ def info(epub):
|
||||||
except zipfile.BadZipFile:
|
except zipfile.BadZipFile:
|
||||||
logger.debug('invalid epub file %s', epub)
|
logger.debug('invalid epub file %s', epub)
|
||||||
return data
|
return data
|
||||||
opf = [f.filename for f in z.filelist if f.filename.endswith('opf')]
|
files = [f.filename for f in z.filelist]
|
||||||
|
opf = [f for f in files if f.endswith('opf')]
|
||||||
if opf:
|
if opf:
|
||||||
info = ET.fromstring(z.read(opf[0]))
|
info = ET.fromstring(z.read(opf[0]))
|
||||||
metadata = info.findall('{http://www.idpf.org/2007/opf}metadata')
|
metadata = info.findall('{http://www.idpf.org/2007/opf}metadata')
|
||||||
|
@ -114,12 +115,13 @@ def info(epub):
|
||||||
if guide:
|
if guide:
|
||||||
for ref in guide[0].findall('{http://www.idpf.org/2007/opf}reference'):
|
for ref in guide[0].findall('{http://www.idpf.org/2007/opf}reference'):
|
||||||
if ref.attrib.get('type') == 'toc':
|
if ref.attrib.get('type') == 'toc':
|
||||||
filename = unquote(ref.attrib['href'])
|
filename = unquote(ref.attrib['href']).split('#')[0]
|
||||||
filename = os.path.normpath(os.path.join(os.path.dirname(opf[0]), filename))
|
filename = os.path.normpath(os.path.join(os.path.dirname(opf[0]), filename))
|
||||||
toc = z.read(filename)
|
if filename in files:
|
||||||
if toc:
|
toc = z.read(filename)
|
||||||
doc = lxml.html.document_fromstring(toc)
|
if toc:
|
||||||
data['tableofcontents'] = '\n'.join([a.text_content() for a in doc.xpath('//a')])
|
doc = lxml.html.document_fromstring(toc)
|
||||||
|
data['tableofcontents'] = '\n'.join([a.text_content() for a in doc.xpath('//a')])
|
||||||
|
|
||||||
if 'description' in data:
|
if 'description' in data:
|
||||||
data['description'] = strip_tags(decode_html(data['description']))
|
data['description'] = strip_tags(decode_html(data['description']))
|
||||||
|
|
Loading…
Reference in a new issue