toc href can contain a '#' fragment identifier
This commit is contained in:
parent
02e040d9f5
commit
bb09596566
1 changed file with 8 additions and 6 deletions
|
@ -89,7 +89,8 @@ def info(epub):
|
|||
except zipfile.BadZipFile:
|
||||
logger.debug('invalid epub file %s', epub)
|
||||
return data
|
||||
opf = [f.filename for f in z.filelist if f.filename.endswith('opf')]
|
||||
files = [f.filename for f in z.filelist]
|
||||
opf = [f for f in files if f.endswith('opf')]
|
||||
if opf:
|
||||
info = ET.fromstring(z.read(opf[0]))
|
||||
metadata = info.findall('{http://www.idpf.org/2007/opf}metadata')
|
||||
|
@ -114,12 +115,13 @@ def info(epub):
|
|||
if guide:
|
||||
for ref in guide[0].findall('{http://www.idpf.org/2007/opf}reference'):
|
||||
if ref.attrib.get('type') == 'toc':
|
||||
filename = unquote(ref.attrib['href'])
|
||||
filename = unquote(ref.attrib['href']).split('#')[0]
|
||||
filename = os.path.normpath(os.path.join(os.path.dirname(opf[0]), filename))
|
||||
toc = z.read(filename)
|
||||
if toc:
|
||||
doc = lxml.html.document_fromstring(toc)
|
||||
data['tableofcontents'] = '\n'.join([a.text_content() for a in doc.xpath('//a')])
|
||||
if filename in files:
|
||||
toc = z.read(filename)
|
||||
if toc:
|
||||
doc = lxml.html.document_fromstring(toc)
|
||||
data['tableofcontents'] = '\n'.join([a.text_content() for a in doc.xpath('//a')])
|
||||
|
||||
if 'description' in data:
|
||||
data['description'] = strip_tags(decode_html(data['description']))
|
||||
|
|
Loading…
Reference in a new issue