Clean up TOC parsing and extract it for all EPUBs
This commit is contained in:
parent
6061f2d754
commit
18a72db811
2 changed files with 44 additions and 12 deletions
|
@ -111,18 +111,36 @@ def info(epub):
|
||||||
data[key] = value.split(', ')
|
data[key] = value.split(', ')
|
||||||
else:
|
else:
|
||||||
data[key] = value
|
data[key] = value
|
||||||
guide = info.findall('{http://www.idpf.org/2007/opf}guide')
|
toc = [f for f in files if 'toc.ncx' in f]
|
||||||
if guide:
|
if toc:
|
||||||
for ref in guide[0].findall('{http://www.idpf.org/2007/opf}reference'):
|
try:
|
||||||
if ref.attrib.get('type') == 'toc':
|
_toc = ET.fromstring(z.read(toc[0]))
|
||||||
filename = unquote(ref.attrib['href']).split('#')[0]
|
nav_map = _toc.find('{http://www.daisy.org/z3986/2005/ncx/}navMap')
|
||||||
filename = os.path.normpath(os.path.join(os.path.dirname(opf[0]), filename))
|
except:
|
||||||
if filename in files:
|
logger.debug('failed to parse toc', exc_info=True)
|
||||||
toc = z.read(filename)
|
nav_map = None
|
||||||
if toc:
|
if nav_map:
|
||||||
doc = lxml.html.document_fromstring(toc)
|
contents = []
|
||||||
data['tableofcontents'] = '\n'.join([a.text_content() for a in doc.xpath('//a')])
|
for point in nav_map.findall('{http://www.daisy.org/z3986/2005/ncx/}navPoint'):
|
||||||
|
label = point.find('{http://www.daisy.org/z3986/2005/ncx/}navLabel')
|
||||||
|
if label:
|
||||||
|
txt = label.getchildren()[0].text
|
||||||
|
if txt:
|
||||||
|
contents.append(txt)
|
||||||
|
if contents:
|
||||||
|
data['tableofcontents'] = '\n'.join(contents).strip()
|
||||||
|
if not 'tableofcontents' in data:
|
||||||
|
guide = info.find('{http://www.idpf.org/2007/opf}guide')
|
||||||
|
if guide:
|
||||||
|
for ref in guide.findall('{http://www.idpf.org/2007/opf}reference'):
|
||||||
|
if ref.attrib.get('type') == 'toc':
|
||||||
|
filename = unquote(ref.attrib['href']).split('#')[0]
|
||||||
|
filename = os.path.normpath(os.path.join(os.path.dirname(opf[0]), filename))
|
||||||
|
if filename in files:
|
||||||
|
toc = z.read(filename)
|
||||||
|
if toc:
|
||||||
|
doc = lxml.html.document_fromstring(toc)
|
||||||
|
data['tableofcontents'] = '\n'.join([a.text_content() for a in doc.xpath('//a')]).strip()
|
||||||
if 'description' in data:
|
if 'description' in data:
|
||||||
data['description'] = strip_tags(decode_html(data['description']))
|
data['description'] = strip_tags(decode_html(data['description']))
|
||||||
text = extract_text(epub)
|
text = extract_text(epub)
|
||||||
|
|
14
oml/setup.py
14
oml/setup.py
|
@ -330,6 +330,20 @@ def upgrade_db(old, new=None):
|
||||||
if u.id != settings.USER_ID:
|
if u.id != settings.USER_ID:
|
||||||
Metadata.get_or_create(u.id, i.id, i.meta, False)
|
Metadata.get_or_create(u.id, i.id, i.meta, False)
|
||||||
session.commit()
|
session.commit()
|
||||||
|
if old <= '20160111-617-206e39c' and new > '20160111-617-206e39c':
|
||||||
|
from item.models import File
|
||||||
|
import media
|
||||||
|
with db.session() as session:
|
||||||
|
for f in File.query:
|
||||||
|
if f.info.get('extension') == 'epub':
|
||||||
|
if not 'tableofcontents' in f.item.meta:
|
||||||
|
f.info = media.metadata(f.fullpath())
|
||||||
|
if 'tableofcontents' in f.info:
|
||||||
|
f.item.meta['tableofcontents'] = f.info['tableofcontents']
|
||||||
|
f.item.update()
|
||||||
|
session.add(f.item)
|
||||||
|
session.add(f)
|
||||||
|
session.commit()
|
||||||
|
|
||||||
if old <= '20140527-120-3cb9819':
|
if old <= '20140527-120-3cb9819':
|
||||||
run_sql('CREATE INDEX ix_find_findvalue ON find (findvalue)')
|
run_sql('CREATE INDEX ix_find_findvalue ON find (findvalue)')
|
||||||
|
|
Loading…
Reference in a new issue