extract tableofcontents from pdf

This commit is contained in:
j 2016-01-12 14:57:08 +05:30
parent c3ecb178cf
commit de984a344e
1 changed files with 27 additions and 0 deletions

View File

@ -96,6 +96,30 @@ def page(pdf, page):
return data
'''
def parse_tableofcontents(reader):
    """Return the titles of the top-level PDF outline entries as one string.

    Starting at ``/Root -> /Outlines -> /First`` in ``reader.trailer``, the
    sibling chain is followed through ``/Next`` and every ``/Title`` found is
    collected. Only this sibling chain is walked; children of an entry are
    not descended into.

    Args:
        reader: object exposing a mapping-like ``trailer`` attribute
            (presumably a PyPDF2 ``PdfFileReader`` -- confirm against caller).

    Returns:
        The collected titles joined with newlines and stripped of surrounding
        whitespace, or ``''`` when there is no outline or it cannot be read.
        This function never raises for a malformed outline: titles gathered
        before a failure are kept.
    """
    titles = []
    try:
        trailer = reader.trailer
        if '/Root' in trailer and '/Outlines' in trailer['/Root']:
            outlines = trailer['/Root']['/Outlines']
            node = outlines['/First'] if '/First' in outlines else None
            # Map id -> node so visited nodes stay alive and their ids cannot
            # be reused; a malformed outline whose /Next chain loops back on
            # itself would otherwise never terminate.
            visited = {}
            while node and id(node) not in visited:
                visited[id(node)] = node
                if '/Title' in node:
                    titles.append(node['/Title'])
                node = node['/Next'] if '/Next' in node else None
    except Exception:
        # Best effort: keep whatever was collected before the failure.
        logger.debug('failed to parse pdf outline', exc_info=True)

    # str on Python 3, unicode on Python 2 -- avoids naming `unicode` directly.
    text_type = type(u'')
    lines = []
    for title in titles:
        if isinstance(title, bytes):
            lines.append(title.decode('utf-8', 'ignore').strip())
        elif isinstance(title, text_type):
            lines.append(title)
        # Any other object (null, number, ...) is not a usable title; skipping
        # it keeps one bad entry from discarding the whole table of contents.
    return '\n'.join(lines).strip()
def info(pdf):
data = {}
with open(pdf, 'rb') as fd:
@ -104,6 +128,9 @@ def info(pdf):
data['pages'] = pdfreader.numPages
if pdfreader.getIsEncrypted():
pdfreader.decrypt('')
toc = parse_tableofcontents(pdfreader)
if toc:
data['tableofcontents'] = toc
info = pdfreader.getDocumentInfo()
if info:
for key in info: