extract tableofcontents from pdf
This commit is contained in:
parent
c3ecb178cf
commit
de984a344e
1 changed file with 27 additions and 0 deletions
|
@ -96,6 +96,30 @@ def page(pdf, page):
|
|||
return data
|
||||
'''
|
||||
|
||||
def parse_tableofcontents(reader):
    """Return the top-level outline (bookmark) titles of a PDF as one string.

    Walks the ``/First`` -> ``/Next`` chain found under
    ``reader.trailer['/Root']['/Outlines']`` and collects the ``/Title`` of
    every entry that has one. Children of an entry are not visited.

    Args:
        reader: object exposing a mapping-like ``trailer`` attribute
            (presumably a PyPDF2 reader -- confirm against the caller).

    Returns:
        The collected titles joined with newlines and stripped, or ``''``
        when there is no outline or it could not be read. Errors are logged
        at debug level and never propagated.
    """
    titles = []
    try:
        trailer = reader.trailer
        if '/Root' in trailer and '/Outlines' in trailer['/Root']:
            outlines = trailer['/Root']['/Outlines']
            entry = outlines['/First'] if '/First' in outlines else None
            # Remember visited entries (keyed by identity, since the entries
            # may be unhashable mappings) so that a malformed, cyclic /Next
            # chain ends the walk instead of looping forever. Holding the
            # entry as the value keeps it alive, so its id() stays unique.
            # NOTE(review): relies on the reader returning the same object
            # for the same outline entry -- confirm for the PDF library used.
            seen = {}
            while entry and id(entry) not in seen:
                seen[id(entry)] = entry
                if '/Title' in entry:
                    titles.append(entry['/Title'])
                entry = entry['/Next'] if '/Next' in entry else None
    except Exception:
        # Best effort: keep whatever titles were collected before the error.
        logger.debug('failed to parse pdf outline', exc_info=True)

    # Bind the result up front so a failure below still returns a string
    # instead of raising UnboundLocalError at the return statement.
    toc = ''
    try:
        decoded = [
            item.decode('utf-8', 'ignore').strip() if isinstance(item, bytes) else item
            for item in titles
        ]
        toc = '\n'.join(decoded).strip()
    except Exception:
        logger.debug('failed to decode outline', exc_info=True)
    return toc
|
||||
|
||||
def info(pdf):
|
||||
data = {}
|
||||
with open(pdf, 'rb') as fd:
|
||||
|
@ -104,6 +128,9 @@ def info(pdf):
|
|||
data['pages'] = pdfreader.numPages
|
||||
if pdfreader.getIsEncrypted():
|
||||
pdfreader.decrypt('')
|
||||
toc = parse_tableofcontents(pdfreader)
|
||||
if toc:
|
||||
data['tableofcontents'] = toc
|
||||
info = pdfreader.getDocumentInfo()
|
||||
if info:
|
||||
for key in info:
|
||||
|
|
Loading…
Reference in a new issue