extract tableofcontents from pdf
This commit is contained in:
parent
c3ecb178cf
commit
de984a344e
1 changed files with 27 additions and 0 deletions
|
@ -96,6 +96,30 @@ def page(pdf, page):
|
||||||
return data
|
return data
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
def parse_tableofcontents(reader):
    """Return the PDF outline titles as a single newline-separated string.

    Looks up ``reader.trailer['/Root']['/Outlines']`` and follows the
    ``/First`` -> ``/Next`` chain, collecting each entry's ``/Title``.
    Only that sibling chain is walked; child entries of an outline item
    are not visited.

    Parsing is best effort: a failure while walking the outline is logged
    at debug level and whatever titles were collected up to that point are
    still returned.

    Args:
        reader: object exposing a ``trailer`` mapping.

    Returns:
        The titles joined with newlines and stripped, or an empty string
        when there is no outline or no usable title. ``bytes`` titles are
        decoded as UTF-8 (undecodable bytes ignored) and stripped; titles
        that are neither text nor bytes are skipped.
    """
    titles = []
    # Maps id(entry) -> entry for every outline entry already visited.
    # Keeping the entry itself referenced guarantees its id() is not reused
    # while the walk is in progress.
    visited = {}
    try:
        trailer = reader.trailer
        if '/Root' in trailer and '/Outlines' in trailer['/Root']:
            outlines = trailer['/Root']['/Outlines']
            entry = outlines['/First'] if '/First' in outlines else None
            while entry:
                # A malformed outline can link /Next back to an earlier
                # entry; stop instead of looping forever.
                if id(entry) in visited:
                    break
                visited[id(entry)] = entry
                if '/Title' in entry:
                    titles.append(entry['/Title'])
                entry = entry['/Next'] if '/Next' in entry else None
    except Exception:
        logger.debug('failed to parse pdf outline', exc_info=True)

    # type(u'') is the text type on both Python 2 and Python 3.
    text_type = type(u'')
    lines = []
    for title in titles:
        if isinstance(title, bytes):
            title = title.decode('utf-8', 'ignore').strip()
        # Joining a non-text title would raise and discard the whole table
        # of contents, so such entries are dropped individually instead.
        if isinstance(title, text_type):
            lines.append(title)
    return '\n'.join(lines).strip()
|
||||||
|
|
||||||
def info(pdf):
|
def info(pdf):
|
||||||
data = {}
|
data = {}
|
||||||
with open(pdf, 'rb') as fd:
|
with open(pdf, 'rb') as fd:
|
||||||
|
@ -104,6 +128,9 @@ def info(pdf):
|
||||||
data['pages'] = pdfreader.numPages
|
data['pages'] = pdfreader.numPages
|
||||||
if pdfreader.getIsEncrypted():
|
if pdfreader.getIsEncrypted():
|
||||||
pdfreader.decrypt('')
|
pdfreader.decrypt('')
|
||||||
|
toc = parse_tableofcontents(pdfreader)
|
||||||
|
if toc:
|
||||||
|
data['tableofcontents'] = toc
|
||||||
info = pdfreader.getDocumentInfo()
|
info = pdfreader.getDocumentInfo()
|
||||||
if info:
|
if info:
|
||||||
for key in info:
|
for key in info:
|
||||||
|
|
Loading…
Reference in a new issue