def parse_tableofcontents(reader):
    """Return the top-level PDF outline titles as newline-separated text.

    Walks the linked list of outline items starting at
    ``reader.trailer['/Root']['/Outlines']['/First']`` and follows each
    item's ``/Next`` entry, collecting every ``/Title`` it finds. Only this
    sibling chain is followed; nested (child) outline items are not visited.

    Args:
        reader: object exposing a mapping-like ``trailer`` attribute
            (presumably a PyPDF2 ``PdfFileReader`` -- confirm with caller).

    Returns:
        The collected titles joined with ``'\\n'`` and stripped, or an empty
        string when there is no outline or it could not be parsed.
    """
    titles = []
    try:
        trailer = reader.trailer
        if '/Root' in trailer and '/Outlines' in trailer['/Root']:
            outlines = trailer['/Root']['/Outlines']
            node = outlines['/First'] if '/First' in outlines else None
            # Malformed files can link /Next back to an earlier item; remember
            # visited nodes so a cycle cannot loop forever. The nodes are kept
            # as values so their ids stay unique for the duration of the walk.
            visited = {}
            while node and id(node) not in visited:
                visited[id(node)] = node
                if '/Title' in node:
                    titles.append(node['/Title'])
                node = node['/Next'] if '/Next' in node else None
    except Exception:
        # Best effort: a broken outline must not prevent the caller from
        # proceeding; keep whatever titles were collected before the failure.
        logger.debug('failed to parse pdf outline', exc_info=True)

    lines = []
    for title in titles:
        if isinstance(title, bytes):
            title = title.decode('utf-8', 'ignore').strip()
        # Skip anything that is not text so a single odd entry cannot make
        # the join below fail and discard the whole table of contents.
        if isinstance(title, str):
            lines.append(title)
    return '\n'.join(lines).strip()