extract tableofcontents from pdf

2016-01-12 14:57:08 +05:30 · 2016-01-12 14:57:08 +05:30 · de984a344e
commit de984a344e
parent c3ecb178cf
1 changed files with 27 additions and 0 deletions
--- a/oml/media/pdf.py
+++ b/oml/media/pdf.py
@ -96,6 +96,30 @@ def page(pdf, page):
    return data
 '''

+def parse_tableofcontents(reader):
+    '''Return the top-level PDF outline titles of reader, one per line.
+
+    Starting at reader.trailer['/Root']['/Outlines']['/First'], the '/Next'
+    links are followed and every '/Title' found is collected. bytes titles
+    are decoded as UTF-8 (undecodable bytes are dropped) and stripped; other
+    titles are used unchanged. Errors while walking or decoding the outline
+    are logged at debug level instead of being raised, and an empty string
+    is returned when no table of contents could be built.
+    '''
+    titles = []
+    try:
+        if '/Root' in reader.trailer and '/Outlines' in reader.trailer['/Root']:
+            outlines = reader.trailer['/Root']['/Outlines']
+            node = outlines['/First'] if '/First' in outlines else None
+            # NOTE(review): a cyclic '/Next' chain in a malformed file would
+            # keep this loop running -- consider a guard if inputs are untrusted.
+            while node:
+                if '/Title' in node:
+                    titles.append(node['/Title'])
+                node = node['/Next'] if '/Next' in node else None
+    except Exception:
+        # Best effort: titles gathered before the failure are still used.
+        logger.debug('failed to parse pdf outline', exc_info=True)
+    # Bind the result before decoding so that a failure below still returns
+    # '' rather than reaching the return statement with an unbound name.
+    toc = ''
+    try:
+        titles = [
+            title.decode('utf-8', 'ignore').strip() if isinstance(title, bytes) else title
+            for title in titles
+        ]
+        toc = '\n'.join(titles).strip()
+    except Exception:
+        logger.debug('failed to decode outline', exc_info=True)
+    return toc
+
 def info(pdf):
    data = {}
    with open(pdf, 'rb') as fd:
@ -104,6 +128,9 @@ def info(pdf):
            data['pages'] = pdfreader.numPages
            if pdfreader.getIsEncrypted():
                pdfreader.decrypt('')
+            toc = parse_tableofcontents(pdfreader)
+            if toc:
+                data['tableofcontents'] = toc
            info = pdfreader.getDocumentInfo()
            if info:
                for key in info: