extract tableofcontents from pdf

2016-01-12 14:57:08 +05:30 · 2016-01-12 14:57:08 +05:30 · de984a344e
commit de984a344e
parent c3ecb178cf
1 changed files with 27 additions and 0 deletions
--- a/oml/media/pdf.py
+++ b/oml/media/pdf.py
@ -96,6 +96,30 @@ def page(pdf, page):
    return data
 '''
 def parse_tableofcontents(reader):
    '''
    Return the titles of the top-level outline (bookmark) entries of a pdf,
    joined with newlines.

    reader must expose a `trailer` mapping; the outline is looked up at
    trailer['/Root']['/Outlines'] (presumably a PyPDF2 reader, verify
    against the caller). Starting at the outline's '/First' entry, the
    '/Next' links are followed and every '/Title' found is collected.
    Nested child entries are not visited.

    Parsing is best effort: any failure is logged at debug level and an
    empty string (or the titles gathered so far) is returned instead of
    raising.
    '''
    titles = []
    try:
        if '/Root' in reader.trailer and '/Outlines' in reader.trailer['/Root']:
            outlines = reader.trailer['/Root']['/Outlines']
            if '/First' in outlines:
                title = outlines['/First']
                # Malformed files can link '/Next' back to an earlier entry.
                # Visited entries are kept alive in `seen` so their id() can
                # not be reused; meeting a known id again ends the walk.
                seen = {}
                while title and id(title) not in seen:
                    seen[id(title)] = title
                    if '/Title' in title:
                        titles.append(title['/Title'])
                    title = title['/Next'] if '/Next' in title else None
    except Exception:
        logger.debug('failed to parse pdf outline', exc_info=True)
    # Bound before the try so the return below is always defined, even
    # when decoding or joining the titles fails.
    toc = ''
    try:
        titles = [
            title.decode('utf-8', 'ignore').strip() if isinstance(title, bytes) else title
            for title in titles
        ]
        toc = '\n'.join(titles).strip()
    except Exception:
        logger.debug('failed to decode outline', exc_info=True)
    return toc
 def info(pdf):
    data = {}
    with open(pdf, 'rb') as fd:
@ -104,6 +128,9 @@ def info(pdf):
            data['pages'] = pdfreader.numPages
            if pdfreader.getIsEncrypted():
                pdfreader.decrypt('')
            toc = parse_tableofcontents(pdfreader)
            if toc:
                data['tableofcontents'] = toc
            info = pdfreader.getDocumentInfo()
            if info:
                for key in info: