better pdf parsing

This commit is contained in:
j 2015-12-24 20:30:14 +05:30
parent ccd3b166d0
commit ebc0b95022

View file

@@ -8,7 +8,9 @@ import subprocess
import os import os
import shutil import shutil
from glob import glob from glob import glob
from datetime import datetime
import ox
from PyPDF2 import PdfFileReader from PyPDF2 import PdfFileReader
import stdnum.isbn import stdnum.isbn
@@ -108,9 +110,15 @@ def info(pdf):
for key in info: for key in info:
if info[key]: if info[key]:
try: try:
if isinstance(info[key], bytes): value = info[key]
info[key] = info[key].decode('utf-16') if len(value) == 1:
data[key[1:].lower()] = info[key] value = value[0]
if isinstance(value, bytes):
value = value.decode('utf-16')
if value in ('Unknown',):
value = None
if value:
data[key[1:].lower()] = value
except: except:
pass pass
@@ -122,9 +130,16 @@ def info(pdf):
if isinstance(value, dict) and 'x-default' in value: if isinstance(value, dict) and 'x-default' in value:
value = value['x-default'] value = value['x-default']
elif isinstance(value, list): elif isinstance(value, list):
value = [v.strip() if isinstance(v, str) else v for v in value if v.strip()] value = [v.strip() if isinstance(v, str) else v for v in value if v]
value = [v.strftime('%Y-%m-%d') if isinstance(v, datetime) else v for v in value]
if len(value) == 1:
value = value[0]
if value in ('Unknown',):
value = None
_key = key[3:] _key = key[3:]
if value and _key not in data: if value and _key not in data:
if _key == 'language':
value = ox.iso.codeToLang(value)
data[_key] = value data[_key] = value
except: except:
logger.debug('FAILED TO PARSE %s', pdf, exc_info=1) logger.debug('FAILED TO PARSE %s', pdf, exc_info=1)