better pdf parsing
This commit is contained in:
parent
ccd3b166d0
commit
ebc0b95022
1 changed file with 19 additions and 4 deletions
|
@@ -8,7 +8,9 @@ import subprocess
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
from glob import glob
|
from glob import glob
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import ox
|
||||||
from PyPDF2 import PdfFileReader
|
from PyPDF2 import PdfFileReader
|
||||||
import stdnum.isbn
|
import stdnum.isbn
|
||||||
|
|
||||||
|
@@ -108,9 +110,15 @@ def info(pdf):
|
||||||
for key in info:
|
for key in info:
|
||||||
if info[key]:
|
if info[key]:
|
||||||
try:
|
try:
|
||||||
if isinstance(info[key], bytes):
|
value = info[key]
|
||||||
info[key] = info[key].decode('utf-16')
|
if len(value) == 1:
|
||||||
data[key[1:].lower()] = info[key]
|
value = value[0]
|
||||||
|
if isinstance(value, bytes):
|
||||||
|
value = value.decode('utf-16')
|
||||||
|
if value in ('Unknown',):
|
||||||
|
value = None
|
||||||
|
if value:
|
||||||
|
data[key[1:].lower()] = value
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@@ -122,9 +130,16 @@ def info(pdf):
|
||||||
if isinstance(value, dict) and 'x-default' in value:
|
if isinstance(value, dict) and 'x-default' in value:
|
||||||
value = value['x-default']
|
value = value['x-default']
|
||||||
elif isinstance(value, list):
|
elif isinstance(value, list):
|
||||||
value = [v.strip() if isinstance(v, str) else v for v in value if v.strip()]
|
value = [v.strip() if isinstance(v, str) else v for v in value if v]
|
||||||
|
value = [v.strftime('%Y-%m-%d') if isinstance(v, datetime) else v for v in value]
|
||||||
|
if len(value) == 1:
|
||||||
|
value = value[0]
|
||||||
|
if value in ('Unknown',):
|
||||||
|
value = None
|
||||||
_key = key[3:]
|
_key = key[3:]
|
||||||
if value and _key not in data:
|
if value and _key not in data:
|
||||||
|
if _key == 'language':
|
||||||
|
value = ox.iso.codeToLang(value)
|
||||||
data[_key] = value
|
data[_key] = value
|
||||||
except:
|
except:
|
||||||
logger.debug('FAILED TO PARSE %s', pdf, exc_info=1)
|
logger.debug('FAILED TO PARSE %s', pdf, exc_info=1)
|
||||||
|
|
Loading…
Reference in a new issue