cleanup pdf
This commit is contained in:
parent
935f8d7f2b
commit
c5afc46af1
1 changed file with 8 additions and 9 deletions
|
@@ -115,9 +115,6 @@ def info(pdf):
|
||||||
value = value[0]
|
value = value[0]
|
||||||
if isinstance(value, bytes):
|
if isinstance(value, bytes):
|
||||||
value = value.decode('utf-16')
|
value = value.decode('utf-16')
|
||||||
if value in ('Unknown',):
|
|
||||||
value = None
|
|
||||||
if value:
|
|
||||||
data[key[1:].lower()] = value
|
data[key[1:].lower()] = value
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
@@ -134,15 +131,12 @@ def info(pdf):
|
||||||
value = [v.strftime('%Y-%m-%d') if isinstance(v, datetime) else v for v in value]
|
value = [v.strftime('%Y-%m-%d') if isinstance(v, datetime) else v for v in value]
|
||||||
if len(value) == 1:
|
if len(value) == 1:
|
||||||
value = value[0]
|
value = value[0]
|
||||||
if value in ('Unknown',):
|
|
||||||
value = None
|
|
||||||
_key = key[3:]
|
_key = key[3:]
|
||||||
if value and _key not in data:
|
if value and _key not in data:
|
||||||
if _key == 'language':
|
|
||||||
value = ox.iso.codeToLang(value)
|
|
||||||
data[_key] = value
|
data[_key] = value
|
||||||
except:
|
except:
|
||||||
logger.debug('FAILED TO PARSE %s', pdf, exc_info=1)
|
logger.debug('FAILED TO PARSE %s', pdf, exc_info=1)
|
||||||
|
|
||||||
'''
|
'''
|
||||||
cmd = ['pdfinfo', pdf]
|
cmd = ['pdfinfo', pdf]
|
||||||
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
|
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
|
||||||
|
@@ -164,7 +158,12 @@ def info(pdf):
|
||||||
for key, value in data.items():
|
for key, value in data.items():
|
||||||
if isinstance(value, dict):
|
if isinstance(value, dict):
|
||||||
value = ' '.join(list(value.values()))
|
value = ' '.join(list(value.values()))
|
||||||
data[key] = value
|
data[key] = value.strip()
|
||||||
|
for key in list(data):
|
||||||
|
if data[key] in ('Unknown',):
|
||||||
|
del data[key]
|
||||||
|
if key == 'language':
|
||||||
|
data[key] = ox.iso.codeToLang(data[key])
|
||||||
text = extract_text(pdf)
|
text = extract_text(pdf)
|
||||||
data['textsize'] = len(text)
|
data['textsize'] = len(text)
|
||||||
if settings.server['extract_text']:
|
if settings.server['extract_text']:
|
||||||
|
|
Loading…
Reference in a new issue