use pdftotext if available

This commit is contained in:
j 2019-02-01 17:36:03 +05:30
parent debe3823e2
commit c38d3a8b35

View file

@@ -237,10 +237,13 @@ def info(pdf):
def extract_text(pdf): def extract_text(pdf):
if sys.platform == 'win32': if sys.platform == 'win32':
pdf = get_short_path_name(pdf) pdf = get_short_path_name(pdf)
if sys.platform == 'darwin':
cmd = ['/usr/bin/mdimport', '-d2', pdf]
else:
cmd = ['pdftotext', pdf, '-'] cmd = ['pdftotext', pdf, '-']
if sys.platform == 'darwin':
pdftotext = ['/usr/local/bin/pdftotext', pdf, '-']
if os.path.exists(pdftotext[0]):
cmd = pdftotext
else:
cmd = ['/usr/bin/mdimport', '-d2', pdf]
if sys.platform == 'win32': if sys.platform == 'win32':
startupinfo = subprocess.STARTUPINFO() startupinfo = subprocess.STARTUPINFO()
startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
@@ -251,7 +254,7 @@ def extract_text(pdf):
stdout, stderr = p.communicate() stdout, stderr = p.communicate()
stdout = stdout.decode() stdout = stdout.decode()
stderr = stderr.decode() stderr = stderr.decode()
if sys.platform == 'darwin': if sys.platform == 'darwin' and cmd[0] == '/usr/bin/mdimport':
if 'kMDItemTextContent' in stderr: if 'kMDItemTextContent' in stderr:
stdout = stderr.split('kMDItemTextContent = "')[-1].split('\n')[0][:-2] stdout = stderr.split('kMDItemTextContent = "')[-1].split('\n')[0][:-2]
else: else: