From c38d3a8b3523b0fff74d9a4456887f22f5087671 Mon Sep 17 00:00:00 2001 From: j Date: Fri, 1 Feb 2019 17:36:03 +0530 Subject: [PATCH] use pdftotext if available --- oml/media/pdf.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/oml/media/pdf.py b/oml/media/pdf.py index bc342d0..75313b2 100644 --- a/oml/media/pdf.py +++ b/oml/media/pdf.py @@ -237,10 +237,13 @@ def info(pdf): def extract_text(pdf): if sys.platform == 'win32': pdf = get_short_path_name(pdf) + cmd = ['pdftotext', pdf, '-'] if sys.platform == 'darwin': - cmd = ['/usr/bin/mdimport', '-d2', pdf] - else: - cmd = ['pdftotext', pdf, '-'] + pdftotext = ['/usr/local/bin/pdftotext', pdf, '-'] + if os.path.exists(pdftotext[0]): + cmd = pdftotext + else: + cmd = ['/usr/bin/mdimport', '-d2', pdf] if sys.platform == 'win32': startupinfo = subprocess.STARTUPINFO() startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW @@ -251,7 +254,7 @@ def extract_text(pdf): stdout, stderr = p.communicate() stdout = stdout.decode() stderr = stderr.decode() - if sys.platform == 'darwin': + if sys.platform == 'darwin' and cmd[0] == '/usr/bin/mdimport': if 'kMDItemTextContent' in stderr: stdout = stderr.split('kMDItemTextContent = "')[-1].split('\n')[0][:-2] else: