add meta.extract_text

This commit is contained in:
j 2016-01-19 20:48:25 +05:30
commit f43fc6a172
2 changed files with 30 additions and 0 deletions

View file

@ -229,6 +229,22 @@ def extract_text(pdf):
stdout = ''
return stdout.strip()
def pypdf_extract_text(path):
    '''
    Extract the text of a PDF page by page using PdfFileReader.

    Original author's note: slow and bad results.

    path is handed to PdfFileReader unchanged (presumably a filesystem
    path to the PDF -- confirm against callers).

    Extraction is best-effort: a page whose extraction raises is skipped
    and the remaining pages are still processed.

    Returns the text of all successfully extracted pages joined with
    newlines, with non-breaking spaces (\\xa0) replaced by regular spaces
    and leading/trailing whitespace stripped.
    '''
    pdf = PdfFileReader(path)
    content = []
    for i in range(pdf.getNumPages()):
        try:
            extracted_text = pdf.getPage(i).extractText()
        except Exception:
            # Skip pages that fail to extract, but do not swallow
            # KeyboardInterrupt/SystemExit the way a bare except would.
            continue
        content.append(extracted_text)
    return "\n".join(content).replace("\xa0", " ").strip()
def extract_isbn(text):
isbns = find_isbns(text)
if isbns: