add meta.extract_text
This commit is contained in:
parent
67d75f7154
commit
f43fc6a172
2 changed files with 30 additions and 0 deletions
|
|
@@ -229,6 +229,22 @@ def extract_text(pdf):
|
|||
stdout = ''
|
||||
return stdout.strip()
|
||||
|
||||
|
||||
def pypdf_extract_text(path):
    '''
    Extract the text of every page of a PDF via PdfFileReader.

    slow and bad results

    path is handed unchanged to PdfFileReader. The per-page texts are
    joined with newlines, non-breaking spaces (U+00A0) are replaced by
    plain spaces, and the result is stripped of surrounding whitespace.
    Pages whose extraction raises are skipped, so the returned string may
    be partial or empty.
    '''
    pdf = PdfFileReader(path)
    content = []
    for i in range(pdf.getNumPages()):
        try:
            content.append(pdf.getPage(i).extractText())
        except Exception:
            # Best effort: one unreadable page must not abort the whole
            # document. Catching Exception (not a bare except) lets
            # KeyboardInterrupt / SystemExit propagate.
            continue
    return "\n".join(content).replace("\xa0", " ").strip()
|
||||
|
||||
def extract_isbn(text):
|
||||
isbns = find_isbns(text)
|
||||
if isbns:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue