add meta.extract_text

This commit is contained in:
j 2016-01-19 20:48:25 +05:30
parent 67d75f7154
commit f43fc6a172
2 changed files with 30 additions and 0 deletions

View file

@ -123,3 +123,17 @@ def metadata(f, from_=None):
data[key] = [data[key]] if data[key] else []
return data
def extract_text(path):
    '''
    Extract plain text from the file at ``path``.

    The extractor is selected by the file extension (everything after the
    last ``.``, compared case-insensitively):

    - ``epub`` / ``kepub`` -> ``epub.extract_text``
    - ``pdf`` -> ``pdf.extract_text``
    - ``txt`` -> ``txt.extract_text``

    Returns whatever the selected extractor returns, or ``''`` when the
    extension is not handled or the extractor raised an exception (the
    failure is logged at debug level together with the traceback).
    '''
    # Lower-case the extension so "BOOK.PDF" is dispatched like "book.pdf".
    ext = path.rsplit('.', 1)[-1].lower()
    try:
        if ext in ('epub', 'kepub'):
            return epub.extract_text(path)
        if ext == 'pdf':
            return pdf.extract_text(path)
        if ext == 'txt':
            return txt.extract_text(path)
    except Exception:
        # Best effort: a broken file must not abort the caller. Catching
        # Exception instead of using a bare except lets KeyboardInterrupt
        # and SystemExit propagate.
        logger.debug('failed to extract text from %s', path, exc_info=True)
    return ''

View file

@ -229,6 +229,22 @@ def extract_text(pdf):
stdout = ''
return stdout.strip()
def pypdf_extract_text(path):
    '''
    Extract the text of every page of the PDF at ``path`` with
    ``PdfFileReader``.

    Pages whose text extraction raises an exception are skipped. The
    remaining page texts are joined with newlines, non-breaking spaces
    (``\\xa0``) are replaced by regular spaces and the result is stripped.

    Original author's note: slow and bad results.
    '''
    reader = PdfFileReader(path)
    pages = []
    for index in range(reader.getNumPages()):
        try:
            pages.append(reader.getPage(index).extractText())
        except Exception:
            # Best effort: a page that cannot be extracted is left out.
            # Catching Exception rather than using a bare except keeps
            # KeyboardInterrupt/SystemExit able to stop this slow loop.
            continue
    return "\n".join(pages).replace("\xa0", " ").strip()
def extract_isbn(text):
isbns = find_isbns(text)
if isbns: