add meta.extract_text
This commit is contained in:
parent
67d75f7154
commit
f43fc6a172
2 changed files with 30 additions and 0 deletions
oml/media
|
@ -123,3 +123,17 @@ def metadata(f, from_=None):
|
||||||
data[key] = [data[key]] if data[key] else []
|
data[key] = [data[key]] if data[key] else []
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
def extract_text(path):
    '''
    Return the plain text of the file at ``path``.

    The extractor is chosen from the file extension, compared
    case-insensitively: ``epub``/``kepub``, ``pdf`` and ``txt`` are handed
    to the ``extract_text`` function of the matching module. Any other
    extension (or a path without one) yields an empty string.

    Extraction is best-effort: if the extractor raises, the failure is
    logged at debug level and an empty string is returned.
    '''
    # Lowercase so that e.g. 'Book.PDF' is treated like 'book.pdf'.
    ext = path.split('.')[-1].lower()
    text = ''
    try:
        if ext in ('epub', 'kepub'):
            text = epub.extract_text(path)
        elif ext == 'pdf':
            text = pdf.extract_text(path)
        elif ext == 'txt':
            text = txt.extract_text(path)
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate to the caller.
        logger.debug('failed to extract text from %s', path, exc_info=True)
        text = ''
    return text
|
||||||
|
|
|
@ -229,6 +229,22 @@ def extract_text(pdf):
|
||||||
stdout = ''
|
stdout = ''
|
||||||
return stdout.strip()
|
return stdout.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def pypdf_extract_text(path):
    '''
    Extract the text of the PDF at ``path`` page by page via PdfFileReader.

    slow and bad results

    Pages whose extraction raises are skipped (the failure is logged at
    debug level). The remaining page texts are joined with newlines,
    non-breaking spaces are replaced by regular spaces, and surrounding
    whitespace is stripped.
    '''
    import logging

    reader = PdfFileReader(path)
    pages = []
    for i in range(reader.getNumPages()):
        try:
            pages.append(reader.getPage(i).extractText())
        except Exception:
            # Best-effort: one unreadable page must not abort the whole
            # document, but leave a trace instead of failing silently.
            logging.getLogger(__name__).debug(
                'failed to extract text from page %s of %s',
                i, path, exc_info=True)
    return "\n".join(pages).replace("\xa0", " ").strip()
|
||||||
|
|
||||||
def extract_isbn(text):
|
def extract_isbn(text):
|
||||||
isbns = find_isbns(text)
|
isbns = find_isbns(text)
|
||||||
if isbns:
|
if isbns:
|
||||||
|
|
Loading…
Add table
Reference in a new issue