add meta.extract_text

2016-01-19 20:48:25 +05:30 · 2016-01-19 20:48:25 +05:30 · f43fc6a172
commit f43fc6a172
parent 67d75f7154
2 changed files with 30 additions and 0 deletions
--- a/oml/media/init.py
+++ b/oml/media/init.py
@@ -123,3 +123,17 @@ def metadata(f, from_=None):
            data[key] = [data[key]] if data[key] else []
    return data
 def extract_text(path):
    """Return the plain text of the document at ``path``.

    The extractor is chosen from the part of ``path`` after its last
    ``.``, compared case-insensitively:

    * ``epub`` / ``kepub`` -> ``epub.extract_text``
    * ``pdf`` -> ``pdf.extract_text``
    * ``txt`` -> ``txt.extract_text``

    Any other extension (or a path without one) yields ``''``.
    Extraction is best effort: if the chosen extractor raises, the
    failure is logged at debug level and ``''`` is returned.
    """
    # Lower-case so that 'BOOK.PDF' is handled exactly like 'book.pdf'.
    ext = path.split('.')[-1].lower()
    text = ''
    try:
        if ext in ('epub', 'kepub'):
            text = epub.extract_text(path)
        elif ext == 'pdf':
            text = pdf.extract_text(path)
        elif ext == 'txt':
            text = txt.extract_text(path)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate to the caller.
        logger.debug('failed to extract text from %s', path, exc_info=True)
        text = ''
    return text
--- a/oml/media/pdf.py
+++ b/oml/media/pdf.py
@@ -229,6 +229,22 @@ def extract_text(pdf):
            stdout = ''
    return stdout.strip()
 def pypdf_extract_text(path):
    '''
        Extract the text of the PDF at path with PdfFileReader.

        slow and bad results

        Every page is extracted in order and the page texts are joined
        with newlines. A page whose extraction raises is skipped. In the
        joined text each "\\xa0" is replaced by a plain space and
        leading/trailing whitespace is stripped before it is returned.
    '''
    reader = PdfFileReader(path)
    pages = []
    for page_number in range(reader.getNumPages()):
        try:
            pages.append(reader.getPage(page_number).extractText())
        except Exception:
            # Best effort: skip a page that cannot be extracted, but let
            # KeyboardInterrupt / SystemExit through so a slow run can
            # still be aborted.
            continue
    return "\n".join(pages).replace("\xa0", " ").strip()
 def extract_isbn(text):
    isbns = find_isbns(text)
    if isbns: