openmedialibrary/oml/media/__init__.py

# -*- coding: utf-8 -*-


import base64
import hashlib
import os
import codecs

import ox

from . import pdf
from . import cbr
cbz = cbr
from . import epub
from . import txt
from . import opf

from meta.utils import decode_html_data, strip_tags_data, to_isbn13
import settings

import logging
logger = logging.getLogger(__name__)

def get_id(f=None, data=None):
    """Return the base32-encoded SHA-1 digest of a byte string or of a file.

    Args:
        f: Path of the file to hash. Used when ``data`` is not supplied (or
            is empty while a path is given).
        data: Bytes to hash directly.

    Returns:
        The 20-byte SHA-1 digest, base32-encoded and decoded to ``str``.
    """
    # Empty bytes are falsy: without the second condition get_id(data=b'')
    # would fall through and try to hash the file ``None``. When a path is
    # given together with empty data the file is still hashed, as before.
    if data or (f is None and data is not None):
        return base64.b32encode(hashlib.sha1(data).digest()).decode()
    # ox.sha1sum yields a hex digest here (it is hex-decoded below) so that
    # both branches produce the same encoding.
    return base64.b32encode(codecs.decode(ox.sha1sum(f, cached=True), 'hex')).decode()

def metadata(f, from_=None):
    """Collect metadata for the media file ``f``.

    Values are read from the file itself using the parser that matches its
    extension and are then overridden, key by key, by a ``metadata.opf`` file
    found next to ``from_`` (or next to ``f`` when ``from_`` is not given).

    Args:
        f: Path of the media file.
        from_: Optional path whose directory is searched for ``metadata.opf``
            instead of the directory of ``f``.

    Returns:
        A dict holding ``extension``, ``size`` and whichever of the known
        metadata keys could be determined. Entries with an empty/falsy value
        are dropped before returning.

    Raises:
        OSError: if ``f`` cannot be stat'ed (``os.stat`` is not guarded).
    """
    ext = f.split('.')[-1].lower()
    data = {}
    data['extension'] = ext
    data['size'] = os.stat(f).st_size

    # Parsing is best effort: a broken file must not abort the import, so any
    # parser error is logged and treated as "no embedded metadata".
    try:
        if ext in ('cbr', 'cbz'):
            info = cbr.info(f)
        elif ext in ('epub', 'kepub'):
            info = epub.info(f)
            # kepub files are stored under the plain epub extension
            data['extension'] = 'epub'
        elif ext == 'pdf':
            info = pdf.info(f)
        elif ext == 'txt':
            info = txt.info(f)
        else:
            # one placeholder per argument; the previous format string had a
            # single %s for two arguments, which is a logging format error
            logger.error('unknown extension %s: %s', ext, f)
            info = {}
    except Exception:
        logger.debug('failed to load %s info from %s', ext, f, exc_info=True)
        info = {}

    opf_info = {}
    metadata_opf = os.path.join(os.path.dirname(from_ or f), 'metadata.opf')
    if os.path.exists(metadata_opf):
        opf_info = opf.info(metadata_opf)
    for key in (
        'author',
        'categories',
        'cover',
        'date',
        'description',
        'edition',
        'isbn',
        'language',
        'pages',
        'place',
        'publisher',
        'series',
        'tableofcontents',
        'title',

        'textsize',
    ):
        if key in info:
            value = info[key]
            if isinstance(value, bytes):
                try:
                    value = value.decode('utf-8')
                except UnicodeDecodeError:
                    value = None
            if value:
                # store the decoded value, not the raw bytes from the parser
                data[key] = value
        # values from metadata.opf take precedence over embedded metadata
        if key in opf_info:
            data[key] = opf_info[key]
        if key in data:
            # remove NUL characters from strings and from strings in lists
            if isinstance(data[key], str):
                data[key] = data[key].replace('\x00', '')
            elif isinstance(data[key], list):
                data[key] = [e.replace('\x00', '') if isinstance(e, str) else e for e in data[key]]

    # Only a single ISBN-13 is kept: the first entry that converts
    # successfully, in input order, so the result is deterministic.
    if 'isbn' in data and isinstance(data['isbn'], list):
        isbn = None
        for candidate in data['isbn']:
            isbn = to_isbn13(candidate)
            if isbn:
                break
        if isbn:
            data['isbn'] = isbn
        else:
            del data['isbn']

    if 'author' in data:
        if isinstance(data['author'], str):
            if data['author'].strip():
                data['author'] = data['author'].strip().split('; ')
            else:
                del data['author']
    # these author values are treated as placeholders and discarded
    if 'author' in data and data['author'] in (['Administrator'], ['Default'], ['user']):
        del data['author']

    if 'title' not in data:
        # fall back to the file name, minus prefixes/suffixes left behind by
        # the tool that exported the document
        title = os.path.splitext(os.path.basename(f))[0]
        if title.startswith('Microsoft Word - '):
            title = title[len('Microsoft Word - '):]
        for postfix in ('.doc', '.docx', '.qxd', '.indd', '.tex'):
            if title.endswith(postfix):
                title = title[:-len(postfix)]
        if title.strip():
            data['title'] = title

    data = decode_html_data(data)
    data = strip_tags_data(data)
    # drop empty values (e.g. '' or [''] reduced to nothing by the cleanup)
    for key in list(data):
        if not data[key]:
            del data[key]
    # keys whose configured type is a list always carry list values
    for key in [k['id'] for k in settings.config['itemKeys'] if isinstance(k['type'], list)]:
        if key in data and not isinstance(data[key], list):
            data[key] = [data[key]] if data[key] else []
    return data

def extract_text(path):
    """Return the plain text of the document at ``path``.

    The extractor is chosen by file extension (epub/kepub, pdf, txt). The
    extension is compared case-insensitively, matching ``metadata``.

    Args:
        path: Path of the document.

    Returns:
        The extracted text, or ``''`` when the extension is not supported or
        extraction fails (the failure is logged at debug level).
    """
    # lower-case so that e.g. 'Book.PDF' is handled like 'book.pdf'
    ext = path.split('.')[-1].lower()
    text = ''
    # Extraction is best effort: errors are logged and yield empty text.
    try:
        if ext in ('epub', 'kepub'):
            text = epub.extract_text(path)
        elif ext == 'pdf':
            text = pdf.extract_text(path)
        elif ext == 'txt':
            text = txt.extract_text(path)
    except Exception:
        logger.debug('failed to extract text from %s', path, exc_info=True)
        text = ''
    return text
and more... 2014-05-12 23:43:27 +00:00			`# -- coding: utf-8 --`
port to python3 2014-09-02 22:32:44 +00:00
and more... 2014-05-12 23:43:27 +00:00
			`import base64`
			`import hashlib`
			`import os`
port to python3 2014-09-02 22:32:44 +00:00			`import codecs`
and more... 2014-05-12 23:43:27 +00:00
			`import ox`

port to python3 2014-09-02 22:32:44 +00:00			`from . import pdf`
inital cbr support 2015-03-14 07:35:15 +00:00			`from . import cbr`
fix cbz import 2016-02-27 07:31:29 +00:00			`cbz = cbr`
port to python3 2014-09-02 22:32:44 +00:00			`from . import epub`
			`from . import txt`
			`from . import opf`
Open Media Library 2014-05-04 17:26:43 +00:00
strip html tags from book metadata 2016-02-04 09:55:27 +00:00			`from meta.utils import decode_html_data, strip_tags_data, to_isbn13`
avoid [''] 2016-01-13 04:41:31 +00:00			`import settings`
decode html 2016-01-08 10:44:18 +00:00
try not to break if file can not be parsed 2016-01-15 07:33:42 +00:00			`import logging`
			`logger = logging.getLogger(__name__)`

and more... 2014-05-12 23:43:27 +00:00			`def get_id(f=None, data=None):`
			`if data:`
port to python3 2014-09-02 22:32:44 +00:00			`return base64.b32encode(hashlib.sha1(data).digest()).decode()`
and more... 2014-05-12 23:43:27 +00:00			`else:`
port to python3 2014-09-02 22:32:44 +00:00			`return base64.b32encode(codecs.decode(ox.sha1sum(f, cached=True), 'hex')).decode()`
and more... 2014-05-12 23:43:27 +00:00
use metadata.opf or metadata if available 2014-05-27 14:08:14 +00:00			`def metadata(f, from_=None):`
make extension lower case 2016-02-13 10:28:06 +00:00			`ext = f.split('.')[-1].lower()`
Open Media Library 2014-05-04 17:26:43 +00:00			`data = {}`
import/lists/autocompleteFolder 2014-05-18 23:24:04 +00:00			`data['extension'] = ext`
			`data['size'] = os.stat(f).st_size`
use metadata.opf or metadata if available 2014-05-27 14:08:14 +00:00
try not to break if file can not be parsed 2016-01-15 07:33:42 +00:00			`try:`
fix cbz import 2016-02-18 15:05:54 +00:00			`if ext in ('cbr', 'cbz'):`
try not to break if file can not be parsed 2016-01-15 07:33:42 +00:00			`info = cbr.info(f)`
			`elif ext in ('epub', 'kepub'):`
			`info = epub.info(f)`
			`data['extension'] = 'epub'`
			`elif ext == 'pdf':`
			`info = pdf.info(f)`
			`elif ext == 'txt':`
			`info = txt.info(f)`
fix cbz import 2016-02-18 15:05:54 +00:00			`else:`
			`logger.error('unknown extension %s', f, ext)`
			`info = {}`
try not to break if file can not be parsed 2016-01-15 07:33:42 +00:00			`except:`
s/exc_info=1/exc_info=True/g 2016-01-24 09:13:03 +00:00			`logger.debug('failed to load %s info from %s', ext, f, exc_info=True)`
try not to break if file can not be parsed 2016-01-15 07:33:42 +00:00			`info = {}`
Open Media Library 2014-05-04 17:26:43 +00:00
use metadata.opf or metadata if available 2014-05-27 14:08:14 +00:00			`opf_info = {}`
			`metadata_opf = os.path.join(os.path.dirname(from_ or f), 'metadata.opf')`
			`if os.path.exists(metadata_opf):`
			`opf_info = opf.info(metadata_opf)`
extract textsize, take timestamp for changelog entries update peers on peering events 2014-05-19 09:38:41 +00:00			`for key in (`
allow all meta keys from file 2016-01-11 14:29:07 +00:00			`'author',`
			`'categories',`
			`'cover',`
			`'date',`
			`'description',`
			`'edition',`
			`'isbn',`
			`'language',`
			`'pages',`
			`'place',`
			`'publisher',`
			`'series',`
			`'tableofcontents',`
			`'title',`

			`'textsize',`
extract textsize, take timestamp for changelog entries update peers on peering events 2014-05-19 09:38:41 +00:00			`):`
Open Media Library 2014-05-04 17:26:43 +00:00			`if key in info:`
			`value = info[key]`
fix epub metadata parser 2015-12-01 16:20:32 +00:00			`if isinstance(value, bytes):`
Open Media Library 2014-05-04 17:26:43 +00:00			`try:`
			`value = value.decode('utf-8')`
			`except:`
			`value = None`
			`if value:`
			`data[key] = info[key]`
use metadata.opf or metadata if available 2014-05-27 14:08:14 +00:00			`if key in opf_info:`
			`data[key] = opf_info[key]`
scan in tasks queue 2014-05-28 15:36:26 +00:00			`if key in data:`
port to python3 2014-09-02 22:32:44 +00:00			`if isinstance(data[key], str):`
remove null strings from file metadata 2014-05-28 11:36:44 +00:00			`data[key] = data[key].replace('\x00', '')`
			`elif isinstance(data[key], list):`
port to python3 2014-09-02 22:32:44 +00:00			`data[key] = [e.replace('\x00', '') if isinstance(e, str) else e for e in data[key]]`
store metadata per user. remove primaryid. only store isbn13 2016-01-11 13:43:54 +00:00			`if 'isbn' in data and isinstance(data['isbn'], list):`
			`isbns = set()`
			`for i in data['isbn']:`
			`i = to_isbn13(i)`
			`if i:`
			`isbns.add(i)`
			`if isbns:`
			`data['isbn'] = list(isbns)[0]`
			`else:`
			`del data['isbn']`

cleanup meta parser 2014-05-26 08:23:10 +00:00			`if 'author' in data:`
port to python3 2014-09-02 22:32:44 +00:00			`if isinstance(data['author'], str):`
query fixes, avoid empty authors 2014-05-27 18:10:55 +00:00			`if data['author'].strip():`
			`data['author'] = data['author'].strip().split('; ')`
			`else:`
			`del data['author']`
fix empty author 2015-12-24 13:37:36 +00:00			`if 'author' in data and data['author'] in (['Administrator'], ['Default'], ['user']):`
			`del data['author']`
Open Media Library 2014-05-04 17:26:43 +00:00			`if not 'title' in data:`
			`data['title'] = os.path.splitext(os.path.basename(f))[0]`
cleanup meta parser 2014-05-26 08:23:10 +00:00			`if data['title'].startswith('Microsoft Word - '):`
			`data['title'] = data['title'][len('Microsoft Word - '):]`
cache file hash 2014-05-26 09:27:20 +00:00			`for postfix in ('.doc', 'docx', '.qxd', '.indd', '.tex'):`
cleanup meta parser 2014-05-26 08:23:10 +00:00			`if data['title'].endswith(postfix):`
			`data['title'] = data['title'][:-len(postfix)]`
			`if not data['title'].strip():`
			`del data['title']`
decode html 2016-01-08 10:44:18 +00:00			`data = decode_html_data(data)`
strip html tags from book metadata 2016-02-04 09:55:27 +00:00			`data = strip_tags_data(data)`
avoid [''] 2016-01-13 04:41:31 +00:00			`for key in list(data):`
			`if not data[key]:`
			`del data[key]`
			`for key in [k['id'] for k in settings.config['itemKeys'] if isinstance(k['type'], list)]:`
			`if key in data and not isinstance(data[key], list):`
			`data[key] = [data[key]] if data[key] else []`
Open Media Library 2014-05-04 17:26:43 +00:00			`return data`

add meta.extract_text 2016-01-19 15:18:25 +00:00			`def extract_text(path):`
			`ext = path.split('.')[-1]`
			`text = ''`
			`try:`
			`if ext in ('epub', 'kepub'):`
			`text = epub.extract_text(path)`
			`elif ext == 'pdf':`
			`text = pdf.extract_text(path)`
			`elif ext == 'txt':`
			`text = txt.extract_text(path)`
			`except:`
s/exc_info=1/exc_info=True/g 2016-01-24 09:13:03 +00:00			`logger.debug('failed to extract text from %s', path, exc_info=True)`
add meta.extract_text 2016-01-19 15:18:25 +00:00			`text = ''`
			`return text`