# -*- coding: utf-8 -*-
import base64
import codecs
import hashlib
import logging
import os

import ox

from . import pdf
from . import cbr
from . import epub
from . import txt
from . import opf
from meta.utils import decode_html_data, strip_tags_data, to_isbn13
import settings

# cbz archives are handled by the cbr module; keep the alias for callers
# that dispatch on extension name.
cbz = cbr

logger = logging.getLogger(__name__)


def get_id(f=None, data=None):
    """Return a base32-encoded SHA1 id for raw bytes ``data`` or file path ``f``.

    If ``data`` is given (and truthy) it is hashed directly; otherwise the
    (cached) sha1 of the file at ``f`` is used.
    """
    if data:
        return base64.b32encode(hashlib.sha1(data).digest()).decode()
    # ox.sha1sum returns a hex digest string; convert to raw bytes before b32.
    return base64.b32encode(codecs.decode(ox.sha1sum(f, cached=True), 'hex')).decode()


def metadata(f, from_=None):
    """Extract and normalize metadata for the media file ``f``.

    Dispatches on the file extension (cbr/cbz, epub/kepub, pdf, txt), merges in
    a sibling Calibre-style ``metadata.opf`` (next to ``from_`` or ``f``) which
    takes precedence, then cleans the values (utf-8 decode, NUL stripping,
    isbn13 normalization, author splitting, title fallback from the filename).

    Returns a dict with at least ``extension`` and ``size``.
    """
    ext = f.split('.')[-1].lower()
    data = {}
    data['extension'] = ext
    data['size'] = os.stat(f).st_size

    try:
        if ext in ('cbr', 'cbz'):
            info = cbr.info(f)
        elif ext in ('epub', 'kepub'):
            info = epub.info(f)
            # kepub is treated as plain epub from here on
            data['extension'] = 'epub'
        elif ext == 'pdf':
            info = pdf.info(f)
        elif ext == 'txt':
            info = txt.info(f)
        else:
            # FIX: the format string had one placeholder but two arguments,
            # which made the logging call itself fail.
            logger.error('unknown extension %s of %s', ext, f)
            info = {}
    except Exception:
        # Best-effort: a broken file must not abort metadata extraction.
        logger.debug('failed to load %s info from %s', ext, f, exc_info=True)
        info = {}

    opf_info = {}
    metadata_opf = os.path.join(os.path.dirname(from_ or f), 'metadata.opf')
    if os.path.exists(metadata_opf):
        opf_info = opf.info(metadata_opf)

    for key in (
        'author', 'categories', 'cover', 'date', 'description', 'edition',
        'isbn', 'language', 'pages', 'place', 'publisher', 'series',
        'tableofcontents', 'title', 'textsize',
    ):
        if key in info:
            value = info[key]
            if isinstance(value, bytes):
                try:
                    value = value.decode('utf-8')
                except UnicodeDecodeError:
                    value = None
            if value:
                # FIX: store the decoded value; the original stored
                # info[key], which kept undecoded bytes and bypassed the
                # NUL stripping below (it only handles str).
                data[key] = value
        # metadata.opf wins over the embedded file info
        if key in opf_info:
            data[key] = opf_info[key]
        if key in data:
            # Strip NUL bytes, which break downstream storage/serialization.
            if isinstance(data[key], str):
                data[key] = data[key].replace('\x00', '')
            elif isinstance(data[key], list):
                data[key] = [
                    e.replace('\x00', '') if isinstance(e, str) else e
                    for e in data[key]
                ]

    # Collapse an isbn list to one normalized isbn13 (or drop it entirely).
    if 'isbn' in data and isinstance(data['isbn'], list):
        isbns = set()
        for i in data['isbn']:
            i = to_isbn13(i)
            if i:
                isbns.add(i)
        if isbns:
            # sorted() makes the pick deterministic (set order is not).
            data['isbn'] = sorted(isbns)[0]
        else:
            del data['isbn']

    # Authors arrive as a single '; '-separated string; split into a list.
    if 'author' in data:
        if isinstance(data['author'], str):
            if data['author'].strip():
                data['author'] = data['author'].strip().split('; ')
            else:
                del data['author']
    # Drop placeholder authors left behind by authoring tools.
    if 'author' in data and data['author'] in (['Administrator'], ['Default'], ['user']):
        del data['author']

    if 'title' not in data:
        # Fall back to the filename (without extension) as title.
        data['title'] = os.path.splitext(os.path.basename(f))[0]
    if data['title'].startswith('Microsoft Word - '):
        data['title'] = data['title'][len('Microsoft Word - '):]
    # FIX: '.docx' had a missing dot ('docx'), which could truncate titles
    # that merely end in those four letters.
    for postfix in ('.doc', '.docx', '.qxd', '.indd', '.tex'):
        if data['title'].endswith(postfix):
            data['title'] = data['title'][:-len(postfix)]
    if not data['title'].strip():
        del data['title']

    data = decode_html_data(data)
    data = strip_tags_data(data)
    # Drop empty values.
    for key in list(data):
        if not data[key]:
            del data[key]

    # Keys configured as list-typed must always hold lists.
    for key in [k['id'] for k in settings.config['itemKeys'] if isinstance(k['type'], list)]:
        if key in data and not isinstance(data[key], list):
            data[key] = [data[key]] if data[key] else []
    return data


def extract_text(path):
    """Return the plain text of ``path``; '' if unsupported or extraction fails."""
    # FIX: lowercase the extension for consistency with metadata()
    # (previously 'FOO.PDF' would silently extract nothing).
    ext = path.split('.')[-1].lower()
    text = ''
    try:
        if ext in ('epub', 'kepub'):
            text = epub.extract_text(path)
        elif ext == 'pdf':
            text = pdf.extract_text(path)
        elif ext == 'txt':
            text = txt.extract_text(path)
    except Exception:
        logger.debug('failed to extract text from %s', path, exc_info=True)
        text = ''
    return text