139 lines
4 KiB
Python
139 lines
4 KiB
Python
# -*- coding: utf-8 -*-
|
|
# vi:si:et:sw=4:sts=4:ts=4
|
|
|
|
|
|
import base64
|
|
import hashlib
|
|
import os
|
|
import codecs
|
|
|
|
import ox
|
|
|
|
from . import pdf
|
|
from . import cbr
|
|
from . import epub
|
|
from . import txt
|
|
from . import opf
|
|
|
|
from meta.utils import decode_html_data, strip_tags_data, to_isbn13
|
|
import settings
|
|
|
|
import logging
|
|
logger = logging.getLogger(__name__)
|
|
|
|
def get_id(f=None, data=None):
    """Return the base32-encoded SHA-1 digest used as a media id.

    When ``data`` is truthy, its SHA-1 digest is computed directly.
    Otherwise the hex digest returned by ``ox.sha1sum(f, cached=True)`` is
    converted back to raw bytes first. Either way the digest is base32
    encoded and returned as ``str``.
    """
    if not data:
        # No in-memory payload: hash the file at ``f`` through ox.
        hex_digest = ox.sha1sum(f, cached=True)
        digest = codecs.decode(hex_digest, 'hex')
    else:
        digest = hashlib.sha1(data).digest()
    return base64.b32encode(digest).decode()
|
|
|
|
def metadata(f, from_=None):
    """Collect metadata for the media file at path ``f``.

    The lower-cased file extension selects a format reader (``cbr``,
    ``epub``, ``pdf`` or ``txt``). Values found in a ``metadata.opf`` file
    in the directory of ``from_`` (or of ``f`` when ``from_`` is not given)
    override what the reader reports.

    Args:
        f: Path of the media file. Its size is read with ``os.stat``, so
            the file has to exist.
        from_: Optional path whose directory is searched for
            ``metadata.opf`` instead of the directory of ``f``.

    Returns:
        A dict that starts out with ``extension`` and ``size`` and is then
        filled with whichever of the known keys could be determined. Falsy
        values are dropped before returning, and keys whose type is a list
        in ``settings.config['itemKeys']`` are wrapped in a list.
    """
    ext = f.split('.')[-1].lower()
    data = {}
    data['extension'] = ext
    data['size'] = os.stat(f).st_size

    # Start from an empty result so that an extension without a reader
    # leaves ``info`` defined for the merge loop below.
    info = {}
    try:
        if ext == 'cbr':
            info = cbr.info(f)
        elif ext in ('epub', 'kepub'):
            info = epub.info(f)
            data['extension'] = 'epub'
        elif ext == 'pdf':
            info = pdf.info(f)
        elif ext == 'txt':
            info = txt.info(f)
    except Exception:
        # Best effort: a broken file must not abort the whole import.
        logger.debug('failed to load %s info from %s', ext, f, exc_info=True)
        info = {}

    opf_info = {}
    metadata_opf = os.path.join(os.path.dirname(from_ or f), 'metadata.opf')
    if os.path.exists(metadata_opf):
        opf_info = opf.info(metadata_opf)
    for key in (
        'author',
        'categories',
        'cover',
        'date',
        'description',
        'edition',
        'isbn',
        'language',
        'pages',
        'place',
        'publisher',
        'series',
        'tableofcontents',
        'title',

        'textsize',
    ):
        if key in info:
            value = info[key]
            if isinstance(value, bytes):
                try:
                    value = value.decode('utf-8')
                except UnicodeDecodeError:
                    value = None
            if value:
                # Store the decoded text, not the raw bytes from the reader.
                data[key] = value
        if key in opf_info:
            # metadata.opf takes precedence over the reader's value.
            data[key] = opf_info[key]
        if key in data:
            # Drop NUL characters from strings and from strings in lists.
            if isinstance(data[key], str):
                data[key] = data[key].replace('\x00', '')
            elif isinstance(data[key], list):
                data[key] = [
                    e.replace('\x00', '') if isinstance(e, str) else e
                    for e in data[key]
                ]
    if 'isbn' in data and isinstance(data['isbn'], list):
        # Keep the first entry that converts to ISBN-13, in input order, so
        # the result is deterministic.
        isbn = next(
            (i for i in map(to_isbn13, data['isbn']) if i),
            None,
        )
        if isbn:
            data['isbn'] = isbn
        else:
            del data['isbn']

    if 'author' in data:
        if isinstance(data['author'], str):
            if data['author'].strip():
                data['author'] = data['author'].strip().split('; ')
            else:
                del data['author']
        # These exact single-entry author lists are discarded.
        if 'author' in data and data['author'] in (['Administrator'], ['Default'], ['user']):
            del data['author']
    if 'title' not in data:
        # Fall back to the file name without its extension.
        # NOTE(review): the source lost its indentation; the clean-up below
        # is assumed to apply only to this fallback title -- confirm.
        data['title'] = os.path.splitext(os.path.basename(f))[0]
        if data['title'].startswith('Microsoft Word - '):
            data['title'] = data['title'][len('Microsoft Word - '):]
        for postfix in ('.doc', '.docx', '.qxd', '.indd', '.tex'):
            if data['title'].endswith(postfix):
                data['title'] = data['title'][:-len(postfix)]
        if not data['title'].strip():
            del data['title']
    data = decode_html_data(data)
    data = strip_tags_data(data)
    for key in list(data):
        if not data[key]:
            del data[key]
    for key in [k['id'] for k in settings.config['itemKeys'] if isinstance(k['type'], list)]:
        if key in data and not isinstance(data[key], list):
            data[key] = [data[key]] if data[key] else []
    return data
|
|
|
|
def extract_text(path):
    """Return the plain text of the media file at ``path``.

    The extension is lower-cased before dispatch, so ``BOOK.PDF`` is
    handled like ``book.pdf``. ``epub``/``kepub``, ``pdf`` and ``txt``
    files go to the matching module's ``extract_text``.

    Returns:
        The extracted text, or ``''`` when the extension has no extractor
        or the extractor raised (the failure is logged at debug level).
    """
    ext = path.split('.')[-1].lower()
    text = ''
    try:
        if ext in ('epub', 'kepub'):
            text = epub.extract_text(path)
        elif ext == 'pdf':
            text = pdf.extract_text(path)
        elif ext == 'txt':
            text = txt.extract_text(path)
    except Exception:
        # Best effort: one unreadable file must not abort the caller.
        logger.debug('failed to extract text from %s', path, exc_info=True)
        text = ''
    return text
|