openmedialibrary/oml/media/__init__.py

140 lines
4 KiB
Python
Raw Normal View History

2014-05-12 23:43:27 +00:00
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
2014-09-02 22:32:44 +00:00
2014-05-12 23:43:27 +00:00
import base64
import hashlib
import os
2014-09-02 22:32:44 +00:00
import codecs
2014-05-12 23:43:27 +00:00
import ox
2014-09-02 22:32:44 +00:00
from . import pdf
2015-03-14 07:35:15 +00:00
from . import cbr
2014-09-02 22:32:44 +00:00
from . import epub
from . import txt
from . import opf
2014-05-04 17:26:43 +00:00
from meta.utils import decode_html_data, to_isbn13
2016-01-13 04:41:31 +00:00
import settings
2016-01-08 10:44:18 +00:00
import logging
logger = logging.getLogger(__name__)
2014-05-12 23:43:27 +00:00
def get_id(f=None, data=None):
    """Return the base32-encoded SHA-1 digest identifying some content.

    Exactly one source is hashed: *data* when it is given, otherwise the
    file at path *f*.

    :param f: path of the file to hash; only used when *data* is None.
    :param data: raw bytes to hash in memory.
    :returns: the 20-byte SHA-1 digest, base32-encoded, as ``str``.
    """
    # Compare against None rather than relying on truthiness: empty bytes
    # are valid content (an empty file) and must not fall through to the
    # file branch, where ``f`` would typically be None.
    if data is not None:
        digest = hashlib.sha1(data).digest()
    else:
        # ox.sha1sum() yields a hex string here; turn it back into raw
        # bytes so both branches encode the same kind of value.
        digest = codecs.decode(ox.sha1sum(f, cached=True), 'hex')
    return base64.b32encode(digest).decode()
2014-05-12 23:43:27 +00:00
def metadata(f, from_=None):
    """Collect normalized metadata for the media file at path *f*.

    Format-specific information is read through the ``cbr``, ``epub``,
    ``pdf`` or ``txt`` helper module selected by the file extension. Values
    from a ``metadata.opf`` file located next to ``from_`` (or next to *f*
    when ``from_`` is not given) take precedence over them.

    :param f: path of the media file.
    :param from_: optional path whose directory is searched for
        ``metadata.opf`` instead of the directory of *f*.
    :returns: dict with at least ``extension`` and ``size`` (when non-zero),
        plus whichever of the known metadata keys could be determined.
        Keys with empty values are dropped.
    :raises OSError: if *f* cannot be stat'ed.
    """
    ext = f.split('.')[-1]
    data = {}
    data['extension'] = ext
    data['size'] = os.stat(f).st_size

    # Default to no format info so unsupported extensions do not leave
    # ``info`` unbound for the key loop below.
    info = {}
    try:
        if ext == 'cbr':
            info = cbr.info(f)
        elif ext in ('epub', 'kepub'):
            info = epub.info(f)
            # kepub files are reported as plain epub
            data['extension'] = 'epub'
        elif ext == 'pdf':
            info = pdf.info(f)
        elif ext == 'txt':
            info = txt.info(f)
    except Exception:
        # Best effort: a broken file still gets extension/size/title.
        logger.debug('failed to load %s info from %s', ext, f, exc_info=True)
        info = {}

    opf_info = {}
    metadata_opf = os.path.join(os.path.dirname(from_ or f), 'metadata.opf')
    if os.path.exists(metadata_opf):
        opf_info = opf.info(metadata_opf)

    for key in (
        'author',
        'categories',
        'cover',
        'date',
        'description',
        'edition',
        'isbn',
        'language',
        'pages',
        'place',
        'publisher',
        'series',
        'tableofcontents',
        'title',
        'asin',
        'textsize',
    ):
        if key in info:
            value = info[key]
            if isinstance(value, bytes):
                try:
                    value = value.decode('utf-8')
                except UnicodeDecodeError:
                    value = None
            if value:
                # Store the decoded value, not the raw bytes from info.
                data[key] = value
        # metadata.opf wins over whatever the file itself provided
        if key in opf_info:
            data[key] = opf_info[key]
        if key in data:
            # Strip NUL characters from strings and from strings in lists.
            if isinstance(data[key], str):
                data[key] = data[key].replace('\x00', '')
            elif isinstance(data[key], list):
                data[key] = [
                    e.replace('\x00', '') if isinstance(e, str) else e
                    for e in data[key]
                ]

    if 'isbn' in data and isinstance(data['isbn'], list):
        # Keep the first entry that to_isbn13() accepts; input order makes
        # the choice deterministic.
        isbns = [i for i in (to_isbn13(i) for i in data['isbn']) if i]
        if isbns:
            data['isbn'] = isbns[0]
        else:
            del data['isbn']

    if 'author' in data:
        if isinstance(data['author'], str):
            author = data['author'].strip()
            if author:
                data['author'] = author.split('; ')
            else:
                del data['author']
        # Drop placeholder author names that carry no real information.
        if 'author' in data and data['author'] in (['Administrator'], ['Default'], ['user']):
            del data['author']

    if 'title' not in data:
        # Fall back to the file name without its extension.
        data['title'] = os.path.splitext(os.path.basename(f))[0]
    if data['title'].startswith('Microsoft Word - '):
        data['title'] = data['title'][len('Microsoft Word - '):]
    # Titles sometimes still carry the source document's file extension.
    for postfix in ('.doc', '.docx', '.qxd', '.indd', '.tex'):
        if data['title'].endswith(postfix):
            data['title'] = data['title'][:-len(postfix)]
    if not data['title'].strip():
        del data['title']

    data = decode_html_data(data)

    # Remove every key whose value ended up empty/falsy.
    for key in list(data):
        if not data[key]:
            del data[key]

    # Keys whose configured type is a list must always hold a list. Falsy
    # values were removed above, so a scalar here is always non-empty.
    list_keys = [
        k['id'] for k in settings.config['itemKeys']
        if isinstance(k['type'], list)
    ]
    for key in list_keys:
        if key in data and not isinstance(data[key], list):
            data[key] = [data[key]]
    return data
2016-01-19 15:18:25 +00:00
def extract_text(path):
    """Return the text content of the document at *path*.

    The extractor is chosen by file extension (``epub``/``kepub``, ``pdf``
    or ``txt``). Unsupported extensions and extraction failures both yield
    an empty string; failures are logged at debug level.

    :param path: path of the document.
    :returns: whatever the selected extractor returns, or ``''``.
    """
    ext = path.split('.')[-1]
    text = ''
    try:
        if ext in ('epub', 'kepub'):
            text = epub.extract_text(path)
        elif ext == 'pdf':
            text = pdf.extract_text(path)
        elif ext == 'txt':
            text = txt.extract_text(path)
    except Exception:
        # Best effort, but let KeyboardInterrupt/SystemExit propagate so a
        # long-running extraction can still be interrupted.
        logger.debug('failed to extract text from %s', path, exc_info=True)
        text = ''
    return text