2014-05-12 23:43:27 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
# vi:si:et:sw=4:sts=4:ts=4
|
|
|
|
from __future__ import division
|
|
|
|
|
|
|
|
import base64
|
|
|
|
import hashlib
|
|
|
|
import os
|
|
|
|
|
|
|
|
import ox
|
|
|
|
|
2014-05-04 17:26:43 +00:00
|
|
|
import pdf
|
|
|
|
import epub
|
|
|
|
import txt
|
|
|
|
|
2014-05-12 23:43:27 +00:00
|
|
|
def get_id(f=None, data=None):
    """Return the base32-encoded SHA-1 digest used as a file id.

    The digest is computed from ``data`` when it is supplied; otherwise it is
    obtained for the file at path ``f`` through ``ox.sha1sum`` (called with
    ``cached=True``).

    :param f: path of the file to hash; only used when ``data`` is None.
    :param data: raw bytes to hash directly instead of reading ``f``.
    :returns: the output of ``base64.b32encode`` for the 20-byte SHA-1
        digest, i.e. a 32-character value without padding.
    """
    # Compare against None instead of relying on truthiness: empty content
    # (b'') is valid input and must not fall through to hashing the path.
    if data is not None:
        return base64.b32encode(hashlib.sha1(data).digest())
    # ox.sha1sum presumably returns the digest as a hex string (the previous
    # code hex-decoded it with the Python 2 only ``.decode('hex')``).
    # b16decode with casefold=True accepts lower-case hex and works on both
    # Python 2 and Python 3.
    hex_digest = ox.sha1sum(f, cached=True)
    return base64.b32encode(base64.b16decode(hex_digest, casefold=True))
|
2014-05-12 23:43:27 +00:00
|
|
|
|
2014-05-04 17:26:43 +00:00
|
|
|
|
|
|
|
def metadata(f):
    """Collect basic metadata for the book file at path ``f``.

    The result always contains ``extension`` (text after the last ``.`` in
    ``f``) and ``size`` (from ``os.stat``). For pdf, epub and txt files the
    matching parser module's ``info(f)`` is consulted and the keys title,
    author, date, publisher, isbn, textsize and pages are copied over when
    they are present and non-empty. Files with any other extension get no
    parser-derived fields.

    Post-processing applied to the result:

    * ``primaryid`` is set to ``['isbn', <first isbn>]`` when an isbn exists.
    * a string ``author`` is split on ``'; '`` into a list; placeholder
      author lists (Administrator / Default / user) are dropped.
    * a missing ``title`` falls back to the file name without extension; a
      leading ``'Microsoft Word - '`` and trailing document-format suffixes
      are stripped; a title that is blank afterwards is removed.

    :param f: path to the file.
    :returns: dict with the metadata described above.
    :raises OSError: from ``os.stat`` if ``f`` cannot be stat'ed.
    """
    # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    ext = f.split('.')[-1]
    data = {}
    data['extension'] = ext
    data['size'] = os.stat(f).st_size

    # Dispatch case-insensitively so that e.g. "BOOK.PDF" is still parsed;
    # the stored extension keeps its original spelling.
    kind = ext.lower()
    if kind == 'pdf':
        info = pdf.info(f)
    elif kind == 'epub':
        info = epub.info(f)
    elif kind == 'txt':
        info = txt.info(f)
    else:
        # Unknown type: previously ``info`` was left unbound and the loop
        # below raised a NameError. Fall back to "no parser metadata".
        info = {}

    for key in (
        'title', 'author', 'date', 'publisher', 'isbn',
        'textsize', 'pages'
    ):
        if key in info:
            value = info[key]
            # ``bytes`` is ``str`` on Python 2, so this matches the original
            # check there while also being meaningful on Python 3.
            if isinstance(value, bytes):
                try:
                    value = value.decode('utf-8')
                except UnicodeDecodeError:
                    # Byte strings that are not valid UTF-8 are skipped.
                    value = None
            if value:
                # NOTE(review): the raw value is stored, not the decoded one;
                # kept as-is because callers may depend on it -- confirm.
                data[key] = info[key]

    if 'isbn' in data:
        # presumably the parsers return isbn as a list; the first entry is
        # used as the primary id -- verify against pdf/epub/txt.info
        data['primaryid'] = ['isbn', data['isbn'][0]]

    if 'author' in data:
        if isinstance(data['author'], string_types):
            data['author'] = data['author'].split('; ')
        # Generic account names left behind by authoring tools are not
        # useful as authors.
        if data['author'] in (['Administrator'], ['Default'], ['user']):
            del data['author']

    if 'title' not in data:
        data['title'] = os.path.splitext(os.path.basename(f))[0]
    if data['title'].startswith('Microsoft Word - '):
        data['title'] = data['title'][len('Microsoft Word - '):]
    # Titles often carry the source document's file extension. '.docx' needs
    # its leading dot, otherwise "report.docx" would be cut to "report.".
    for postfix in ('.doc', '.docx', '.qxd', '.indd', '.tex'):
        if data['title'].endswith(postfix):
            data['title'] = data['title'][:-len(postfix)]
    if not data['title'].strip():
        del data['title']
    return data
|
|
|
|
|