Open Media Library
This commit is contained in:
commit
2ee2bc178a
228 changed files with 85988 additions and 0 deletions
45
oml/media/__init__.py
Normal file
45
oml/media/__init__.py
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
import pdf
|
||||
import epub
|
||||
import txt
|
||||
import os
|
||||
import base64
|
||||
import ox
|
||||
|
||||
def get_id(f):
    """Return the base32 encoding of the SHA-1 digest computed for ``f``.

    ``ox.sha1sum(f)`` supplies the digest as a hex string (presumably of the
    file's contents -- confirm against ``ox``); it is converted to raw bytes
    before being base32 encoded.
    """
    hex_digest = ox.sha1sum(f)
    raw_digest = hex_digest.decode('hex')
    return base64.b32encode(raw_digest)
|
||||
|
||||
def metadata(f):
    """Collect normalized book metadata for the file ``f``.

    The text after the last ``.`` in ``f`` (compared case-insensitively)
    selects the ``pdf``, ``epub`` or ``txt`` parser.  Any other extension is
    treated as "nothing parsed" instead of failing on an unbound variable.

    Returns a dict that always contains ``title`` (falling back to the base
    name of ``f`` without its extension) and may contain ``author`` (a single
    string is wrapped in a list), ``date``, ``publisher`` and either
    ``isbn10`` or ``isbn13`` together with ``mainid`` naming which of the two
    is present.
    """
    ext = f.split('.')[-1].lower()
    if ext == 'pdf':
        info = pdf.info(f)
    elif ext == 'epub':
        info = epub.info(f)
    elif ext == 'txt':
        info = txt.info(f)
    else:
        # Unsupported type: only the filename-derived title will be returned.
        info = {}

    data = {}
    for key in ('title', 'author', 'date', 'publisher', 'isbn'):
        value = info.get(key)
        if isinstance(value, str):
            try:
                value = value.decode('utf-8')
            except UnicodeDecodeError:
                # Byte strings that are not valid UTF-8 are dropped.
                value = None
        if value:
            # Keep the decoded value rather than the raw bytes from the parser.
            data[key] = value

    if 'isbn' in data:
        value = data.pop('isbn')
        # Exactly ten characters means ISBN-10; anything else is filed as ISBN-13.
        if len(value) == 10:
            data['isbn10'] = value
            data['mainid'] = 'isbn10'
        else:
            data['isbn13'] = value
            data['mainid'] = 'isbn13'
    if 'title' not in data:
        data['title'] = os.path.splitext(os.path.basename(f))[0]
    if 'author' in data and isinstance(data['author'], basestring):
        data['author'] = [data['author']]
    return data
|
||||
|
||||
63
oml/media/epub.py
Normal file
63
oml/media/epub.py
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from __future__ import division
|
||||
|
||||
import sys
|
||||
import xml.etree.ElementTree as ET
|
||||
import zipfile
|
||||
from StringIO import StringIO
|
||||
|
||||
import Image
|
||||
import stdnum.isbn
|
||||
|
||||
from utils import normalize_isbn, find_isbns
|
||||
|
||||
def cover(path):
    """Return the JPEG data of a newly created 80x128 RGB image.

    ``path`` is accepted but never read: nothing is extracted from the EPUB,
    the result is a generated placeholder.
    """
    buf = StringIO()
    Image.new('RGB', (80, 128)).save(buf, format='jpeg')
    jpeg = buf.getvalue()
    buf.close()
    return jpeg
|
||||
|
||||
def info(epub):
    """Collect metadata for the EPUB file at ``epub``.

    The first archive member whose name ends in ``opf`` is parsed as XML.
    Every child of its OPF ``metadata`` element that has text is copied into
    the result, keyed by the tag name without its namespace (``creator`` is
    stored as ``author``).  An ``identifier`` is kept only when it normalizes
    to a valid ISBN, in which case it is stored as ``isbn``.

    ``textsize`` is the length of the text returned by ``extract_text``; when
    the metadata supplied no ISBN, the first ISBN found in that text is used.

    Returns a dict.  Errors from ``zipfile`` or the XML parser propagate.
    """
    data = {}
    z = zipfile.ZipFile(epub)
    try:
        opf = [f.filename for f in z.filelist if f.filename.endswith('opf')]
        if opf:
            package = ET.fromstring(z.read(opf[0]))
            found = package.findall('{http://www.idpf.org/2007/opf}metadata')
            # A package without a metadata element simply contributes nothing.
            children = list(found[0]) if found else []
            for e in children:
                if not e.text:
                    continue
                # Tags look like '{namespace}name'; keep only the local name.
                key = e.tag.split('}')[-1]
                key = {
                    'creator': 'author',
                }.get(key, key)
                if key == 'identifier':
                    value = normalize_isbn(e.text)
                    if stdnum.isbn.is_valid(value):
                        data['isbn'] = value
                else:
                    data[key] = e.text
    finally:
        # Release the archive handle even if parsing fails.
        z.close()
    text = extract_text(epub)
    data['textsize'] = len(text)
    if 'isbn' not in data:
        isbn = extract_isbn(text)
        if isbn:
            data['isbn'] = isbn
    return data
|
||||
|
||||
def extract_text(path):
    """Return the concatenated raw contents of all ``*html`` archive members.

    ``path`` is opened as a zip archive; members whose name ends in ``html``
    are read in archive order and joined without a separator.  No markup is
    stripped.  Returns an empty string when there is no such member.
    """
    z = zipfile.ZipFile(path)
    try:
        parts = [
            z.read(f.filename)
            for f in z.filelist
            if f.filename.endswith('html')
        ]
    finally:
        # Close the archive even if reading a member fails.
        z.close()
    return ''.join(parts)
|
||||
|
||||
def extract_isbn(data):
    """Return the first entry ``find_isbns`` yields for ``data``, else ``None``."""
    found = find_isbns(data)
    return found[0] if found else None
|
||||
|
||||
140
oml/media/pdf.py
Normal file
140
oml/media/pdf.py
Normal file
|
|
@ -0,0 +1,140 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from __future__ import division
|
||||
|
||||
import sys
|
||||
import tempfile
|
||||
import subprocess
|
||||
import os
|
||||
import shutil
|
||||
from glob import glob
|
||||
|
||||
from pyPdf import PdfFileReader
|
||||
import stdnum.isbn
|
||||
|
||||
import settings
|
||||
from utils import normalize_isbn, find_isbns
|
||||
|
||||
def cover(pdf):
    """Return cover image data for ``pdf``.

    On darwin the result of ``ql_cover`` is used; on every other platform the
    first page is rendered through ``page``.
    """
    if sys.platform != 'darwin':
        return page(pdf, 1)
    return ql_cover(pdf)
|
||||
|
||||
def ql_cover(pdf):
    """Return the bytes of the thumbnail ``qlmanage`` generates for ``pdf``.

    ``qlmanage -t -s 1024 -o <tmpdir> <pdf>`` is run with a fresh temporary
    directory as output location, and the first file found in that directory
    afterwards is read in binary mode.

    The temporary directory is always removed, including when ``qlmanage``
    cannot be started or produced nothing.

    Raises ``IndexError`` if the output directory is empty after the run and
    ``OSError`` if ``qlmanage`` cannot be executed.
    """
    tmp = tempfile.mkdtemp()
    try:
        cmd = [
            'qlmanage',
            '-t',
            '-s',
            '1024',
            '-o',
            tmp,
            pdf
        ]
        p = subprocess.Popen(cmd)
        p.wait()
        image = glob('%s/*' % tmp)[0]
        with open(image, 'rb') as fd:
            return fd.read()
    finally:
        shutil.rmtree(tmp)
|
||||
|
||||
|
||||
def page(pdf, page):
    """Render one page of ``pdf`` to JPEG with Ghostscript and return its bytes.

    ``page`` is the page number passed to ``gs`` as both ``-dFirstPage`` and
    ``-dLastPage``; output is rendered at 72 dpi (``-r72``) into a temporary
    ``.jpg`` file, which is read back and then deleted.

    The descriptor returned by ``mkstemp`` is closed immediately (``gs``
    writes to the file by name) and the temporary file is removed even when
    ``gs`` cannot be started or the result cannot be read.
    """
    fd, image = tempfile.mkstemp('.jpg')
    # Only the path is needed; keeping the descriptor open would leak it.
    os.close(fd)
    try:
        cmd = [
            'gs', '-q',
            '-dBATCH', '-dSAFER', '-dNOPAUSE', '-dNOPROMPT',
            '-dMaxBitmap=500000000',
            '-dAlignToPixels=0', '-dGridFitTT=2',
            '-sDEVICE=jpeg', '-dTextAlphaBits=4', '-dGraphicsAlphaBits=4',
            '-r72',
            '-dUseCropBox',
            '-dFirstPage=%d' % page,
            '-dLastPage=%d' % page,
            '-sOutputFile=%s' % image,
            pdf
        ]
        p = subprocess.Popen(cmd)
        p.wait()
        with open(image, 'rb') as rendered:
            return rendered.read()
    finally:
        os.unlink(image)
|
||||
|
||||
def info(pdf):
    """Collect metadata for the PDF file at ``pdf``.

    Sources, in order of precedence:

    * the document info dictionary from ``PdfFileReader``; each key is stored
      lower-cased with its first character removed (presumably the leading
      ``/`` of PDF names -- confirm against pyPdf);
    * XMP attributes whose name starts with ``dc_``, stored without that
      prefix and only when the key is not already set.  Dict values
      contribute their ``x-default`` entry, list values are stripped and
      emptied of blank items.

    Parse failures are reported on the console and whatever was gathered up
    to that point is kept.  An ``identifier`` that normalizes to a valid ISBN
    is moved to ``isbn``.  ``textsize`` is the length of the extracted text;
    when ``settings.server['extract_text']`` is true and no ISBN is known
    yet, the first ISBN found in the text is used.

    Returns a dict.
    """
    data = {}
    with open(pdf, 'rb') as fd:
        try:
            pdfreader = PdfFileReader(fd)
            docinfo = pdfreader.getDocumentInfo()
            if docinfo:
                for key in docinfo:
                    if docinfo[key]:
                        data[key[1:].lower()] = docinfo[key]
            xmp = pdfreader.getXmpMetadata()
            if xmp:
                for key in dir(xmp):
                    if key.startswith('dc_'):
                        value = getattr(xmp, key)
                        if isinstance(value, dict) and 'x-default' in value:
                            value = value['x-default']
                        elif isinstance(value, list):
                            value = [v.strip() for v in value if v.strip()]
                        _key = key[3:]
                        if value and _key not in data:
                            data[_key] = value
        except Exception:
            # Best effort: a broken PDF must not abort the import, but
            # KeyboardInterrupt/SystemExit are allowed to propagate.
            print('FAILED TO PARSE %s' % pdf)
            import traceback
            traceback.print_exc()

    if 'identifier' in data:
        value = normalize_isbn(data['identifier'])
        if stdnum.isbn.is_valid(value):
            data['isbn'] = value
            del data['identifier']
    # NOTE: disabled alternative that parses the output of `pdfinfo`:
    #
    # cmd = ['pdfinfo', pdf]
    # p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # stdout, stderr = p.communicate()
    # for line in stdout.strip().split('\n'):
    #     parts = line.split(':')
    #     key = parts[0].lower().strip()
    #     if key:
    #         data[key] = ':'.join(parts[1:]).strip()
    # for key in data.keys():
    #     if not data[key]:
    #         del data[key]
    text = extract_text(pdf)
    data['textsize'] = len(text)
    if settings.server['extract_text']:
        if 'isbn' not in data:
            isbn = extract_isbn(text)
            if isbn:
                data['isbn'] = isbn
    return data
|
||||
|
||||
'''
|
||||
# possible alternative with gs
|
||||
tmp = tempfile.mkstemp('.txt')[1]
|
||||
cmd = ['gs', '-dBATCH', '-dNOPAUSE', '-sDEVICE=txtwrite', '-dFirstPage=3', '-dLastPage=5', '-sOutputFile=%s'%tmp, pdf]
|
||||
|
||||
'''
|
||||
def extract_text(pdf):
    """Return the text content of ``pdf`` as reported by a platform tool.

    On darwin ``/usr/bin/mdimport -d2 <pdf>`` is run and the text is cut out
    of its stderr output: everything after the last
    ``kMDItemTextContent = "`` up to the end of that line, minus the final
    two characters.  On other platforms the stdout of ``pdftotext <pdf> -``
    is used.  The result is stripped of surrounding whitespace.

    Raises ``OSError`` if the external program cannot be executed.
    """
    on_darwin = sys.platform == 'darwin'
    if on_darwin:
        # The executable and its flag must be separate list items; adjacent
        # string literals without a comma would be merged into one bogus path.
        cmd = ['/usr/bin/mdimport', '-d2', pdf]
    else:
        cmd = ['pdftotext', pdf, '-']
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if on_darwin:
        stdout = stderr.split('kMDItemTextContent = "')[-1].split('\n')[0][:-2]
    return stdout.strip()
|
||||
|
||||
def extract_isbn(text):
    """Return the first ISBN ``find_isbns`` reports for ``text``, or ``None``."""
    candidates = find_isbns(text)
    if not candidates:
        return None
    return candidates[0]
|
||||
41
oml/media/txt.py
Normal file
41
oml/media/txt.py
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from __future__ import division
|
||||
|
||||
import sys
|
||||
import os
|
||||
from utils import find_isbns
|
||||
from StringIO import StringIO
|
||||
import Image
|
||||
|
||||
from pdf import ql_cover
|
||||
|
||||
def cover(path):
    """Return cover image data for the text file at ``path``.

    On darwin this is whatever ``ql_cover`` returns for the file.  Elsewhere
    ``path`` is not read and the JPEG data of a newly created 80x128 RGB
    image is returned as a placeholder.
    """
    if sys.platform != 'darwin':
        buf = StringIO()
        Image.new('RGB', (80, 128)).save(buf, format='jpeg')
        jpeg = buf.getvalue()
        buf.close()
        return jpeg
    return ql_cover(path)
|
||||
|
||||
def info(path):
    """Build the metadata dict for the text file at ``path``.

    ``title`` is the file's base name without extension, ``textsize`` the
    length of the file's content, and ``isbn`` -- only present when one was
    found -- the first ISBN detected in that content.
    """
    title = os.path.splitext(os.path.basename(path))[0]
    text = extract_text(path)
    isbn = extract_isbn(text)

    data = {'title': title}
    if isbn:
        data['isbn'] = isbn
    data['textsize'] = len(text)
    return data
|
||||
|
||||
def extract_text(path):
    """Return the entire content of the file at ``path``, read in default mode."""
    with open(path) as source:
        return source.read()
|
||||
|
||||
def extract_isbn(text):
    """Return the first ISBN found in ``text`` by ``find_isbns``, else ``None``."""
    matches = find_isbns(text)
    return matches[0] if matches else None
|
||||
Loading…
Add table
Add a link
Reference in a new issue