Open Media Library

2014-05-04 19:26:43 +02:00 · 2014-05-04 19:26:43 +02:00 · 2ee2bc178a
commit 2ee2bc178a
228 changed files with 85988 additions and 0 deletions
--- a/oml/media/epub.py
+++ b/oml/media/epub.py
@ -0,0 +1,63 @@
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=4:sts=4:ts=4
+from __future__ import division
+
+import sys
+import xml.etree.ElementTree as ET
+import zipfile
+from StringIO import StringIO
+
+import Image
+import stdnum.isbn
+
+from utils import normalize_isbn, find_isbns
+
+def cover(path):
+    img = Image.new('RGB', (80, 128))
+    o = StringIO()
+    img.save(o, format='jpeg')
+    data = o.getvalue()
+    o.close()
+    return data
+
+def info(epub):
+    data = {}
+    z = zipfile.ZipFile(epub)
+    opf = [f.filename for f in z.filelist if f.filename.endswith('opf')]
+    if opf:
+        info = ET.fromstring(z.read(opf[0]))
+        metadata = info.findall('{http://www.idpf.org/2007/opf}metadata')[0]
+        for e in metadata.getchildren():
+            if e.text:
+                key = e.tag.split('}')[-1]
+                key = {
+                    'creator': 'author',
+                }.get(key, key)
+                value = e.text
+                if key == 'identifier':
+                    value = normalize_isbn(value)
+                    if stdnum.isbn.is_valid(value):
+                        data['isbn'] = value
+                else:
+                    data[key] = e.text
+    text = extract_text(epub)
+    data['textsize'] = len(text)
+    if not 'isbn' in data:
+        isbn = extract_isbn(text)
+        if isbn:
+            data['isbn'] = isbn
+    return data
+
+def extract_text(path):
+    data = ''
+    z = zipfile.ZipFile(path)
+    for f in z.filelist:
+        if f.filename.endswith('html'):
+            data += z.read(f.filename)
+    return data
+
+def extract_isbn(data):
+    isbns = find_isbns(data)
+    if isbns:
+        return isbns[0]
+