minimal support for epub documents
This commit is contained in:
parent
68ad3562c4
commit
3c69c0c101
50 changed files with 30209 additions and 10 deletions
189
pandora/document/epub.py
Normal file
189
pandora/document/epub.py
Normal file
|
|
@ -0,0 +1,189 @@
|
|||
import os
|
||||
import xml.etree.ElementTree as ET
|
||||
import zipfile
|
||||
import re
|
||||
from urllib.parse import unquote
|
||||
import lxml.html
|
||||
from io import BytesIO
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from ox import strip_tags, decode_html, normalize_name
|
||||
|
||||
import logging
|
||||
logging.getLogger('PIL').setLevel(logging.ERROR)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_ratio(data):
    """Return the width/height ratio of an encoded image.

    data: raw bytes of an image file.

    Returns the ratio as a float, or -1 if the bytes cannot be opened as an
    image or the ratio cannot be computed (for example a height of zero).
    """
    try:
        # The context manager releases the decoder's resources right away.
        with Image.open(BytesIO(data)) as img:
            width, height = img.size
        return width / height
    except Exception:
        # Best effort: any failure to decode means "no usable ratio".
        # Exception (rather than a bare except) keeps KeyboardInterrupt and
        # SystemExit propagating.
        return -1
|
||||
|
||||
|
||||
def normpath(path):
    """Normalize *path* and return it with '/' as the separator.

    The path is collapsed with os.path.normpath ('.', '..' and duplicate
    separators are resolved) and every os.sep in the result is replaced by
    '/', so the value can be compared with member names of a zip archive.
    """
    return os.path.normpath(path).replace(os.sep, '/')
|
||||
|
||||
|
||||
def cover(path):
    """Return the raw bytes of the cover image of an epub file, or None.

    path: path of the epub (zip) file.

    Candidates are tried in this order; the first candidate found decides
    the result, even if it is then rejected:

    1. the first archive member whose lower-cased name contains 'cover' and
       whose extension is jpg, jpeg or png ('calibre-logo.png' is skipped),
    2. the manifest item referenced by <meta name="cover"> in the OPF,
    3. the manifest image with the largest ZipInfo.file_size,
    4. the first <img src>/<image href> found in an HTML manifest item.

    A candidate is rejected (None is returned) if it cannot be read or if
    its width/height ratio is below 0.3 or above 2. None is also returned
    for an invalid zip file or an OPF that cannot be parsed.
    """
    logger.debug('cover %s', path)
    try:
        z = zipfile.ZipFile(path)
    except zipfile.BadZipFile:
        logger.debug('invalid epub file %s', path)
        return None

    def use(filename):
        # Read one archive member and accept it only if it looks like a cover.
        logger.debug('using %s', filename)
        try:
            data = z.read(filename)
        except Exception:
            logger.debug('failed to read %s', filename, exc_info=True)
            return None
        r = get_ratio(data)
        # get_ratio returns -1 for undecodable data, which is rejected here too.
        if r < 0.3 or r > 2:
            return None
        return data

    opf_ns = '{http://www.idpf.org/2007/opf}'
    img_re = re.compile('<img.*?src="(.*?)"')
    # svg image
    svg_re = re.compile('<image.*?href="(.*?)"')

    # Close the archive on every exit path.
    with z:
        files = []
        for f in z.filelist:
            if f.filename == 'calibre-logo.png':
                continue
            if 'cover' in f.filename.lower() and f.filename.split('.')[-1] in ('jpg', 'jpeg', 'png'):
                return use(f.filename)
            files.append(f.filename)

        opf = [f for f in files if f.endswith('opf')]
        if not opf:
            return None
        try:
            package = ET.fromstring(z.read(opf[0]))
        except ET.ParseError:
            logger.debug('failed to parse %s', opf[0], exc_info=True)
            return None
        opf_dir = os.path.dirname(opf[0])

        def resolve(href):
            # hrefs are URL-quoted and relative to the directory of the OPF.
            return normpath(os.path.join(opf_dir, unquote(href)))

        # Compare with None explicitly: the truth value of an Element depends
        # on its number of children and testing it is deprecated.
        metadata = package.find(opf_ns + 'metadata')
        manifest = package.find(opf_ns + 'manifest')
        if manifest is None:
            return None

        if metadata is not None:
            for meta in metadata:
                if meta.tag != opf_ns + 'meta' or meta.attrib.get('name') != 'cover':
                    continue
                cover_id = meta.attrib.get('content')
                if cover_id is None:
                    continue
                for item in manifest:
                    if item.attrib.get('id') == cover_id and 'href' in item.attrib:
                        filename = resolve(item.attrib['href'])
                        if filename in files:
                            return use(filename)

        image_files = []
        for item in manifest:
            if 'image' in item.attrib.get('media-type', '') and 'href' in item.attrib:
                filename = resolve(item.attrib['href'])
                if filename in files:
                    image_files.append(filename)
        if image_files:
            # Stable sort + last element, as before: the largest image wins,
            # the later one on ties.
            image_files.sort(key=lambda name: z.getinfo(name).file_size)
            return use(image_files[-1])

        for item in manifest:
            if 'html' not in item.attrib.get('media-type', '') or 'href' not in item.attrib:
                continue
            filename = resolve(item.attrib['href'])
            if filename not in files:
                # The manifest references a document that is not in the archive.
                continue
            html = z.read(filename).decode('utf-8', 'ignore')
            img = img_re.findall(html) + svg_re.findall(html)
            if img:
                # Image references are relative to the document containing them.
                img = normpath(os.path.join(os.path.dirname(filename), unquote(img[0])))
                if img in files:
                    return use(img)
    return None
|
||||
|
||||
|
||||
def info(epub):
    """Collect metadata of an epub file into a dict.

    epub: path of the epub (zip) file.

    Returns a dict that may contain:

    - one entry per non-empty child of the OPF <metadata> element, keyed by
      the tag name without namespace ('creator' is stored as 'author' and is
      a list, 'identifier' is stored as 'isbn'),
    - 'tableofcontents', taken from toc.ncx or, failing that, from the
      document the OPF <guide> references as 'toc',
    - 'textsize', the length of the text returned by extract_text().

    'description' is stripped of HTML, 'date' is cut at 'T', and string
    values that are empty after stripping are removed. An empty dict is
    returned if the file is not a valid zip archive.
    """
    data = {}
    try:
        z = zipfile.ZipFile(epub)
    except zipfile.BadZipFile:
        logger.debug('invalid epub file %s', epub)
        return data

    opf_ns = '{http://www.idpf.org/2007/opf}'
    ncx_ns = '{http://www.daisy.org/z3986/2005/ncx/}'

    # Close the archive once all members have been read.
    with z:
        files = z.namelist()
        opf = [f for f in files if f.endswith('opf')]
        package = None
        if opf:
            try:
                package = ET.fromstring(z.read(opf[0]))
            except ET.ParseError:
                logger.debug('failed to parse %s', opf[0], exc_info=True)

        if package is not None:
            metadata = package.find(opf_ns + 'metadata')
            if metadata is not None:
                for e in metadata:
                    text = e.text
                    if not (text and text.strip()) or text in ('unknown', 'none'):
                        continue
                    key = e.tag.split('}')[-1]
                    if key == 'creator':
                        key = 'author'
                    value = text.strip()
                    if key == 'identifier':
                        data['isbn'] = value
                    elif key == 'author':
                        authors = value.split(', ')
                        # Exactly two single-word parts are treated as one
                        # "Last, First" name rather than as two authors.
                        if len(authors) == 2 and max(len(a.split(' ')) for a in authors) == 1:
                            authors = [normalize_name(', '.join(authors))]
                        data[key] = authors
                    else:
                        data[key] = value

            toc = [f for f in files if 'toc.ncx' in f]
            if toc:
                try:
                    ncx = ET.fromstring(z.read(toc[0]))
                    nav_map = ncx.find(ncx_ns + 'navMap')
                except Exception:
                    logger.debug('failed to parse toc', exc_info=True)
                    nav_map = None
                # Explicit None/len checks: the truth value of an Element
                # depends on its children and testing it is deprecated.
                if nav_map is not None:
                    contents = []
                    for point in nav_map.findall(ncx_ns + 'navPoint'):
                        label = point.find(ncx_ns + 'navLabel')
                        if label is not None and len(label):
                            txt = label[0].text
                            if txt:
                                contents.append(txt)
                    if contents:
                        data['tableofcontents'] = '\n'.join(contents).strip()

            if 'tableofcontents' not in data:
                guide = package.find(opf_ns + 'guide')
                if guide is not None:
                    for ref in guide.findall(opf_ns + 'reference'):
                        if ref.attrib.get('type') != 'toc' or 'href' not in ref.attrib:
                            continue
                        # Drop the fragment; hrefs are relative to the OPF.
                        filename = unquote(ref.attrib['href']).split('#')[0]
                        filename = normpath(os.path.join(os.path.dirname(opf[0]), filename))
                        if filename in files:
                            toc_html = z.read(filename)
                            if toc_html:
                                doc = lxml.html.document_fromstring(toc_html)
                                data['tableofcontents'] = '\n'.join(
                                    [a.text_content() for a in doc.xpath('//a')]
                                ).strip()

    if 'description' in data:
        data['description'] = strip_tags(decode_html(data['description']))
    text = extract_text(epub)
    data['textsize'] = len(text)
    if 'date' in data and 'T' in data['date']:
        data['date'] = data['date'].split('T')[0]
    if 'language' in data and isinstance(data['language'], str):
        # NOTE(review): get_language is neither imported nor defined in the
        # visible part of this module; confirm where it should come from,
        # otherwise this line raises NameError for any epub with a language.
        data['language'] = get_language(data['language'])
    for key in list(data):
        if isinstance(data[key], str) and not data[key].strip():
            del data[key]
    return data
|
||||
|
||||
def extract_text(path):
    """Return the concatenated content of the html/xml members of an epub.

    path: path of the epub (zip) file, or anything zipfile.ZipFile accepts.

    Members are read in archive order and decoded as UTF-8, ignoring
    undecodable bytes. Markup is not removed. Skipped are: members named
    '._*' (in any directory), members with 'META-INF' in their name, and
    members whose extension is not html, xml or htm.

    Raises zipfile.BadZipFile if the file is not a valid zip archive.
    """
    # NOTE(review): '.xhtml' members are not included; confirm whether that
    # is intended.
    parts = []
    with zipfile.ZipFile(path) as z:
        for f in z.filelist:
            name = f.filename
            if '/._' in name or name.startswith('._'):
                continue
            if 'META-INF' in name:
                continue
            if name.split('.')[-1] in ('html', 'xml', 'htm'):
                parts.append(z.read(name).decode('utf-8', 'ignore'))
    # Join once instead of growing a string member by member.
    return ''.join(parts)
|
||||
|
||||
|
|
@ -5,6 +5,8 @@ import tempfile
|
|||
|
||||
from django.conf import settings
|
||||
|
||||
from . import epub
|
||||
|
||||
|
||||
logger = logging.getLogger('pandora.' + __name__)
|
||||
|
||||
|
|
@ -55,6 +57,8 @@ class FulltextMixin:
|
|||
if self.file:
|
||||
if self.extension == 'pdf':
|
||||
return extract_text(self.file.path)
|
||||
elif self.extension == 'epub':
|
||||
return epub.extract_text(self.file.path)
|
||||
elif self.extension in IMAGE_EXTENSIONS:
|
||||
return ocr_image(self.file.path)
|
||||
elif self.extension in CONVERT_EXTENSIONS:
|
||||
|
|
@ -184,6 +188,9 @@ class FulltextPageMixin(FulltextMixin):
|
|||
if self.document.file:
|
||||
if self.document.extension == 'pdf':
|
||||
return extract_text(self.document.file.path, self.page)
|
||||
elif self.extension == 'epub':
|
||||
# FIXME: is there a nice way to split that into pages
|
||||
return epub.extract_text(self.file.path)
|
||||
elif self.extension in IMAGE_EXTENSIONS:
|
||||
return ocr_image(self.document.file.path)
|
||||
elif self.extension == 'html':
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ from user.utils import update_groups
|
|||
from . import managers
|
||||
from . import utils
|
||||
from . import tasks
|
||||
from . import epub
|
||||
from .fulltext import FulltextMixin, FulltextPageMixin
|
||||
|
||||
User = get_user_model()
|
||||
|
|
@ -174,13 +175,15 @@ class Document(models.Model, FulltextMixin):
|
|||
if self.extension == 'pdf':
|
||||
prefix = 2
|
||||
value = self.pages
|
||||
elif self.extension == 'epub':
|
||||
prefix = 3
|
||||
value = self.pages
|
||||
elif self.extension == 'html':
|
||||
prefix = 1
|
||||
value = self.dimensions
|
||||
else:
|
||||
if self.extension == 'html':
|
||||
prefix = 1
|
||||
value = self.dimensions
|
||||
else:
|
||||
prefix = 0
|
||||
value = self.width * self.height
|
||||
prefix = 0
|
||||
value = self.width * self.height
|
||||
if value < 0:
|
||||
value = 0
|
||||
s.dimensions = ox.sort_string('%d' % prefix) + ox.sort_string('%d' % value)
|
||||
|
|
@ -390,7 +393,7 @@ class Document(models.Model, FulltextMixin):
|
|||
|
||||
@property
|
||||
def dimensions(self):
|
||||
if self.extension == 'pdf':
|
||||
if self.extension in ('pdf', 'epub'):
|
||||
return self.pages
|
||||
elif self.extension == 'html':
|
||||
return len(self.data.get('text', '').split(' '))
|
||||
|
|
@ -564,6 +567,13 @@ class Document(models.Model, FulltextMixin):
|
|||
path = os.path.join(folder, '%dp%d,%s.jpg' % (size, page, ','.join(map(str, crop))))
|
||||
if not os.path.exists(path):
|
||||
resize_image(src, path, size=size)
|
||||
elif self.extension == 'epub':
|
||||
path = os.path.join(folder, '1024.jpg')
|
||||
if os.path.exists(src) and not os.path.exists(path):
|
||||
data = epub.cover(src)
|
||||
if data:
|
||||
with open(path, "wb") as fd:
|
||||
fd.write(data)
|
||||
elif self.extension in ('jpg', 'png', 'gif', 'webp', 'heic', 'heif', 'cr2'):
|
||||
if os.path.exists(src):
|
||||
if size and page:
|
||||
|
|
@ -607,17 +617,24 @@ class Document(models.Model, FulltextMixin):
|
|||
self.width = -1
|
||||
self.height = -1
|
||||
self.pages = utils.pdfpages(self.file.path)
|
||||
elif self.extension == 'epub':
|
||||
thumb = self.thumbnail(1024)
|
||||
if thumb:
|
||||
self.width, self.height = open_image_rgb(thumb).size
|
||||
self.pages = 1
|
||||
elif self.width == -1:
|
||||
self.pages = -1
|
||||
self.width, self.height = open_image_rgb(self.file.path).size
|
||||
|
||||
def get_ratio(self):
|
||||
if self.extension == 'pdf':
|
||||
if self.extension in ('pdf', 'epub'):
|
||||
image = self.thumbnail(1024)
|
||||
try:
|
||||
size = Image.open(image).size
|
||||
except:
|
||||
size = [1, 1]
|
||||
elif self.extension == 'epub':
|
||||
size = [1, 1]
|
||||
else:
|
||||
if self.width > 0:
|
||||
size = self.resolution
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ import mimetypes
|
|||
import os
|
||||
import re
|
||||
import unicodedata
|
||||
import zipfile
|
||||
|
||||
import ox
|
||||
from ox.utils import json
|
||||
|
|
@ -15,7 +16,7 @@ from oxdjango.shortcuts import render_to_json_response, get_object_or_404_json,
|
|||
from django import forms
|
||||
from django.conf import settings
|
||||
from django.db.models import Count, Sum
|
||||
from django.http import HttpResponse
|
||||
from django.http import HttpResponse, Http404
|
||||
from django.shortcuts import render
|
||||
|
||||
from item import utils
|
||||
|
|
@ -557,3 +558,24 @@ def document(request, fragment):
|
|||
context['url'] = request.build_absolute_uri('/documents/' + fragment)
|
||||
context['settings'] = settings
|
||||
return render(request, "document.html", context)
|
||||
|
||||
def epub(request, id, filename):
    """Serve the epub reader page or a single file from inside an epub.

    request: the current request.
    id: id of the document, as captured by the URL pattern.
    filename: name of a member of the epub archive; '' renders the
        "epub.html" reader template instead.

    Raises Http404 if the user has no access to the document, if the
    document is not an epub, or if *filename* is not in the archive.
    """
    document = get_document_or_404_json(request, id)
    if not document.access(request.user):
        raise Http404
    if document.extension != 'epub':
        raise Http404
    if filename == '':
        # The reader page does not need the archive, so it is not opened.
        return render(request, "epub.html", {"epub": document})
    with zipfile.ZipFile(document.file.path) as z:
        if filename not in z.namelist():
            raise Http404
        content = z.read(filename)
    # Look up the override by file extension (the last dot-separated part);
    # anything else falls back to mimetypes, then to text/plain.
    extension = filename.split('.')[-1]
    content_type = {
        'xpgt': 'application/vnd.adobe-page-template+xml'
    }.get(extension, mimetypes.guess_type(filename)[0]) or 'text/plain'
    return HttpResponse(content, content_type=content_type)
|
||||
|
|
|
|||
176
pandora/templates/epub.html
Normal file
176
pandora/templates/epub.html
Normal file
|
|
@ -0,0 +1,176 @@
|
|||
<!DOCTYPE html>
|
||||
<html class="no-js">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
|
||||
<title></title>
|
||||
<meta name="description" content="">
|
||||
<meta name="viewport" content="width=device-width, user-scalable=no">
|
||||
<meta name="apple-mobile-web-app-capable" content="yes">
|
||||
|
||||
<link rel="stylesheet" href="/static/epub.js/css/normalize.css?3">
|
||||
<link rel="stylesheet" href="/static/epub.js/css/main.css?3">
|
||||
<link rel="stylesheet" href="/static/epub.js/css/popup.css?3">
|
||||
<link rel="stylesheet" href="/static/epub.js/css/annotations.css?3">
|
||||
<style>
|
||||
.arrow {
|
||||
-webkit-user-select: none;
|
||||
-moz-user-select: none;
|
||||
-o-user-select: none;
|
||||
-ms-user-select: none;
|
||||
user-select: text;
|
||||
|
||||
}
|
||||
#metainfo {
|
||||
display: none !important;
|
||||
}
|
||||
#main {
|
||||
border-radius: 0px;
|
||||
-webkit-transition: -webkit-transform .4s, width .2s;
|
||||
-moz-transition: -webkit-transform .4s, width .2s;
|
||||
-ms-transition: -webkit-transform .4s, width .2s;
|
||||
|
||||
-moz-box-shadow: none;
|
||||
-webkit-box-shadow: none;
|
||||
-ms-box-shadow: none;
|
||||
box-shadow: none;
|
||||
}
|
||||
#sidebar {
|
||||
background: #fff;
|
||||
}
|
||||
#panels a {
|
||||
visibility: hidden;
|
||||
width: 18px;
|
||||
height: 20px;
|
||||
overflow: hidden;
|
||||
display: inline-block;
|
||||
color: #444;
|
||||
margin-left: 6px;
|
||||
}
|
||||
|
||||
#panels a::before {
|
||||
visibility: visible;
|
||||
}
|
||||
|
||||
#panels a:hover {
|
||||
color: #999;
|
||||
}
|
||||
|
||||
#panels a:active {
|
||||
color: #999;
|
||||
margin: 1px 0 -1px 6px;
|
||||
}
|
||||
|
||||
#panels a.active,
|
||||
#panels a.active:hover {
|
||||
color: #999;
|
||||
}
|
||||
.list_item a {
|
||||
color: #999;
|
||||
}
|
||||
|
||||
.list_item.currentChapter > a,
|
||||
.list_item a:hover {
|
||||
color: #333;
|
||||
}
|
||||
|
||||
/* #tocView li.openChapter > a, */
|
||||
.list_item a:hover {
|
||||
color: #333;
|
||||
}
|
||||
#panels {
    padding-left: 14px;
    background: #eee;
    -moz-box-shadow: none;
    -webkit-box-shadow: none;
    -ms-box-shadow: none;
    box-shadow: none; /* 0px 1px 3px rgba(0,0,0,.3); */
}
|
||||
#divider.show {
|
||||
display: none;
|
||||
}
|
||||
</style>
|
||||
|
||||
<script src="/static/oxjs/min/Ox.js?3"></script>
|
||||
<script src="/static/epub.js/js/libs/jquery.min.js?3"></script>
|
||||
<script src="/static/epub.js/js/libs/zip.min.js?3"></script>
|
||||
<script src="/static/reader/epub.js?3"></script>
|
||||
<!-- Render -->
|
||||
<script src="/static/epub.js/js/epub.js?3"></script>
|
||||
<!-- Reader -->
|
||||
<script src="/static/epub.js/js/reader.js?3"></script>
|
||||
|
||||
<!-- Plugins -->
|
||||
<!-- <script src="js/plugins/search.js"></script> -->
|
||||
<!-- Highlights -->
|
||||
<!-- <script src="/static/epub.js/js/hooks/extensions/highlight.js"></script> -->
|
||||
</head>
|
||||
<body>
|
||||
<div id="sidebar">
|
||||
<div id="panels">
|
||||
<!--
|
||||
<input id="searchBox" placeholder="search" type="search">
|
||||
|
||||
<a id="show-Search" class="show_view icon-search" data-view="Search">Search</a>
|
||||
-->
|
||||
<a id="show-Toc" class="show_view icon-list-1 active" data-view="Toc">TOC</a>
|
||||
<a id="show-Bookmarks" class="show_view icon-bookmark" data-view="Bookmarks">Bookmarks</a>
|
||||
<!--
|
||||
<a id="show-Notes" class="show_view icon-edit" data-view="Notes">Notes</a>
|
||||
-->
|
||||
</div>
|
||||
<div id="tocView" class="view">
|
||||
</div>
|
||||
<div id="searchView" class="view">
|
||||
<ul id="searchResults"></ul>
|
||||
</div>
|
||||
<div id="bookmarksView" class="view">
|
||||
<ul id="bookmarks"></ul>
|
||||
</div>
|
||||
<div id="notesView" class="view">
|
||||
<div id="new-note">
|
||||
<textarea id="note-text"></textarea>
|
||||
<button id="note-anchor">Anchor</button>
|
||||
</div>
|
||||
<ol id="notes"></ol>
|
||||
</div>
|
||||
</div>
|
||||
<div id="main">
|
||||
|
||||
<div id="titlebar">
|
||||
<div id="opener">
|
||||
<a id="slider" class="icon-menu">Menu</a>
|
||||
</div>
|
||||
<div id="metainfo">
|
||||
<span id="book-title"></span>
|
||||
<span id="title-seperator"> – </span>
|
||||
<span id="chapter-title"></span>
|
||||
</div>
|
||||
<div id="title-controls">
|
||||
<a id="bookmark" class="icon-bookmark-empty">Bookmark</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="divider"></div>
|
||||
<div id="prev" class="arrow">‹</div>
|
||||
<div id="viewer"></div>
|
||||
<div id="next" class="arrow">›</div>
|
||||
|
||||
<div id="loader"><img src="/static/epub.js/img/loader.gif"></div>
|
||||
</div>
|
||||
<div class="modal md-effect-1" id="settings-modal">
|
||||
<div class="md-content">
|
||||
<h3>Settings</h3>
|
||||
<div>
|
||||
<p>
|
||||
<input type="checkbox" id="sidebarReflow" name="sidebarReflow">Reflow text when sidebars are open.</input>
|
||||
</p>
|
||||
</div>
|
||||
<div class="closer icon-cancel-circled"></div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="overlay"></div>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
|
|
@ -53,6 +53,7 @@ urlpatterns += [
|
|||
re_path(r'^resetUI$', user.views.reset_ui),
|
||||
re_path(r'^collection/(?P<id>.*?)/icon(?P<size>\d*).jpg$', documentcollection.views.icon),
|
||||
re_path(r'^documents/(?P<id>[A-Z0-9]+)/(?P<size>\d*)p(?P<page>[\d,]*).jpg$', document.views.thumbnail),
|
||||
re_path(r'^documents/(?P<id>[A-Z0-9]+)/epub/(?P<filename>.*?)$', document.views.epub),
|
||||
re_path(r'^documents/(?P<id>[A-Z0-9]+)/(?P<name>.*?\.[^\d]{3,4})$', document.views.file),
|
||||
re_path(r'^documents/(?P<fragment>.*?)$', document.views.document),
|
||||
re_path(r'^edit/(?P<id>.*?)/icon(?P<size>\d*).jpg$', edit.views.icon),
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue