minimal support for epub documents

This commit is contained in:
j 2025-06-21 08:29:19 +02:00
commit 3c69c0c101
50 changed files with 30209 additions and 10 deletions

189
pandora/document/epub.py Normal file
View file

@ -0,0 +1,189 @@
import os
import xml.etree.ElementTree as ET
import zipfile
import re
from urllib.parse import unquote
import lxml.html
from io import BytesIO
from PIL import Image
from ox import strip_tags, decode_html, normalize_name
import logging
logging.getLogger('PIL').setLevel(logging.ERROR)
logger = logging.getLogger(__name__)
def get_ratio(data):
    """Return the width/height ratio of the image encoded in *data*.

    data: raw bytes of an image file.

    Returns width divided by height, or -1 when the bytes cannot be opened
    as an image or the division fails (for example a height of zero).
    """
    try:
        with Image.open(BytesIO(data)) as img:
            width, height = img.size
        return width / height
    except Exception:
        # Best effort: any decoding problem means "no usable ratio".
        # ``except Exception`` (instead of a bare ``except:``) lets
        # KeyboardInterrupt and SystemExit propagate.
        return -1
def normpath(path):
    """Normalise *path* and return it with ``/`` as the separator.

    ``os.path.normpath`` collapses ``.``/``..`` segments and repeated
    separators; the platform separator is then replaced by a forward slash.
    """
    normalized = os.path.normpath(path)
    return normalized.replace(os.sep, '/')
def cover(path):
    """Return the raw bytes of the most likely cover image of an epub.

    Candidates are tried in this order; the first one found decides the
    result:

    1. the first archive member whose name contains ``cover`` and ends in
       ``jpg``, ``jpeg`` or ``png`` (``calibre-logo.png`` is ignored),
    2. the manifest item referenced by ``<meta name="cover">`` in the OPF,
    3. the largest image listed in the OPF manifest,
    4. the first ``<img src>`` / ``<image href>`` found in an html
       document listed in the manifest.

    path: filesystem path of the epub file.

    Returns the image bytes, or None when the file is not a zip archive,
    the OPF is missing or malformed, no candidate exists, or the chosen
    candidate cannot be read or has a width/height ratio outside 0.3..2.
    """
    opf_ns = '{http://www.idpf.org/2007/opf}'
    image_extensions = ('jpg', 'jpeg', 'png')
    logger.debug('cover %s', path)
    try:
        z = zipfile.ZipFile(path)
    except zipfile.BadZipFile:
        logger.debug('invalid epub file %s', path)
        return None

    def use(filename):
        # Read a candidate; reject it when unreadable or oddly shaped.
        logger.debug('using %s', filename)
        try:
            data = z.read(filename)
        except Exception:
            return None
        r = get_ratio(data)
        if r < 0.3 or r > 2:
            return None
        return data

    def resolve(base, href):
        # Map a percent-encoded href, relative to the archive member
        # *base*, to an archive member name.
        return normpath(os.path.join(os.path.dirname(base), unquote(href)))

    # The context manager closes the archive on every return path.
    with z:
        files = []
        for f in z.filelist:
            if f.filename == 'calibre-logo.png':
                continue
            lowered = f.filename.lower()
            # Compare name and extension case-insensitively so that
            # e.g. ``Cover.JPG`` is recognised as well.
            if 'cover' in lowered and lowered.split('.')[-1] in image_extensions:
                return use(f.filename)
            files.append(f.filename)
        names = set(files)

        opf = [f for f in files if f.endswith('opf')]
        if not opf:
            return None
        try:
            package = ET.fromstring(z.read(opf[0]))
        except ET.ParseError:
            logger.debug('failed to parse %s', opf[0], exc_info=True)
            return None

        # Compare with None explicitly: the truth value of an Element
        # depends on its number of children and testing it is deprecated.
        metadata = package.find(opf_ns + 'metadata')
        manifest = package.find(opf_ns + 'manifest')
        if manifest is None:
            return None
        items = list(manifest)

        if metadata is not None:
            for meta in metadata:
                if meta.tag != opf_ns + 'meta' or meta.attrib.get('name') != 'cover':
                    continue
                cover_id = meta.attrib.get('content')
                if cover_id is None:
                    continue
                for item in items:
                    if item.attrib.get('id') != cover_id or 'href' not in item.attrib:
                        continue
                    filename = resolve(opf[0], item.attrib['href'])
                    if filename in names:
                        return use(filename)

        image_files = []
        for item in items:
            if 'image' not in item.attrib.get('media-type', ''):
                continue
            if 'href' not in item.attrib:
                continue
            filename = resolve(opf[0], item.attrib['href'])
            if filename in names:
                image_files.append(filename)
        if image_files:
            # The largest image is the best guess for a cover.
            image_files.sort(key=lambda name: z.getinfo(name).file_size)
            return use(image_files[-1])

        img_src = re.compile('<img.*?src="(.*?)"')
        # svg image
        svg_href = re.compile('<image.*?href="(.*?)"')
        for item in items:
            if 'html' not in item.attrib.get('media-type', ''):
                continue
            if 'href' not in item.attrib:
                continue
            filename = resolve(opf[0], item.attrib['href'])
            if filename not in names:
                # The manifest may list documents missing from the archive.
                continue
            html = z.read(filename).decode('utf-8', 'ignore')
            found = img_src.findall(html) + svg_href.findall(html)
            if found:
                img = resolve(filename, found[0])
                if img in names:
                    return use(img)
    return None
def _get_language(code):
    """Return the language name for a code such as ``en`` or ``en-US``.

    Only the part before the first ``-`` is looked up; when the lookup
    yields nothing the unchanged *code* is returned.
    """
    # Imported locally so this helper needs no new module-level import.
    # NOTE(review): assumes ox.iso.codeToLang returns a falsy value for
    # unknown codes - confirm against the installed python-ox.
    import ox.iso
    return ox.iso.codeToLang(code.split('-')[0]) or code


def info(epub):
    """Collect metadata from an epub file.

    epub: filesystem path of the epub file.

    Returns a dict; it is empty when the file is not a zip archive.
    Otherwise it holds one entry per non-empty child of the OPF
    ``metadata`` element, keyed by the tag name without namespace, with
    these adjustments:

    - ``creator`` is stored as ``author``, a list of names,
    - ``identifier`` is stored as ``isbn``,
    - ``description`` has markup and html entities removed,
    - ``date`` is cut at ``T``,
    - ``language`` is mapped through ``_get_language``,
    - ``tableofcontents`` is taken from ``toc.ncx`` or, failing that, from
      the document the OPF guide references as ``toc``,
    - ``textsize`` is the length of ``extract_text(epub)``.

    A missing or malformed OPF only skips the values derived from it.
    """
    opf_ns = '{http://www.idpf.org/2007/opf}'
    ncx_ns = '{http://www.daisy.org/z3986/2005/ncx/}'
    data = {}
    try:
        z = zipfile.ZipFile(epub)
    except zipfile.BadZipFile:
        logger.debug('invalid epub file %s', epub)
        return data

    # The context manager closes the archive when parsing is done.
    with z:
        files = z.namelist()
        opf = [f for f in files if f.endswith('opf')]
        package = None
        if opf:
            try:
                package = ET.fromstring(z.read(opf[0]))
            except ET.ParseError:
                logger.debug('failed to parse %s', opf[0], exc_info=True)

        metadata = package.find(opf_ns + 'metadata') if package is not None else None
        if metadata is not None:
            for e in metadata:
                value = (e.text or '').strip()
                if not value or value in ('unknown', 'none'):
                    continue
                key = e.tag.split('}')[-1]
                key = {
                    'creator': 'author',
                }.get(key, key)
                if key == 'identifier':
                    # NOTE(review): every identifier is stored as isbn
                    # without validation - confirm this is intended.
                    data['isbn'] = value
                elif key == 'author':
                    names = value.split(', ')
                    # Two single-word parts look like "Last, First".
                    if len(names) == 2 and max(len(n.split(' ')) for n in names) == 1:
                        names = [normalize_name(', '.join(names))]
                    data[key] = names
                else:
                    data[key] = value

        toc = [f for f in files if 'toc.ncx' in f]
        if toc:
            try:
                nav_map = ET.fromstring(z.read(toc[0])).find(ncx_ns + 'navMap')
            except Exception:
                logger.debug('failed to parse toc', exc_info=True)
                nav_map = None
            # Compare with None explicitly: the truth value of an Element
            # depends on its number of children and testing it is deprecated.
            if nav_map is not None:
                contents = []
                for point in nav_map.findall(ncx_ns + 'navPoint'):
                    label = point.find(ncx_ns + 'navLabel')
                    if label is None:
                        continue
                    # The first child of navLabel carries the entry text.
                    first = next(iter(label), None)
                    if first is not None and first.text:
                        contents.append(first.text)
                if contents:
                    data['tableofcontents'] = '\n'.join(contents).strip()

        if package is not None and 'tableofcontents' not in data:
            guide = package.find(opf_ns + 'guide')
            references = guide.findall(opf_ns + 'reference') if guide is not None else []
            for ref in references:
                if ref.attrib.get('type') != 'toc' or 'href' not in ref.attrib:
                    continue
                filename = unquote(ref.attrib['href']).split('#')[0]
                filename = normpath(os.path.join(os.path.dirname(opf[0]), filename))
                if filename in files:
                    toc_html = z.read(filename)
                    if toc_html:
                        doc = lxml.html.document_fromstring(toc_html)
                        data['tableofcontents'] = '\n'.join(
                            [a.text_content() for a in doc.xpath('//a')]
                        ).strip()

    if 'description' in data:
        data['description'] = strip_tags(decode_html(data['description']))
    text = extract_text(epub)
    data['textsize'] = len(text)
    if 'date' in data and 'T' in data['date']:
        data['date'] = data['date'].split('T')[0]
    if 'language' in data and isinstance(data['language'], str):
        data['language'] = _get_language(data['language'])
    # Drop values that ended up empty after the clean-up above.
    for key in list(data):
        if isinstance(data[key], str) and not data[key].strip():
            del data[key]
    return data
def extract_text(path):
    """Return the concatenated content documents of an epub as one string.

    Every archive member whose name ends in ``html``, ``xhtml``, ``htm``
    or ``xml`` is decoded as UTF-8 (undecodable bytes are dropped) and
    appended in archive order; markup is not removed. Members named
    ``._*`` (presumably macOS resource-fork files) and members whose name
    contains ``META-INF`` are skipped.

    path: filesystem path of the epub file.

    Returns the joined text, or an empty string when no member matches.

    Raises zipfile.BadZipFile when *path* is not a zip archive.
    """
    # ``xhtml`` is included because EPUB content documents commonly use
    # that extension; without it such books yield no text at all.
    extensions = ('html', 'xhtml', 'xml', 'htm')
    parts = []
    # The context manager closes the archive even if a read fails.
    with zipfile.ZipFile(path) as z:
        for f in z.filelist:
            name = f.filename
            if '/._' in name or name.startswith('._'):
                continue
            if 'META-INF' in name:
                continue
            if name.split('.')[-1] in extensions:
                parts.append(z.read(name).decode('utf-8', 'ignore'))
    return ''.join(parts)

View file

@ -5,6 +5,8 @@ import tempfile
from django.conf import settings
from . import epub
logger = logging.getLogger('pandora.' + __name__)
@ -55,6 +57,8 @@ class FulltextMixin:
if self.file:
if self.extension == 'pdf':
return extract_text(self.file.path)
elif self.extension == 'epub':
return epub.extract_text(self.file.path)
elif self.extension in IMAGE_EXTENSIONS:
return ocr_image(self.file.path)
elif self.extension in CONVERT_EXTENSIONS:
@ -184,6 +188,9 @@ class FulltextPageMixin(FulltextMixin):
if self.document.file:
if self.document.extension == 'pdf':
return extract_text(self.document.file.path, self.page)
elif self.extension == 'epub':
# FIXME: is there a nice way to split that into pages
return epub.extract_text(self.file.path)
elif self.extension in IMAGE_EXTENSIONS:
return ocr_image(self.document.file.path)
elif self.extension == 'html':

View file

@ -30,6 +30,7 @@ from user.utils import update_groups
from . import managers
from . import utils
from . import tasks
from . import epub
from .fulltext import FulltextMixin, FulltextPageMixin
User = get_user_model()
@ -174,13 +175,15 @@ class Document(models.Model, FulltextMixin):
if self.extension == 'pdf':
prefix = 2
value = self.pages
elif self.extension == 'epub':
prefix = 3
value = self.pages
elif self.extension == 'html':
prefix = 1
value = self.dimensions
else:
if self.extension == 'html':
prefix = 1
value = self.dimensions
else:
prefix = 0
value = self.width * self.height
prefix = 0
value = self.width * self.height
if value < 0:
value = 0
s.dimensions = ox.sort_string('%d' % prefix) + ox.sort_string('%d' % value)
@ -390,7 +393,7 @@ class Document(models.Model, FulltextMixin):
@property
def dimensions(self):
if self.extension == 'pdf':
if self.extension in ('pdf', 'epub'):
return self.pages
elif self.extension == 'html':
return len(self.data.get('text', '').split(' '))
@ -564,6 +567,13 @@ class Document(models.Model, FulltextMixin):
path = os.path.join(folder, '%dp%d,%s.jpg' % (size, page, ','.join(map(str, crop))))
if not os.path.exists(path):
resize_image(src, path, size=size)
elif self.extension == 'epub':
path = os.path.join(folder, '1024.jpg')
if os.path.exists(src) and not os.path.exists(path):
data = epub.cover(src)
if data:
with open(path, "wb") as fd:
fd.write(data)
elif self.extension in ('jpg', 'png', 'gif', 'webp', 'heic', 'heif', 'cr2'):
if os.path.exists(src):
if size and page:
@ -607,17 +617,24 @@ class Document(models.Model, FulltextMixin):
self.width = -1
self.height = -1
self.pages = utils.pdfpages(self.file.path)
elif self.extension == 'epub':
thumb = self.thumbnail(1024)
if thumb:
self.width, self.height = open_image_rgb(thumb).size
self.pages = 1
elif self.width == -1:
self.pages = -1
self.width, self.height = open_image_rgb(self.file.path).size
def get_ratio(self):
if self.extension == 'pdf':
if self.extension in ('pdf', 'epub'):
image = self.thumbnail(1024)
try:
size = Image.open(image).size
except:
size = [1, 1]
elif self.extension == 'epub':
size = [1, 1]
else:
if self.width > 0:
size = self.resolution

View file

@ -5,6 +5,7 @@ import mimetypes
import os
import re
import unicodedata
import zipfile
import ox
from ox.utils import json
@ -15,7 +16,7 @@ from oxdjango.shortcuts import render_to_json_response, get_object_or_404_json,
from django import forms
from django.conf import settings
from django.db.models import Count, Sum
from django.http import HttpResponse
from django.http import HttpResponse, Http404
from django.shortcuts import render
from item import utils
@ -557,3 +558,24 @@ def document(request, fragment):
context['url'] = request.build_absolute_uri('/documents/' + fragment)
context['settings'] = settings
return render(request, "document.html", context)
def epub(request, id, filename):
    """Serve the epub reader page or a single member of an epub document.

    request: the current HTTP request.
    id: document id captured from the URL.
    filename: name of an archive member; the empty string selects the
        reader page (``epub.html``).

    Returns the rendered reader page, or an HttpResponse with the raw
    bytes of the requested member.

    Raises Http404 when the user has no access, the document is not an
    epub, the file is not a zip archive or the member does not exist.
    """
    document = get_document_or_404_json(request, id)
    if not document.access(request.user):
        raise Http404
    if document.extension != 'epub':
        raise Http404
    if filename == '':
        # The reader page does not need the archive to be opened.
        context = {}
        context["epub"] = document
        return render(request, "epub.html", context)
    try:
        # The context manager closes the archive after the read.
        with zipfile.ZipFile(document.file.path) as z:
            content = z.read(filename)
    except (KeyError, zipfile.BadZipFile):
        # KeyError: no such member; BadZipFile: not a zip archive.
        raise Http404
    # Look up the file extension (the last dot-separated part), not the
    # first name component, so ``page-template.xpgt`` gets its type.
    extension = filename.rsplit('.', 1)[-1].lower()
    content_type = {
        'xpgt': 'application/vnd.adobe-page-template+xml'
    }.get(extension) or mimetypes.guess_type(filename)[0] or 'text/plain'
    # NOTE(review): members are served verbatim with their guessed type,
    # including text/html - confirm uploaded epubs are trusted.
    response = HttpResponse(content, content_type=content_type)
    return response

176
pandora/templates/epub.html Normal file
View file

@ -0,0 +1,176 @@
<!DOCTYPE html>
<html class="no-js">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<title></title>
<meta name="description" content="">
<meta name="viewport" content="width=device-width, user-scalable=no">
<meta name="apple-mobile-web-app-capable" content="yes">
<link rel="stylesheet" href="/static/epub.js/css/normalize.css?3">
<link rel="stylesheet" href="/static/epub.js/css/main.css?3">
<link rel="stylesheet" href="/static/epub.js/css/popup.css?3">
<link rel="stylesheet" href="/static/epub.js/css/annotations.css?3">
<style>
.arrow {
-webkit-user-select: none;
-moz-user-select: none;
-o-user-select: none;
-ms-user-select: none;
user-select: text;
}
#metainfo {
display: none !important;
}
#main {
border-radius: 0px;
-webkit-transition: -webkit-transform .4s, width .2s;
-moz-transition: -webkit-transform .4s, width .2s;
-ms-transition: -webkit-transform .4s, width .2s;
-moz-box-shadow: none;
-webkit-box-shadow: none;
-ms-box-shadow: none;
box-shadow: none;
}
#sidebar {
background: #fff;
}
#panels a {
visibility: hidden;
width: 18px;
height: 20px;
overflow: hidden;
display: inline-block;
color: #444;
margin-left: 6px;
}
#panels a::before {
visibility: visible;
}
#panels a:hover {
color: #999;
}
#panels a:active {
color: #999;
margin: 1px 0 -1px 6px;
}
#panels a.active,
#panels a.active:hover {
color: #999;
}
.list_item a {
color: #999;
}
.list_item.currentChapter > a,
.list_item a:hover {
color: #333;
}
/* #tocView li.openChapter > a, */
.list_item a:hover {
color: #333;
}
#panels {
padding-left: 14px;
background: #eee;
-moz-box-shadow: none;
-webkit-box-shadow: none;
-ms-box-shadow: none;
box-shadow: none; /* 0px 1px 3px rgba(0,0,0,.3); */
}
#divider.show {
display: none;
}
</style>
<script src="/static/oxjs/min/Ox.js?3"></script>
<script src="/static/epub.js/js/libs/jquery.min.js?3"></script>
<script src="/static/epub.js/js/libs/zip.min.js?3"></script>
<script src="/static/reader/epub.js?3"></script>
<!-- Render -->
<script src="/static/epub.js/js/epub.js?3"></script>
<!-- Reader -->
<script src="/static/epub.js/js/reader.js?3"></script>
<!-- Plugins -->
<!-- <script src="js/plugins/search.js"></script> -->
<!-- Highlights -->
<!-- <script src="/static/epub.js/js/hooks/extensions/highlight.js"></script> -->
</head>
<body>
<div id="sidebar">
<div id="panels">
<!--
<input id="searchBox" placeholder="search" type="search">
<a id="show-Search" class="show_view icon-search" data-view="Search">Search</a>
-->
<a id="show-Toc" class="show_view icon-list-1 active" data-view="Toc">TOC</a>
<a id="show-Bookmarks" class="show_view icon-bookmark" data-view="Bookmarks">Bookmarks</a>
<!--
<a id="show-Notes" class="show_view icon-edit" data-view="Notes">Notes</a>
-->
</div>
<div id="tocView" class="view">
</div>
<div id="searchView" class="view">
<ul id="searchResults"></ul>
</div>
<div id="bookmarksView" class="view">
<ul id="bookmarks"></ul>
</div>
<div id="notesView" class="view">
<div id="new-note">
<textarea id="note-text"></textarea>
<button id="note-anchor">Anchor</button>
</div>
<ol id="notes"></ol>
</div>
</div>
<div id="main">
<div id="titlebar">
<div id="opener">
<a id="slider" class="icon-menu">Menu</a>
</div>
<div id="metainfo">
<span id="book-title"></span>
<span id="title-seperator">&nbsp;&nbsp;&nbsp;&nbsp;</span>
<span id="chapter-title"></span>
</div>
<div id="title-controls">
<a id="bookmark" class="icon-bookmark-empty">Bookmark</a>
</div>
</div>
<div id="divider"></div>
<div id="prev" class="arrow"></div>
<div id="viewer"></div>
<div id="next" class="arrow"></div>
<div id="loader"><img src="/static/epub.js/img/loader.gif"></div>
</div>
<div class="modal md-effect-1" id="settings-modal">
<div class="md-content">
<h3>Settings</h3>
<div>
<p>
<input type="checkbox" id="sidebarReflow" name="sidebarReflow"> <label for="sidebarReflow">Reflow text when sidebars are open.</label>
</p>
</div>
<div class="closer icon-cancel-circled"></div>
</div>
</div>
<div class="overlay"></div>
</body>
</html>

View file

@ -53,6 +53,7 @@ urlpatterns += [
re_path(r'^resetUI$', user.views.reset_ui),
re_path(r'^collection/(?P<id>.*?)/icon(?P<size>\d*).jpg$', documentcollection.views.icon),
re_path(r'^documents/(?P<id>[A-Z0-9]+)/(?P<size>\d*)p(?P<page>[\d,]*).jpg$', document.views.thumbnail),
re_path(r'^documents/(?P<id>[A-Z0-9]+)/epub/(?P<filename>.*?)$', document.views.epub),
re_path(r'^documents/(?P<id>[A-Z0-9]+)/(?P<name>.*?\.[^\d]{3,4})$', document.views.file),
re_path(r'^documents/(?P<fragment>.*?)$', document.views.document),
re_path(r'^edit/(?P<id>.*?)/icon(?P<size>\d*).jpg$', edit.views.icon),