better epub parsing

- don't fail if an epub is not a valid zip file
- handle percent-quoted filenames (unquote manifest hrefs and img src paths)
- don't fail if a referenced image file is missing from the archive
This commit is contained in:
j 2015-11-16 16:02:45 +01:00
parent b66f8cd026
commit a24061518a

View file

@ -7,6 +7,7 @@ import xml.etree.ElementTree as ET
import zipfile import zipfile
from io import BytesIO from io import BytesIO
import re import re
from urllib.parse import unquote
from PIL import Image from PIL import Image
import stdnum.isbn import stdnum.isbn
@ -18,34 +19,42 @@ logger = logging.getLogger('oml.media.epub')
def cover(path): def cover(path):
logger.debug('cover %s', path) logger.debug('cover %s', path)
z = zipfile.ZipFile(path)
data = None data = None
try:
z = zipfile.ZipFile(path)
except zipfile.BadZipFile:
logger.debug('invalid epub file %s', path)
return data
for f in z.filelist: for f in z.filelist:
if 'cover' in f.filename.lower() and f.filename.split('.')[-1] in ('jpg', 'jpeg', 'png'): if 'cover' in f.filename.lower() and f.filename.split('.')[-1] in ('jpg', 'jpeg', 'png'):
logger.debug('using %s', f.filename) logger.debug('using %s', f.filename)
data = z.read(f.filename) data = z.read(f.filename)
break break
if not data: if not data:
opf = [f.filename for f in z.filelist if f.filename.endswith('opf')] files = [f.filename for f in z.filelist]
opf = [f for f in files if f.endswith('opf')]
if opf: if opf:
info = ET.fromstring(z.read(opf[0])) info = ET.fromstring(z.read(opf[0]))
manifest = info.findall('{http://www.idpf.org/2007/opf}manifest')[0] manifest = info.findall('{http://www.idpf.org/2007/opf}manifest')[0]
for e in manifest.getchildren(): for e in manifest.getchildren():
if 'image' in e.attrib['media-type']: if 'image' in e.attrib['media-type']:
filename = e.attrib['href'] filename = unquote(e.attrib['href'])
filename = os.path.normpath(os.path.join(os.path.dirname(opf[0]), filename)) filename = os.path.normpath(os.path.join(os.path.dirname(opf[0]), filename))
data = z.read(filename) if filename in files:
break data = z.read(filename)
break
elif 'html' in e.attrib['media-type']: elif 'html' in e.attrib['media-type']:
filename = e.attrib['href'] filename = unquote(e.attrib['href'])
filename = os.path.normpath(os.path.join(os.path.dirname(opf[0]), filename)) filename = os.path.normpath(os.path.join(os.path.dirname(opf[0]), filename))
html = z.read(filename).decode('utf-8') html = z.read(filename).decode('utf-8')
img = re.compile('<img.*?src="(.*?)"').findall(html) img = re.compile('<img.*?src="(.*?)"').findall(html)
if img: if img:
img = os.path.normpath(os.path.join(os.path.dirname(filename), img[0])) img = unquote(img[0])
logger.debug('using %s', img) img = os.path.normpath(os.path.join(os.path.dirname(filename), img))
data = z.read(img) if img in files:
break logger.debug('using %s', img)
data = z.read(img)
break
if not data: if not data:
img = Image.new('RGB', (80, 128)) img = Image.new('RGB', (80, 128))
o = BytesIO() o = BytesIO()
@ -56,7 +65,11 @@ def cover(path):
def info(epub): def info(epub):
data = {} data = {}
z = zipfile.ZipFile(epub) try:
z = zipfile.ZipFile(epub)
except zipfile.BadZipFile:
logger.debug('invalid epub file %s', epub)
return data
opf = [f.filename for f in z.filelist if f.filename.endswith('opf')] opf = [f.filename for f in z.filelist if f.filename.endswith('opf')]
if opf: if opf:
info = ET.fromstring(z.read(opf[0])) info = ET.fromstring(z.read(opf[0]))