Ignore decoding errors for non-UTF-8 HTML files

This commit is contained in:
j 2016-01-03 21:00:30 +05:30
parent f8827a2a5c
commit 051b634008
1 changed file with 2 additions and 2 deletions

View File

@@ -46,7 +46,7 @@ def cover(path):
elif 'html' in e.attrib['media-type']: elif 'html' in e.attrib['media-type']:
filename = unquote(e.attrib['href']) filename = unquote(e.attrib['href'])
filename = os.path.normpath(os.path.join(os.path.dirname(opf[0]), filename)) filename = os.path.normpath(os.path.join(os.path.dirname(opf[0]), filename))
html = z.read(filename).decode('utf-8') html = z.read(filename).decode('utf-8', 'ignore')
img = re.compile('<img.*?src="(.*?)"').findall(html) img = re.compile('<img.*?src="(.*?)"').findall(html)
#svg image #svg image
img += re.compile('<image.*?href="(.*?)"').findall(html) img += re.compile('<image.*?href="(.*?)"').findall(html)
@@ -110,7 +110,7 @@ def extract_text(path):
if '/._' in f.filename or f.filename.startswith('._'): if '/._' in f.filename or f.filename.startswith('._'):
continue continue
if f.filename.endswith('html'): if f.filename.endswith('html'):
data += z.read(f.filename).decode() data += z.read(f.filename).decode('utf-8', 'ignore')
return data return data
def extract_isbn(data): def extract_isbn(data):