Pages can also end with xml or htm (not only html); skip META-INF entries

This commit is contained in:
j 2016-01-28 17:53:50 +05:30
parent 8a92429587
commit 36fc6e7e73

View file

@ -178,7 +178,9 @@ def extract_text(path):
for f in z.filelist:
if '/._' in f.filename or f.filename.startswith('._'):
continue
if f.filename.endswith('html'):
if 'META-INF' in f.filename:
continue
if f.filename.split('.')[-1] in ('html', 'xml', 'htm'):
data += z.read(f.filename).decode('utf-8', 'ignore')
return data