pages can also end with xml
This commit is contained in:
parent
8a92429587
commit
36fc6e7e73
1 changed file with 3 additions and 1 deletion
|
@@ -178,7 +178,9 @@ def extract_text(path):
|
|||
for f in z.filelist:
|
||||
if '/._' in f.filename or f.filename.startswith('._'):
|
||||
continue
|
||||
if f.filename.endswith('html'):
|
||||
if 'META-INF' in f.filename:
|
||||
continue
|
||||
if f.filename.split('.')[-1] in ('html', 'xml', 'htm'):
|
||||
data += z.read(f.filename).decode('utf-8', 'ignore')
|
||||
return data
|
||||
|
||||
|
|
Loading…
Reference in a new issue