pages can also end with xml
This commit is contained in:
parent
8a92429587
commit
36fc6e7e73
1 changed file with 3 additions and 1 deletion
|
@@ -178,7 +178,9 @@ def extract_text(path):
|
||||||
for f in z.filelist:
|
for f in z.filelist:
|
||||||
if '/._' in f.filename or f.filename.startswith('._'):
|
if '/._' in f.filename or f.filename.startswith('._'):
|
||||||
continue
|
continue
|
||||||
if f.filename.endswith('html'):
|
if 'META-INF' in f.filename:
|
||||||
|
continue
|
||||||
|
if f.filename.split('.')[-1] in ('html', 'xml', 'htm'):
|
||||||
data += z.read(f.filename).decode('utf-8', 'ignore')
|
data += z.read(f.filename).decode('utf-8', 'ignore')
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue