faster and more reliable encoding detection of html content
This commit is contained in:
parent
3165e3a8b1
commit
f535b82e7b
1 changed file with 7 additions and 5 deletions
12
ox/net.py
12
ox/net.py
|
@@ -3,6 +3,7 @@
|
||||||
# GPL 2008
|
# GPL 2008
|
||||||
import os
|
import os
|
||||||
import gzip
|
import gzip
|
||||||
|
import re
|
||||||
import StringIO
|
import StringIO
|
||||||
import struct
|
import struct
|
||||||
import urllib
|
import urllib
|
||||||
|
@@ -70,11 +71,12 @@ def read_url(url, data=None, headers=DEFAULT_HEADERS, return_headers=False, unic
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def detect_encoding(data):
|
def detect_encoding(data):
|
||||||
if 'content="text/html; charset=utf-8"' in data.lower() or \
|
data_lower = data.lower()
|
||||||
'meta charset="utf-8"' in data.lower():
|
charset = re.compile('content="text/html; charset=(.*?)"').findall(data)
|
||||||
return 'utf-8'
|
if not charset:
|
||||||
elif 'content="text/html; charset=iso-8859-1"' in data:
|
charset = re.compile('meta charset="(.*?)"').findall(data)
|
||||||
return 'iso-8859-1'
|
if charset:
|
||||||
|
return charset[0].lower()
|
||||||
detector = UniversalDetector()
|
detector = UniversalDetector()
|
||||||
for line in data.split('\n'):
|
for line in data.split('\n'):
|
||||||
detector.feed(line)
|
detector.feed(line)
|
||||||
|
|
Loading…
Reference in a new issue