diff --git a/oxutils/cache.py b/oxutils/cache.py
index 00030dc..c6bff37 100644
--- a/oxutils/cache.py
+++ b/oxutils/cache.py
@@ -13,7 +13,7 @@
 import chardet
 import simplejson
 
 import net
-from net import DEFAULT_HEADERS
+from net import DEFAULT_HEADERS, getEncoding
 
 cache_timeout = 30*24*60*60 # default is 30 days
@@ -67,7 +67,7 @@ def getUrl(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
 
 def getUrlUnicode(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, _getUrl=getUrl):
     data = _getUrl(url, data, headers, timeout)
-    encoding = chardet.detect(data)['encoding']
+    encoding = getEncoding(data)
     if not encoding:
         encoding = 'latin-1'
     return unicode(data, encoding)
diff --git a/oxutils/net.py b/oxutils/net.py
index 5f4fe0e..eeb3aac 100644
--- a/oxutils/net.py
+++ b/oxutils/net.py
@@ -5,7 +5,7 @@
 import StringIO
 import urllib
 import urllib2
 
-import chardet
+from chardet.universaldetector import UniversalDetector
 
 # Default headers for HTTP requests.
@@ -57,8 +57,17 @@ def getUrl(url, data=None, headers=DEFAULT_HEADERS, returnHeaders=False):
 
 def getUrlUnicode(url):
     data = getUrl(url)
-    encoding = chardet.detect(data)['encoding']
+    encoding = getEncoding(data)
     if not encoding:
         encoding = 'latin-1'
     return unicode(data, encoding)
 
+def getEncoding(data):
+    detector = UniversalDetector()
+    for line in data.split('\n'):
+        detector.feed(line)
+        if detector.done:
+            break
+    detector.close()
+    return detector.result['encoding']
+