faster way to detect encoding, speeds up getUrlUnicode on large pages

commit ff477eda17
parent 22f6b39ad0
Author: j
Date:   2008-06-17 12:53:29 +02:00

2 changed files with 13 additions and 4 deletions


@@ -13,7 +13,7 @@ import chardet
 import simplejson
 import net
-from net import DEFAULT_HEADERS
+from net import DEFAULT_HEADERS, getEncoding
 cache_timeout = 30*24*60*60 # default is 30 days
@@ -67,7 +67,7 @@ def getUrl(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
 def getUrlUnicode(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, _getUrl=getUrl):
     data = _getUrl(url, data, headers, timeout)
-    encoding = chardet.detect(data)['encoding']
+    encoding = getEncoding(data)
     if not encoding:
         encoding = 'latin-1'
     return unicode(data, encoding)


@@ -5,7 +5,7 @@ import StringIO
 import urllib
 import urllib2
-import chardet
+from chardet.universaldetector import UniversalDetector
 # Default headers for HTTP requests.
@@ -57,8 +57,17 @@ def getUrl(url, data=None, headers=DEFAULT_HEADERS, returnHeaders=False):
 def getUrlUnicode(url):
     data = getUrl(url)
-    encoding = chardet.detect(data)['encoding']
+    encoding = getEncoding(data)
     if not encoding:
         encoding = 'latin-1'
     return unicode(data, encoding)
+
+def getEncoding(data):
+    detector = UniversalDetector()
+    for line in data.split('\n'):
+        detector.feed(line)
+        if detector.done:
+            break
+    detector.close()
+    return detector.result['encoding']
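
The speedup comes from chardet's UniversalDetector: chardet.detect() analyses the whole page before returning a result, while the detector can be fed the body piece by piece and stops as soon as it is confident. Below is a minimal, self-contained sketch of the same pattern, assuming chardet is installed; the names detect_encoding and raw are placeholders for illustration, not part of this commit.

from chardet.universaldetector import UniversalDetector

def detect_encoding(raw):
    # Feed the raw page to the detector line by line and stop as soon as it
    # reports a confident result, instead of scanning the whole document.
    detector = UniversalDetector()
    for line in raw.split(b'\n'):
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    return detector.result['encoding']

print(detect_encoding(u'<html><body>große Seiten</body></html>'.encode('utf-8')))

Note that the latin-1 fallback in getUrlUnicode is still needed, since the detector can return None when it never reaches a confident guess.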