From ff477eda176335ed6dabd96ed96d74107a4b1d88 Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Tue, 17 Jun 2008 12:53:29 +0200 Subject: [PATCH] faster way to detect encoding, speeds up getUrlUnicode on large pages --- oxutils/cache.py | 4 ++-- oxutils/net.py | 13 +++++++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/oxutils/cache.py b/oxutils/cache.py index 00030dc..c6bff37 100644 --- a/oxutils/cache.py +++ b/oxutils/cache.py @@ -13,7 +13,7 @@ import chardet import simplejson import net -from net import DEFAULT_HEADERS +from net import DEFAULT_HEADERS, getEncoding cache_timeout = 30*24*60*60 # default is 30 days @@ -67,7 +67,7 @@ def getUrl(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout): def getUrlUnicode(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, _getUrl=getUrl): data = _getUrl(url, data, headers, timeout) - encoding = chardet.detect(data)['encoding'] + encoding = getEncoding(data) if not encoding: encoding = 'latin-1' return unicode(data, encoding) diff --git a/oxutils/net.py b/oxutils/net.py index 5f4fe0e..eeb3aac 100644 --- a/oxutils/net.py +++ b/oxutils/net.py @@ -5,7 +5,7 @@ import StringIO import urllib import urllib2 -import chardet +from chardet.universaldetector import UniversalDetector # Default headers for HTTP requests. @@ -57,8 +57,17 @@ def getUrl(url, data=None, headers=DEFAULT_HEADERS, returnHeaders=False): def getUrlUnicode(url): data = getUrl(url) - encoding = chardet.detect(data)['encoding'] + encoding = getEncoding(data) if not encoding: encoding = 'latin-1' return unicode(data, encoding) +def getEncoding(data): + detector = UniversalDetector() + for line in data.split('\n'): + detector.feed(line) + if detector.done: + break + detector.close() + return detector.result['encoding'] +