faster way to detect encoding, speeds up getUrlUnicode on large pages

parent 22f6b39ad0
commit ff477eda17

2 changed files with 13 additions and 4 deletions
@@ -13,7 +13,7 @@ import chardet
 import simplejson
 
 import net
-from net import DEFAULT_HEADERS
+from net import DEFAULT_HEADERS, getEncoding
 
 
 cache_timeout = 30*24*60*60 # default is 30 days
@@ -67,7 +67,7 @@ def getUrl(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
 
 def getUrlUnicode(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, _getUrl=getUrl):
     data = _getUrl(url, data, headers, timeout)
-    encoding = chardet.detect(data)['encoding']
+    encoding = getEncoding(data)
     if not encoding:
         encoding = 'latin-1'
     return unicode(data, encoding)
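Note on the change above: like chardet.detect(), the new getEncoding() helper (added to the net module in the second file below) can return None when detection fails, so the existing latin-1 fallback is kept. A minimal usage sketch of the updated code path, assuming Python 2 as in the rest of the codebase (the URL is a placeholder):

# Sketch only, not part of the commit: decoding a fetched page
# with the new helper.
data = getUrl('http://example.com/')   # raw byte string
encoding = getEncoding(data)           # may be None if detection fails
if not encoding:
    encoding = 'latin-1'               # same fallback as before
page = unicode(data, encoding)         # decoded text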
@@ -5,7 +5,7 @@ import StringIO
 import urllib
 import urllib2
 
-import chardet
+from chardet.universaldetector import UniversalDetector
 
 
 # Default headers for HTTP requests.
@@ -57,8 +57,17 @@ def getUrl(url, data=None, headers=DEFAULT_HEADERS, returnHeaders=False):
 
 def getUrlUnicode(url):
     data = getUrl(url)
-    encoding = chardet.detect(data)['encoding']
+    encoding = getEncoding(data)
     if not encoding:
         encoding = 'latin-1'
     return unicode(data, encoding)
 
+def getEncoding(data):
+    detector = UniversalDetector()
+    for line in data.split('\n'):
+        detector.feed(line)
+        if detector.done:
+            break
+    detector.close()
+    return detector.result['encoding']
+
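The speedup comes from UniversalDetector's early exit: detector.done turns True as soon as the detector reaches a confident guess, so getEncoding() usually examines only the first part of a large page, while chardet.detect() always scans the entire string before answering. A rough, illustrative way to measure the difference (a sketch assuming Python 2 and the chardet package; 'large-page.html' is a placeholder sample file, and timings will vary):

# Illustrative micro-benchmark, not part of the commit.
import timeit
import chardet
from chardet.universaldetector import UniversalDetector

data = open('large-page.html').read()  # placeholder: any large page

def full_scan():
    # old approach: detect over the entire buffer
    return chardet.detect(data)['encoding']

def early_exit():
    # new approach: feed line by line, stop once confident
    detector = UniversalDetector()
    for line in data.split('\n'):
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    return detector.result['encoding']

print timeit.timeit(full_scan, number=10)
print timeit.timeit(early_exit, number=10)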