faster way to detect encoding, speeds up getUrlUnicode on large pages
parent 22f6b39ad0
commit ff477eda17
2 changed files with 13 additions and 4 deletions
@@ -13,7 +13,7 @@ import chardet
 import simplejson
 
 import net
-from net import DEFAULT_HEADERS
+from net import DEFAULT_HEADERS, getEncoding
 
 
 cache_timeout = 30*24*60*60 # default is 30 days
@@ -67,7 +67,7 @@ def getUrl(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
 
 def getUrlUnicode(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, _getUrl=getUrl):
     data = _getUrl(url, data, headers, timeout)
-    encoding = chardet.detect(data)['encoding']
+    encoding = getEncoding(data)
     if not encoding:
         encoding = 'latin-1'
     return unicode(data, encoding)
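For context, a minimal self-contained sketch of the decode-and-fallback pattern in the cached getUrlUnicode above. The stub fetcher, the fixture bytes and the simplified defaults are illustrative assumptions; only getEncoding (the helper this commit adds to net, shown in the second file below) and the latin-1 fallback come from the change itself.

# Sketch only: stubGetUrl and the fixture string are hypothetical.
from chardet.universaldetector import UniversalDetector

def getEncoding(data):
    # Same incremental detection this commit adds to net.
    detector = UniversalDetector()
    for line in data.split('\n'):
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    return detector.result['encoding']

def stubGetUrl(url, data, headers, timeout):
    # Hypothetical stand-in for the cached getUrl fetcher.
    return '<html><body>caf\xe9 au lait</body></html>'

def getUrlUnicode(url, data=None, headers=None, timeout=0, _getUrl=stubGetUrl):
    data = _getUrl(url, data, headers, timeout)
    encoding = getEncoding(data)
    if not encoding:
        encoding = 'latin-1'  # fall back when detection returns nothing
    return unicode(data, encoding)

print repr(getUrlUnicode('http://example.com/'))

Because _getUrl is a keyword argument (defaulting to the module's own cached getUrl in the diff above), the same decode logic works with any injected fetcher.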
@@ -5,7 +5,7 @@ import StringIO
 import urllib
 import urllib2
 
-import chardet
+from chardet.universaldetector import UniversalDetector
 
 
 # Default headers for HTTP requests.
@@ -57,8 +57,17 @@ def getUrl(url, data=None, headers=DEFAULT_HEADERS, returnHeaders=False):
 
 def getUrlUnicode(url):
     data = getUrl(url)
-    encoding = chardet.detect(data)['encoding']
+    encoding = getEncoding(data)
     if not encoding:
         encoding = 'latin-1'
     return unicode(data, encoding)
 
+def getEncoding(data):
+    detector = UniversalDetector()
+    for line in data.split('\n'):
+        detector.feed(line)
+        if detector.done:
+            break
+    detector.close()
+    return detector.result['encoding']
+
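The speedup claimed in the commit message comes from how getEncoding consumes the page: chardet.detect analyses the entire document in one call, while UniversalDetector is fed line by line and stops as soon as it is confident, so on a large page only a small prefix is usually examined. A rough timing sketch under that assumption ('sample.html' is a hypothetical large saved page, not a file from this repository):

# Rough comparison sketch; 'sample.html' is a hypothetical local copy of a large page.
import time

import chardet
from chardet.universaldetector import UniversalDetector

data = open('sample.html').read()

# Old approach: run detection over the whole document at once.
t = time.time()
print 'chardet.detect:    %s (%.2fs)' % (chardet.detect(data)['encoding'], time.time() - t)

# New approach, as in getEncoding above: feed line by line and stop early.
t = time.time()
detector = UniversalDetector()
for line in data.split('\n'):
    detector.feed(line)
    if detector.done:
        break
detector.close()
print 'UniversalDetector: %s (%.2fs)' % (detector.result['encoding'], time.time() - t)

If the detector never becomes confident, close() still fills in its best guess, and getUrlUnicode falls back to latin-1 when that guess is empty.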