allow custom getUrl to be passed to getUrlUnicode; error pages can be gzipped too

This commit is contained in:
j 2008-05-04 16:08:43 +02:00
parent 5e567665c4
commit 49b47f7a46

View file

@ -1,6 +1,8 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2 # vi:si:et:sw=2:sts=2:ts=2
# 2008 # 2008
import gzip
import StringIO
import os import os
import sha import sha
import time import time
@ -43,13 +45,16 @@ def getUrl(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
try: try:
url_headers, result = net.getUrl(url, data, headers, returnHeaders=True) url_headers, result = net.getUrl(url, data, headers, returnHeaders=True)
except urllib2.HTTPError, e: except urllib2.HTTPError, e:
e.headers['Status'] = "%s" % e.code
url_headers = dict(e.headers) url_headers = dict(e.headers)
result = e.read() result = e.read()
if url_headers.get('content-encoding', None) == 'gzip':
result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
saveUrlCache(url_cache_file, result, url_headers) saveUrlCache(url_cache_file, result, url_headers)
return result return result
def getUrlUnicode(url): def getUrlUnicode(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, _getUrl=getUrl):
data = getUrl(url) data = _getUrl(url, data, headers, timeout)
encoding = chardet.detect(data)['encoding'] encoding = chardet.detect(data)['encoding']
if not encoding: if not encoding:
encoding = 'latin-1' encoding = 'latin-1'