diff --git a/ox/cache.py b/ox/cache.py
index 3305177..37b5b93 100644
--- a/ox/cache.py
+++ b/ox/cache.py
@@ -68,6 +68,11 @@ class InvalidResult(Exception):
         self.result = result
         self.headers = headers
 
+def _fix_unicode_url(url):
+    if isinstance(url, unicode):
+        url = url.encode('utf-8')
+    return url
+
 def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None, unicode=False):
     '''
     url - url to load
@@ -78,29 +83,27 @@ def read_url(url, data=None, headers=DEFAULT_HEADERS, val
     if this function fails, InvalidResult will be raised deal with it in your code
     '''
     #FIXME: send last-modified / etag from cache and only update if needed
-    if isinstance(url, unicode):
-        url = url.encode('utf-8')
-    data = store.get(url, data, headers, timeout)
-    if not data:
-        #print "get data", url
+    url = _fix_unicode_url(url)
+    result = store.get(url, data, headers, timeout)
+    if not result:
         try:
-            url_headers, data = net.read_url(url, data, headers, return_headers=True)
+            url_headers, result = net.read_url(url, data, headers, return_headers=True)
         except urllib2.HTTPError, e:
             e.headers['Status'] = "%s" % e.code
             url_headers = dict(e.headers)
-            data = e.read()
+            result = e.read()
             if url_headers.get('content-encoding', None) == 'gzip':
-                data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
-        if not valid or valid(data, url_headers):
-            store.set(url, data, data, url_headers)
+                result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
+        if not valid or valid(result, url_headers):
+            store.set(url, post_data=data, data=result, headers=url_headers)
         else:
-            raise InvalidResult(data, url_headers)
+            raise InvalidResult(result, url_headers)
     if unicode:
-        encoding = detect_encoding(data)
+        encoding = detect_encoding(result)
         if not encoding:
             encoding = 'latin-1'
-        data = data.decode(encoding)
-    return data
+        result = result.decode(encoding)
+    return result
 
 def save_url(url, filename, overwrite=False):
     if not os.path.exists(filename) or overwrite:
@@ -169,7 +172,6 @@ class SQLiteCache(Cache):
         r = None
         if timeout == 0:
             return r
-
         if data:
             url_hash = hashlib.sha1(url + '?' + data).hexdigest()
         else:
diff --git a/ox/net.py b/ox/net.py
index 60d6394..390755a 100644
--- a/ox/net.py
+++ b/ox/net.py
@@ -52,19 +52,19 @@ def open_url(url, data=None, headers=DEFAULT_HEADERS):
 
 def read_url(url, data=None, headers=DEFAULT_HEADERS, return_headers=False, unicode=False):
     f = open_url(url, data, headers)
-    data = f.read()
+    result = f.read()
     f.close()
     if f.headers.get('content-encoding', None) == 'gzip':
-        data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
+        result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
     if unicode:
-        encoding = detect_encoding(data)
+        encoding = detect_encoding(result)
         if not encoding:
             encoding = 'latin-1'
-        data = data.decode(encoding)
+        result = result.decode(encoding)
     if return_headers:
         f.headers['Status'] = "%s" % f.code
-        return dict(f.headers), data
-    return data
+        return dict(f.headers), result
+    return result
 
 def detect_encoding(data):
     if 'content="text/html; charset=utf-8"' in data:
diff --git a/ox/web/imdb.py b/ox/web/imdb.py
index 0da40b4..0fc989d 100644
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -264,7 +264,7 @@ class Imdb(SiteParser):
     }
 
     def read_url(self, url, timeout):
-        return read_url(url, timeout, unicode=True)
+        return read_url(url, timeout=timeout, unicode=True)
 
     def __init__(self, id, timeout=-1):
         #use akas.imdb.com to always get original title:
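
The substantive fix in ox/cache.py is the data/result rename. Inside read_url, the data parameter carries the request (POST) body, but the old code reassigned that same name to the response, so by the time store.set(url, data, data, url_headers) ran, the request body was gone and the response was cached as both the post data and the payload. The new keyword call store.set(url, post_data=data, data=result, headers=url_headers) keeps the two apart. A minimal, self-contained sketch of the pattern in Python 2; fetch and store_set below are made-up stand-ins for net.read_url and store.set, not the actual ox APIs:

    stored = {}

    def fetch(url, data):
        # stand-in for net.read_url(): pretend this is the response body
        return 'response for %s' % url

    def store_set(url, post_data, data):
        # stand-in for store.set(): record what would be written to the cache
        stored[url] = (post_data, data)

    def read_url_old(url, data=None):
        data = fetch(url, data)      # response now shadows the request body
        store_set(url, data, data)   # the original request body is already gone
        return data

    def read_url_new(url, data=None):
        result = fetch(url, data)    # response kept apart from the request body
        store_set(url, post_data=data, data=result)
        return result

    read_url_old('http://x', data='q=1')
    print stored['http://x']   # ('response for http://x', 'response for http://x')
    read_url_new('http://x', data='q=1')
    print stored['http://x']   # ('q=1', 'response for http://x')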
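
The one-line ox/web/imdb.py change fixes argument binding against the cache signature read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None, unicode=False): the old call read_url(url, timeout, unicode=True) passed the timeout positionally into data, so it was treated as a request body while timeout silently kept its default. A stripped-down sketch of the binding; the stand-in body and the example value 3600 are illustrative only:

    def read_url(url, data=None, headers=None, timeout=-1, valid=None, unicode=False):
        # stand-in body that only exposes which parameter received which value
        return (data, timeout)

    print read_url('http://akas.imdb.com/', 3600, unicode=True)
    # (3600, -1)   -- old call: the timeout lands in `data`
    print read_url('http://akas.imdb.com/', timeout=3600, unicode=True)
    # (None, 3600) -- fixed call: the timeout reaches `timeout`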
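
The new _fix_unicode_url() helper factors the inline encoding out of read_url, so URLs are normalized to byte strings before they are hashed and stored. This is Python 2 code (urllib2, the except urllib2.HTTPError, e syntax), where unicode and str are distinct types. A quick demonstration with made-up URLs:

    def _fix_unicode_url(url):
        if isinstance(url, unicode):
            url = url.encode('utf-8')
        return url

    print repr(_fix_unicode_url(u'http://example.com/caf\xe9'))
    # 'http://example.com/caf\xc3\xa9'  -- unicode input encoded to UTF-8 bytes
    print repr(_fix_unicode_url('http://example.com/'))
    # 'http://example.com/'             -- byte strings pass through unchanged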