fix ox.cache.read_url

j 2012-08-17 22:20:35 +02:00
parent a4fd3c930f
commit 62f5e84642
3 changed files with 24 additions and 22 deletions

ox/cache.py

@@ -68,6 +68,11 @@ class InvalidResult(Exception):
         self.result = result
         self.headers = headers
 
+def _fix_unicode_url(url):
+    if isinstance(url, unicode):
+        url = url.encode('utf-8')
+    return url
+
 def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None, unicode=False):
     '''
     url - url to load
@@ -78,29 +83,27 @@ def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, val
     if this function fails, InvalidResult will be raised deal with it in your code
     '''
     #FIXME: send last-modified / etag from cache and only update if needed
-    if isinstance(url, unicode):
-        url = url.encode('utf-8')
-    data = store.get(url, data, headers, timeout)
-    if not data:
-        #print "get data", url
+    url = _fix_unicode_url(url)
+    result = store.get(url, data, headers, timeout)
+    if not result:
         try:
-            url_headers, data = net.read_url(url, data, headers, return_headers=True)
+            url_headers, result = net.read_url(url, data, headers, return_headers=True)
         except urllib2.HTTPError, e:
             e.headers['Status'] = "%s" % e.code
             url_headers = dict(e.headers)
-            data = e.read()
+            result = e.read()
             if url_headers.get('content-encoding', None) == 'gzip':
-                data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
-        if not valid or valid(data, url_headers):
-            store.set(url, data, data, url_headers)
+                result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
+        if not valid or valid(result, url_headers):
+            store.set(url, post_data=data, data=result, headers=url_headers)
         else:
-            raise InvalidResult(data, url_headers)
+            raise InvalidResult(result, url_headers)
     if unicode:
-        encoding = detect_encoding(data)
+        encoding = detect_encoding(result)
         if not encoding:
             encoding = 'latin-1'
-        data = data.decode(encoding)
-    return data
+        result = result.decode(encoding)
+    return result
 
 def save_url(url, filename, overwrite=False):
     if not os.path.exists(filename) or overwrite:
@@ -169,7 +172,6 @@ class SQLiteCache(Cache):
         r = None
         if timeout == 0:
             return r
-
         if data:
             url_hash = hashlib.sha1(url + '?' + data).hexdigest()
         else:
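Note: the rename from data to result matters here because store.get()'s return value used to be assigned to data, shadowing the data parameter (the POST body); by the time store.set(url, data, data, url_headers) ran, the response body was also stored as the post data. A minimal sketch of the shadowing bug (hypothetical stand-in names, not from this commit):

    # fetch() is a stand-in for net.read_url, for illustration only
    def fetch(url, post_data):
        return 'response for %s' % url

    def read_url_old(url, data=None):
        data = fetch(url, data)    # response overwrites the POST body
        return data, data          # post_data slot wrongly holds the response

    def read_url_new(url, data=None):
        result = fetch(url, data)  # POST body survives in `data`
        return data, result        # mirrors store.set(url, post_data=data, data=result, ...)

    print read_url_old('http://example.com', 'a=1')  # ('response for ...', 'response for ...')
    print read_url_new('http://example.com', 'a=1')  # ('a=1', 'response for ...')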

ox/net.py

@@ -52,19 +52,19 @@ def open_url(url, data=None, headers=DEFAULT_HEADERS):
 
 def read_url(url, data=None, headers=DEFAULT_HEADERS, return_headers=False, unicode=False):
     f = open_url(url, data, headers)
-    data = f.read()
+    result = f.read()
     f.close()
     if f.headers.get('content-encoding', None) == 'gzip':
-        data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
+        result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
     if unicode:
-        encoding = detect_encoding(data)
+        encoding = detect_encoding(result)
         if not encoding:
             encoding = 'latin-1'
-        data = data.decode(encoding)
+        result = result.decode(encoding)
     if return_headers:
         f.headers['Status'] = "%s" % f.code
-        return dict(f.headers), data
-    return data
+        return dict(f.headers), result
+    return result
 
 def detect_encoding(data):
     if 'content="text/html; charset=utf-8"' in data:
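In ox/net.py the change is the same rename for clarity: the function's own data parameter is no longer reused for the response body. A hedged usage sketch (illustrative URL; assumes ox is importable), based only on the signature shown above:

    from ox import net

    # plain body, decoded via detect_encoding when unicode=True
    body = net.read_url('http://www.example.com/', unicode=True)

    # with return_headers=True the function returns (dict of headers, body);
    # the 'Status' entry is filled in from the HTTP status code
    headers, body = net.read_url('http://www.example.com/', return_headers=True)
    print headers.get('Status'), len(body)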

ox/web/imdb.py

@@ -264,7 +264,7 @@ class Imdb(SiteParser):
     }
 
     def read_url(self, url, timeout):
-        return read_url(url, timeout, unicode=True)
+        return read_url(url, timeout=timeout, unicode=True)
 
     def __init__(self, id, timeout=-1):
         #use akas.imdb.com to always get original title:
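This one-line fix is the one the commit message refers to: ox.cache.read_url's positional order is (url, data, headers, timeout, ...), so the old call read_url(url, timeout, unicode=True) bound the timeout value to the data parameter, treating it as POST data and never reaching the cache timeout at all. Passing timeout=timeout binds it correctly. An illustrative call (hypothetical IMDb title id, not from the commit):

    from ox.cache import read_url

    url = 'http://akas.imdb.com/title/tt0133093/'
    # timeout must be passed by keyword, otherwise it would be taken as POST data
    html = read_url(url, timeout=-1, unicode=True)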