fix ox.cache.read_url

commit 62f5e84642
parent a4fd3c930f

3 changed files with 24 additions and 22 deletions

ox/cache.py (32 changes)
@@ -68,6 +68,11 @@ class InvalidResult(Exception):
         self.result = result
         self.headers = headers
 
+def _fix_unicode_url(url):
+    if isinstance(url, unicode):
+        url = url.encode('utf-8')
+    return url
+
 def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None, unicode=False):
     '''
         url - url to load
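The new helper factors the unicode handling out of read_url. A minimal sketch of what it does under Python 2, where unicode and byte strings are distinct types (the sample URLs are illustrative):

    # Python 2: byte strings pass through, unicode objects are encoded to UTF-8
    print _fix_unicode_url('http://example.com/')           # unchanged byte string
    print _fix_unicode_url(u'http://example.com/caf\xe9')   # -> 'http://example.com/caf\xc3\xa9'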
@@ -78,29 +83,27 @@ def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None, unicode=False):
         if this function fails, InvalidResult will be raised deal with it in your code
     '''
     #FIXME: send last-modified / etag from cache and only update if needed
-    if isinstance(url, unicode):
-        url = url.encode('utf-8')
-    data = store.get(url, data, headers, timeout)
-    if not data:
-        #print "get data", url
+    url = _fix_unicode_url(url)
+    result = store.get(url, data, headers, timeout)
+    if not result:
         try:
-            url_headers, data = net.read_url(url, data, headers, return_headers=True)
+            url_headers, result = net.read_url(url, data, headers, return_headers=True)
         except urllib2.HTTPError, e:
             e.headers['Status'] = "%s" % e.code
             url_headers = dict(e.headers)
-            data = e.read()
+            result = e.read()
             if url_headers.get('content-encoding', None) == 'gzip':
-                data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
-        if not valid or valid(data, url_headers):
-            store.set(url, data, data, url_headers)
+                result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
+        if not valid or valid(result, url_headers):
+            store.set(url, post_data=data, data=result, headers=url_headers)
         else:
-            raise InvalidResult(data, url_headers)
+            raise InvalidResult(result, url_headers)
     if unicode:
-        encoding = detect_encoding(data)
+        encoding = detect_encoding(result)
         if not encoding:
             encoding = 'latin-1'
-        data = data.decode(encoding)
-    return data
+        result = result.decode(encoding)
+    return result
 
 def save_url(url, filename, overwrite=False):
     if not os.path.exists(filename) or overwrite:
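This hunk is the substance of the fix: the old code reused the data parameter (the POST payload, if any) to hold the HTTP response, so by the time store.set(url, data, data, url_headers) ran, the original request payload was gone and the response body was cached as both the post data and the result. Renaming the response to result keeps the two apart, and the keyword arguments in the new store.set call make the intent explicit. An illustrative sketch of the failure mode (store_set is a stand-in for store.set, not the real cache API):

    def store_set(url, post_data=None, data=None):    # stub for illustration only
        print 'cached %s post_data=%r data=%r' % (url, post_data, data)

    def fetch_old(url, data=None):
        data = '<html>response</html>'                # response clobbers the POST payload
        store_set(url, data, data)                    # response cached as the post data too

    def fetch_fixed(url, data=None):
        result = '<html>response</html>'              # response kept in its own variable
        store_set(url, post_data=data, data=result)   # POST payload preserved

    fetch_old('http://example.com', data='q=1')       # post_data is wrong
    fetch_fixed('http://example.com', data='q=1')     # post_data='q=1' preserved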
@@ -169,7 +172,6 @@ class SQLiteCache(Cache):
         r = None
         if timeout == 0:
             return r
-
         if data:
            url_hash = hashlib.sha1(url + '?' + data).hexdigest()
         else:
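For reference, the context around this hunk shows how cache keys are built: when POST data is present, the SHA-1 of url + '?' + data is used; the else branch, cut off in this hunk, presumably hashes the URL alone. A quick illustration with made-up values:

    import hashlib
    url, data = 'http://example.com/api', 'q=test'
    print hashlib.sha1(url + '?' + data).hexdigest()   # key when POST data is present
    print hashlib.sha1(url).hexdigest()                # assumed key for a plain GET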
ox/net.py (12 changes)
@@ -52,19 +52,19 @@ def open_url(url, data=None, headers=DEFAULT_HEADERS):
 
 def read_url(url, data=None, headers=DEFAULT_HEADERS, return_headers=False, unicode=False):
     f = open_url(url, data, headers)
-    data = f.read()
+    result = f.read()
     f.close()
     if f.headers.get('content-encoding', None) == 'gzip':
-        data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
+        result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
     if unicode:
-        encoding = detect_encoding(data)
+        encoding = detect_encoding(result)
         if not encoding:
             encoding = 'latin-1'
-        data = data.decode(encoding)
+        result = result.decode(encoding)
     if return_headers:
         f.headers['Status'] = "%s" % f.code
-        return dict(f.headers), data
-    return data
+        return dict(f.headers), result
+    return result
 
 def detect_encoding(data):
     if 'content="text/html; charset=utf-8"' in data:
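The same rename lands in ox/net.py, so the response body never shadows the data argument there either. With return_headers=True the caller gets back a (headers, body) pair, with a synthesized Status entry. A hedged usage sketch (the URL is a placeholder; header keys are typically lowercased by the underlying rfc822 message):

    import ox.net
    headers, body = ox.net.read_url('http://example.com/', return_headers=True)
    print headers.get('status'), len(body)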
@@ -264,7 +264,7 @@ class Imdb(SiteParser):
     }
 
     def read_url(self, url, timeout):
-        return read_url(url, timeout, unicode=True)
+        return read_url(url, timeout=timeout, unicode=True)
 
     def __init__(self, id, timeout=-1):
         #use akas.imdb.com to always get original title:
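This call site is what the commit title points at: ox.cache.read_url takes (url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None, unicode=False), so the old call read_url(url, timeout, unicode=True) bound the timeout to the data parameter, turning the request into a POST and leaving the cache timeout at its default. Passing timeout=timeout routes the value to the right parameter. A minimal sketch of the pitfall (simplified signature, illustrative values):

    def read_url(url, data=None, timeout=-1):         # simplified stand-in signature
        print 'data=%r timeout=%r' % (data, timeout)

    read_url('http://example.com', 3600)              # data=3600, timeout=-1  -- the bug
    read_url('http://example.com', timeout=3600)      # data=None, timeout=3600 -- the fix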