diff --git a/ox/web/impawards.py b/ox/web/impawards.py
index f11ca12..868e784 100644
--- a/ox/web/impawards.py
+++ b/ox/web/impawards.py
@@ -3,11 +3,19 @@
from __future__ import print_function
import re
-from ox.cache import read_url
+import ox.cache
from ox.html import strip_tags
from ox.text import find_re
+def read_url(url, timeout=ox.cache.DEFAULT_TIMEOUT):
+    '''Fetch url via ox.cache; decode as UTF-8, falling back to latin-1.'''
+    raw = ox.cache.read_url(url, timeout=timeout)
+    try:
+        return raw.decode('utf-8')
+    except UnicodeDecodeError:
+        return raw.decode('latin-1')
+
def get_data(id):
'''
>>> str(get_data('1991/silence_of_the_lambs')['imdbId'])
@@ -22,7 +30,7 @@ def get_data(id):
data = {
'url': get_url(id)
}
- html = read_url(data['url'], unicode=True)
+ html = read_url(data['url'])
data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})')
if not data['imdbId']:
data['imdbId'] = _id_map.get(id, '')
@@ -37,11 +45,11 @@ def get_data(id):
for result in results:
result = result.replace('_xlg.html', '.html')
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
- html = read_url(url, unicode=True)
+ html = read_url(url)
result = find_re(html, '', re.DOTALL).findall(html)
for result in results:
url = 'http://impawards.com/%s' % result
ids.append(get_id(url))
return set(ids)
# get all
- html = read_url('http://www.impawards.com/archives/latest.html', timeout=60*60, unicode=True)
+ html = read_url('http://www.impawards.com/archives/latest.html', timeout=60*60)
pages = int(find_re(html, '')) + 1
for page in range(pages, 0, -1):
for id in get_ids(page):
@@ -81,7 +89,7 @@ def get_ids(page=None):
def get_url(id):
url = u"http://www.impawards.com/%s.html" % id
- html = read_url(url, unicode=True)
+ html = read_url(url)
if find_re(html, "No Movie Posters on This Page"):
url = u"http://www.impawards.com/%s_ver1.html" % id
return url