From bf6c512d3e782a9a78005fb4a8fc678b5fdf1bd4 Mon Sep 17 00:00:00 2001 From: j Date: Wed, 1 Aug 2018 11:13:13 +0200 Subject: [PATCH] read_url --- ox/web/impawards.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/ox/web/impawards.py b/ox/web/impawards.py index f11ca12..868e784 100644 --- a/ox/web/impawards.py +++ b/ox/web/impawards.py @@ -3,11 +3,19 @@ from __future__ import print_function import re -from ox.cache import read_url +import ox.cache from ox.html import strip_tags from ox.text import find_re +def read_url(url, timeout=ox.cache.DEFAULT_TIMEOUT): + data = ox.cache.read_url(url, timeout=timeout) + try: + data = data.decode('utf-8') + except UnicodeDecodeError: + data = data.decode('latin-1') + return data + def get_data(id): ''' >>> str(get_data('1991/silence_of_the_lambs')['imdbId']) @@ -22,7 +30,7 @@ def get_data(id): data = { 'url': get_url(id) } - html = read_url(data['url'], unicode=True) + html = read_url(data['url']) data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})') if not data['imdbId']: data['imdbId'] = _id_map.get(id, '') @@ -37,11 +45,11 @@ def get_data(id): for result in results: result = result.replace('_xlg.html', '.html') url = 'http://www.impawards.com/%s/%s' % (data['year'], result) - html = read_url(url, unicode=True) + html = read_url(url) result = find_re(html, '', re.DOTALL).findall(html) for result in results: url = 'http://impawards.com/%s' % result ids.append(get_id(url)) return set(ids) # get all - html = read_url('http://www.impawards.com/archives/latest.html', timeout=60*60, unicode=True) + html = read_url('http://www.impawards.com/archives/latest.html', timeout=60*60) pages = int(find_re(html, '')) + 1 for page in range(pages, 0, -1): for id in get_ids(page): @@ -81,7 +89,7 @@ def get_ids(page=None): def get_url(id): url = u"http://www.impawards.com/%s.html" % id - html = read_url(url, unicode=True) + html = read_url(url) if find_re(html, "No Movie Posters on This Page"): url = u"http://www.impawards.com/%s_ver1.html" % id return url