read_url

2018-08-01 11:13:13 +02:00 · 2018-08-01 11:13:13 +02:00 · bf6c512d3e
commit bf6c512d3e
parent 7041d1b316
1 changed files with 15 additions and 7 deletions
--- a/ox/web/impawards.py
+++ b/ox/web/impawards.py
@@ -3,11 +3,19 @@
 from __future__ import print_function
 import re

-from ox.cache import read_url
+import ox.cache
 from ox.html import strip_tags
 from ox.text import find_re


+def read_url(url, timeout=ox.cache.DEFAULT_TIMEOUT):
+    # Fetch through ox.cache and return decoded text: UTF-8 first, latin-1 as fallback.
+    raw = ox.cache.read_url(url, timeout=timeout)
+    try:
+        return raw.decode('utf-8')
+    except UnicodeDecodeError:
+        return raw.decode('latin-1')
+
 def get_data(id):
    '''
    >>> str(get_data('1991/silence_of_the_lambs')['imdbId'])
@@ -22,7 +30,7 @@ def get_data(id):
    data = {
        'url': get_url(id)
    }
-    html = read_url(data['url'], unicode=True)
+    html = read_url(data['url'])
    data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})')
    if not data['imdbId']:
        data['imdbId'] = _id_map.get(id, '')
@@ -37,11 +45,11 @@ def get_data(id):
    for result in results:
        result = result.replace('_xlg.html', '.html')
        url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
-        html = read_url(url, unicode=True)
+        html = read_url(url)
        result = find_re(html, '<a href = (\w*?_xlg.html)')
        if result:
            url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
-            html = read_url(url, unicode=True)
+            html = read_url(url)
            poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img SRC="(.*?)"'))
        else:
            poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img src="(posters.*?)"'))
@@ -63,14 +71,14 @@ def get_id(url):
 def get_ids(page=None):
    ids = []
    if page:
-        html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout=-1, unicode=True)
+        html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout=-1)
        results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
        for result in results:
            url = 'http://impawards.com/%s' % result
            ids.append(get_id(url))
        return set(ids)
    # get all
-    html = read_url('http://www.impawards.com/archives/latest.html', timeout=60*60, unicode=True)
+    html = read_url('http://www.impawards.com/archives/latest.html', timeout=60*60)
    pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
    for page in range(pages, 0, -1):
        for id in get_ids(page):
@@ -81,7 +89,7 @@ def get_ids(page=None):

 def get_url(id):
    url = u"http://www.impawards.com/%s.html" % id
-    html = read_url(url, unicode=True)
+    html = read_url(url)
    if find_re(html, "No Movie Posters on This Page"):
        url = u"http://www.impawards.com/%s_ver1.html" % id
    return url