fix some tests and urls

2016-05-21 15:19:25 +02:00 · 2016-05-21 15:19:25 +02:00 · 7695a9c015
commit 7695a9c015
parent 5355dbf821
7 changed files with 60 additions and 238 deletions
--- a/ox/web/impawards.py
+++ b/ox/web/impawards.py
@@ -10,14 +10,14 @@ from ox.text import find_re

 def get_data(id):
    '''
-    >>> get_data('1991/silence_of_the_lambs')['imdbId']
-    u'0102926'
+    >>> str(get_data('1991/silence_of_the_lambs')['imdbId'])
+    '0102926'

-    >>> get_data('1991/silence_of_the_lambs')['posters'][0]
-    u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'
+    >>> str(get_data('1991/silence_of_the_lambs')['posters'][0])
+    'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'

-    >>> get_data('1991/silence_of_the_lambs')['url']
-    u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
+    >>> str(get_data('1991/silence_of_the_lambs')['url'])
+    'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
    '''
    data = {
        'url': get_url(id)
@@ -46,7 +46,6 @@ def get_data(id):
        else:
            poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img src="(posters.*?)"'))
        data['posters'].append(poster)
-
    return data

 def get_id(url):
@@ -60,24 +59,26 @@ def get_id(url):
    id = '%s/%s' % (year, '_'.join(split))
    return id

+
 def get_ids(page=None):
    ids = []
    if page:
-        html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout = -1, unicode=True)
+        html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout=-1, unicode=True)
        results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
        for result in results:
            url = 'http://impawards.com/%s' % result
            ids.append(get_id(url))
        return set(ids)
-    #get all
-    html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
+    # get all
+    html = read_url('http://www.impawards.com/archives/latest.html', timeout=60*60, unicode=True)
    pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
    for page in range(pages, 0, -1):
        for id in get_ids(page):
-            if not id in ids:
+            if id not in ids:
                ids.append(id)
    return ids

+
 def get_url(id):
    url = u"http://www.impawards.com/%s.html" % id
    html = read_url(url, unicode=True)