fix some tests and urls

This commit is contained in:
j 2016-05-21 15:19:25 +02:00
commit 7695a9c015
7 changed files with 60 additions and 238 deletions

View file

@ -10,14 +10,14 @@ from ox.text import find_re
def get_data(id):
'''
>>> get_data('1991/silence_of_the_lambs')['imdbId']
u'0102926'
>>> str(get_data('1991/silence_of_the_lambs')['imdbId'])
'0102926'
>>> get_data('1991/silence_of_the_lambs')['posters'][0]
u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'
>>> str(get_data('1991/silence_of_the_lambs')['posters'][0])
'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'
>>> get_data('1991/silence_of_the_lambs')['url']
u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
>>> str(get_data('1991/silence_of_the_lambs')['url'])
'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
'''
data = {
'url': get_url(id)
@ -46,7 +46,6 @@ def get_data(id):
else:
poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img src="(posters.*?)"'))
data['posters'].append(poster)
return data
def get_id(url):
@ -60,24 +59,26 @@ def get_id(url):
id = '%s/%s' % (year, '_'.join(split))
return id
def get_ids(page=None):
    """Collect impawards.com movie ids.

    If *page* is given, scrape that single archive page and return a
    ``set`` of ids; otherwise walk every archive page (newest first) and
    return a de-duplicated ``list`` of ids in encounter order.

    NOTE(review): the return type differs by branch (set vs list) — kept
    as-is because callers may rely on it.
    """
    ids = []
    if page:
        # timeout=-1: project read_url convention, presumably "use cache
        # forever" — confirm against read_url's implementation.
        html = read_url('http://www.impawards.com/archives/page%s.html' % page,
                        timeout=-1, unicode=True)
        # Raw string so the backslash escapes reach the regex engine untouched.
        results = re.compile(r'<a href = \.\./(.*?)>', re.DOTALL).findall(html)
        for result in results:
            page_url = 'http://impawards.com/%s' % result
            ids.append(get_id(page_url))
        return set(ids)
    # No page given: discover the page count, then fetch all pages.
    html = read_url('http://www.impawards.com/archives/latest.html',
                    timeout=60*60, unicode=True)
    pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
    # Oldest page number first (range counts down), recursing one page at a time.
    for page in range(pages, 0, -1):
        for movie_id in get_ids(page):
            if movie_id not in ids:
                ids.append(movie_id)
    return ids
def get_url(id):
url = u"http://www.impawards.com/%s.html" % id
html = read_url(url, unicode=True)