310 lines
11 KiB
Python
310 lines
11 KiB
Python
# vi:si:et:sw=4:sts=4:ts=4
|
|
# encoding: utf-8
|
|
from __future__ import print_function
|
|
import re
|
|
|
|
import ox.cache
|
|
from ox.html import strip_tags
|
|
from ox.text import find_re
|
|
|
|
|
|
def read_url(url, timeout=ox.cache.cache_timeout):
    '''
    Fetch url through ox.cache and return the response body as text.

    The raw payload is decoded as UTF-8; if that raises
    UnicodeDecodeError it is decoded as Latin-1 instead.
    '''
    raw = ox.cache.read_url(url, timeout=timeout)
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        return raw.decode('latin-1')
|
|
|
|
def get_data(id):
    '''
    Scrape impawards.com for the movie `id` ('year/name') and return a dict
    with the keys 'url', 'imdbId', 'title', 'year' and 'posters'.

    'imdbId' is taken from the IMDb link on the page; when the page has
    none, the hand-maintained _id_map is consulted ('' if unknown there).
    'posters' is a list of absolute image URLs.

    >>> str(get_data('1991/silence_of_the_lambs')['imdbId'])
    '0102926'

    >>> str(get_data('1991/silence_of_the_lambs')['posters'][0])
    'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'

    >>> str(get_data('1991/silence_of_the_lambs')['url'])
    'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
    '''
    data = {
        'url': get_url(id)
    }
    html = read_url(data['url'])
    # Accept 8-digit ids too: with a fixed \d{7} an 8-digit IMDb id would be
    # silently truncated to its first seven digits.
    data['imdbId'] = find_re(html, r'imdb.com/title/tt(\d{7,8})')
    if not data['imdbId']:
        data['imdbId'] = _id_map.get(id, '')
    data['title'] = strip_tags(find_re(html, r'<p class="name white">(.*?) \(<a href="alpha1.html">'))
    data['year'] = find_re(html, r'\(<a href="alpha1.html">(.*?)</a>\)')
    data['posters'] = []
    poster = find_re(html, r'<img src="(posters.*?)"')
    if poster:
        poster = 'http://www.impawards.com/%s/%s' % (data['year'], poster)
        data['posters'].append(poster)
    # id[5:] drops the leading 'YYYY/'. The name is escaped so that regex
    # metacharacters in an id are matched literally.
    link_re = re.compile('<a href = (%s.*?html)' % re.escape(id[5:]), re.DOTALL)
    for link in link_re.findall(html):
        link = link.replace('_xlg.html', '.html')
        page = read_url('http://www.impawards.com/%s/%s' % (data['year'], link))
        # Prefer the extra-large variant when the version page links to one.
        xlg = find_re(page, r'<a href = (\w*?_xlg.html)')
        if xlg:
            page = read_url('http://www.impawards.com/%s/%s' % (data['year'], xlg))
            image = find_re(page, '<img SRC="(.*?)"')
        else:
            image = find_re(page, '<img src="(posters.*?)"')
        # Skip pages without an image instead of recording a URL that ends
        # in a bare '/'.
        if image:
            data['posters'].append('http://www.impawards.com/%s/%s' % (data['year'], image))
    return data
|
|
|
|
def get_id(url):
    '''
    Derive the 'year/name' id from an impawards.com poster page URL.

    The year is the first path segment and the name is the second one with
    its '.html' extension, a trailing '_xlg' marker and a trailing version
    token (matching 'ver<digits>') removed.

    >>> get_id('http://www.impawards.com/1991/silence_of_the_lambs_ver1_xlg.html')
    '1991/silence_of_the_lambs'

    Raises IndexError when the URL has fewer than two path segments.
    '''
    parts = url.split('/')
    year = parts[3]
    name = parts[4]
    # Only strip a real '.html' extension rather than blindly chopping the
    # last five characters.
    if name.endswith('.html'):
        name = name[:-5]
    tokens = name.split('_')
    # The len() guards keep at least one token, so a name consisting solely
    # of 'xlg' or 'verN' neither raises IndexError nor yields an empty name.
    if len(tokens) > 1 and tokens[-1] == 'xlg':
        tokens.pop()
    if len(tokens) > 1 and re.search(r'ver\d+$', tokens[-1]):
        tokens.pop()
    return '%s/%s' % (year, '_'.join(tokens))
|
|
|
|
|
|
def get_ids(page=None):
    '''
    Collect movie ids from the impawards.com archive.

    With a truthy `page`, return the set of ids linked from that archive
    page. Without one, read the page count from archives/latest.html, walk
    every archive page from the highest number down to 1 and return a list
    of unique ids in first-seen order.

    Raises ValueError if the page count cannot be parsed from latest.html.
    '''
    if page:
        # NOTE(review): timeout=-1 presumably tells ox.cache never to expire
        # archive pages -- confirm against ox.cache.read_url.
        html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout=-1)
        links = re.compile(r'<a href = \.\./(.*?)>', re.DOTALL).findall(html)
        return set(get_id('http://impawards.com/%s' % link) for link in links)
    # get all
    html = read_url('http://www.impawards.com/archives/latest.html', timeout=60*60)
    pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
    ids = []
    # A companion set keeps the duplicate check O(1); testing membership on
    # the growing list made the full crawl quadratic.
    seen = set()
    for number in range(pages, 0, -1):
        for movie_id in get_ids(number):
            if movie_id not in seen:
                seen.add(movie_id)
                ids.append(movie_id)
    return ids
|
|
|
|
|
|
def get_url(id):
    '''
    Return the poster page URL for the movie `id` ('year/name').

    The plain '<id>.html' page is fetched first; when it contains the text
    'No Movie Posters on This Page' the '<id>_ver1.html' URL is returned
    instead.
    '''
    plain = u"http://www.impawards.com/%s.html" % id
    page = read_url(plain)
    if not find_re(page, "No Movie Posters on This Page"):
        return plain
    return u"http://www.impawards.com/%s_ver1.html" % id
|
|
|
|
# Hand-maintained fallback: impawards id ('year/name') -> 7-digit IMDb id.
# get_data() consults it when a poster page carries no IMDb link.
# NOTE(review): several keys end in '_ver1', a suffix get_id() strips --
# confirm those entries are still reachable.
_id_map = {
    '1933/forty_second_street': '0024034',
    '1933/tarzan_the_fearless': '0024645',
    '1935/informer': '0026529',
    # Previously duplicated the id of '1935/informer'.
    '1935/thirty_nine_steps': '0026029',
    '1935/top_hat': '0027125',
    '1938/charlie_chaplin_cavalcade': '0284687',
    # Leading zero restored; the value used to be six digits long.
    '1943/falcon_and_the_co-eds': '0035855',
    '1969/angel_angel_down_we_go': '0065602',
    '1970/crimson_altar': '0062833',
    '1975/man_who_would_be_king_ver1': '0073341',
    '1975/picnic_at_hanging_rock_ver1': '0073540',
    '1979/electric_horseman_ver1': '0079100',
    '1980/caligula_ver1': '0080491',
    '1980/hollywood_knights_ver1': '0080881',
    '1981/history_of_the_world_part_i': '0082517',
    '1981/sea_wolves': '0081470',
    '1983/krull_ver1': '0085811',
    '1985/warriors_of_the_wind': '0087544',
    '1989/friday_the_thirteenth_part_viii_ver1': '0097388',
    '1989/high_hopes': '0095302',
    '1989/millenium': '0097883',
    '1989/story_of_women': '0096336',
    '1990/edward_scissorhands_ver1': '0099487',
    '1991/freddys_dead_ver1': '0101917',
    '1993/robocop_three_ver1': '0107978',
    '1993/waynes_world_two_ver1': '0108525',
    '1994/above_the_rim_ver1': '0109035',
    '1994/helas_pour_moi': '0107175',
    '1994/house_of_the_spirits_ver1': '0107151',
    '1994/i_dont_want_to_talk_about_it': '0106678',
    '1994/in_custody': '0107199',
    '1994/ladybird_ladybird': '0110296',
    '1994/leon_the_pig_farmer': '0104710',
    '1994/love_after_love': '0103710',
    '1994/l_six_two_seven': '0104658',
    '1994/martin_lawrence_you_so_crazy_ver1': '0111804',
    '1994/savage_nights': '0105032',
    '1994/sex_drugs_and_democracy': '0111135',
    '1995/bye_bye_love': '0112606',
    '1995/cold_comfort_farm': '0112701',
    '1995/gumby_the_movie': '0113234',
    '1995/les_miserables': '0113828',
    '1995/mystery_of_rampo': '0110943',
    '1995/pharaohs_army': '0114122',
    '1995/pure_formality': '0110917',
    '1995/quick_and_the_dead_ver1': '0114214',
    '1995/reflections_in_the_dark': '0110956',
    '1995/safe_ver1': '0114323',
    '1995/search_and_destroy': '0114371',
    '1995/secret_of_roan_inish_ver1': '0111112',
    '1995/underneath': '0114788',
    '1996/ghost_in_the_shell': '0113568',
    '1996/hate': '0113247',
    '1996/horseman_on_the_roof': '0113362',
    '1996/kids_in_the_hall_brain_candy': '0116768',
    '1996/maybe_maybe_not': '0109255',
    '1996/prisoner_of_the_mountains': '0116754',
    '1997/fifth_element_ver1': '0119116',
    '1997/fools_rush_in_ver1': '0119141',
    '1997/gi_jane_ver1': '0119173',
    '1997/happy_together_ver1': '0118845',
    '1997/lilies': '0116882',
    '1997/mouth_to_mouth': '0112546',
    '1997/mr_nice_guy': '0117786',
    '1997/nenette_and_boni': '0117221',
    '1997/paperback_romance': '0110405',
    '1997/second_jungle_book': '0120087',
    '1997/single_girl': '0113057',
    '1997/super_speedway': '0120245',
    '1997/temptress_moon': '0116295',
    '1998/alarmist': '0119534',
    '1998/barneys_great_adventure_the_movie': '0120598',
    '1998/bulworth_ver1': '0118798',
    '1998/celebration': '0154420',
    '1998/east_palace_west_palace': '0119007',
    '1998/hurricane_streets': '0119338',
    '1998/i_married_a_strange_person': '0119346',
    '1998/inheritors': '0141824',
    '1998/killing_time': '0140312',
    '1998/live_flesh': '0118819',
    '1998/music_from_another_room': '0119734',
    '1998/post_coitum_ver1': '0119923',
    '1998/steam_the_turkish_bath': '0119248',
    '1998/velocity_of_gary': '0120878',
    '1999/after_life': '0165078',
    '1999/emperor_and_the_assassin': '0162866',
    '1999/fantasia_two_thousand': '0120910',
    '1999/get_bruce': '0184510',
    '1999/god_said_ha': '0119207',
    '1999/jawbreaker': '0155776',
    '1999/jeanne_and_the_perfect_guy': '0123923',
    '1999/king_and_i': '0160429',
    '1999/lovers_of_the_arctic_circle': '0133363',
    '1999/plunkett_and_macleane': '0134033',
    '1999/pokemon_the_first_movie': '0190641',
    '1999/school_of_flesh': '0157208',
    '1999/splendor': '0127296',
    '1999/stranger_in_the_kingdom': '0126680',
    '1999/train_of_life': '0170705',
    '1999/twice_upon_a_yesterday': '0138590',
    '1999/whiteboys': '0178988',
    '1999/wildfire': '0194544',
    '1999/windhorse': '0169388',
    '2000/claim': '0218378',
    '2000/color_of_paradise': '0191043',
    '2000/criminal_lovers': '0205735',
    '2000/everlasting_piece': '0218182',
    '2000/girl_on_the_bridge_ver1': '0144201',
    '2000/godzilla_two_thousand': '0188640',
    '2000/goya_in_bordeaux': '0210717',
    '2000/mad_about_mambo': '0156757',
    '2000/picking_up_the_pieces': '0192455',
    '2000/pokemon_the_movie_2000': '0257001',
    '2000/seven_days_to_live': '0221928',
    '2000/south_of_heaven_west_of_hell': '0179473',
    '2000/suzhou_river': '0234837',
    '2000/time_for_drunken_horses': '0259072',
    '2000/venus_beauty_institute': '0174330',
    '2001/circle': '0368646',
    '2001/devils_backbone': '0256009',
    '2001/kill_me_later': '0243595',
    '2001/king_is_dancing': '0244173',
    '2001/learning_curve': '0219126',
    '2001/marco_polo__return_to_xanadu_ver1': '0296074',
    '2001/me_you_them': '0244504',
    '2001/our_lady_of_the_assassins': '0250809',
    '2001/pinero': '0261066',
    '2001/pokemon_three_the_movie_ver1': '0266860',
    '2001/scratch': '0143861',
    '2001/vampire_hunter_d_bloodlust_ver1': '0216651',
    '2002/el_bosque_animado': '0310790',
    '2002/fifty_first_state': '0227984',
    '2002/les_destinees': '0216689',
    '2002/sons_room': '0208990',
    '2003/open_hearts': '0315543',
    '2003/tulse_luper_suitcases': '0307596',
    '2003/valentin': '0296915',
    '2004/if_only_ver1': '0332136',
    '2004/wondrous_oblivion': '0334725',
    '2005/wu_ji': '0417976',
    '2006/golden_door': '0465188',
    '2006/kin': '1091189',
    '2007/revenge_of_the_nerds': '0088000',
    '2008/bad_batch': '1605644',
    '2008/mercedes': '1368083',
    '2008/spirit': '0831887',
    '2009/dead_air': '0993841',
    '2009/edge_of_love': '0819714',
    '2009/fuel': '1072437',
    '2009/one_good_man': '1239357',
    '2009/st_trinians': '1210106',
    '2009/surveillance': '0409345',
    '2009/taken': '0936501',
    '2009/vaml': '1610453',
    '2010/adopting_haiti': '1764164',
    '2010/afterlife': '0838247',
    '2010/agora': '1186830',
    '2010/athlete': '1356996',
    '2010/beneath_the_blue': '1222698',
    '2010/bitch_slap': '1212974',
    '2010/black_waters_of_echos_pond': '0960066',
    '2010/case_thirty_nine': '0795351',
    '2010/finite_and_infinite_games': '1772268',
    '2010/hole': '1085779',
    '2010/jolene': '0867334',
    '2010/lake_mungo': '0816556',
    '2010/last_day_of_summer': '1242544',
    '2010/leaves_of_grass': '1151359',
    '2010/life_of_lemon': '1466057',
    '2010/man_in_the_maze': '1721692',
    '2010/mr_immortality_the_life_and_times_of_twista': '1711017',
    '2010/paper_man': '0437405',
    '2010/perfect_game': '0473102',
    '2010/red_baron': '0365675',
    '2010/satin': '0433397',
    '2010/shutter_island': '1130884',
    '2010/strange_powers': '1534075',
    '2010/suicidegirls_must_die': '1584733',
    '2010/veronika_decides_to_die': '1068678',
    '2010/witchblade': '0494292',
    '2010/youth_in_revolt': '0403702',
    '2011/beastly': '1152398',
    '2011/burning_palms': '1283887',
    '2011/cabin_in_the_woods': '1259521',
    '2011/conan': '0816462',
    '2011/courageous': '1630036',
    '2011/cruces_divided_two': '1698645',
    '2011/green_with_envy': '1204342',
    '2011/happythankyoumoreplease': '1481572',
    '2011/homework': '1645080',
    '2011/i_got_next': '1915570',
    '2011/lebanon_pa': '1290082',
    '2011/money_pet': '1965198',
    '2011/my_suicide': '0492896',
    '2011/priest': '0822847',
    '2011/prowl': '1559033',
    '2011/red_sonja': '0800175',
    '2011/season_of_the_witch': '0479997',
    '2011/stay_cool': '1235807',
    '2011/sympathy_for_delicious': '1270277',
    '2011/trust': '1529572',
    '2011/undefeated': '1961604',
    '2011/vanishing_on_seventh_street': '1452628',
    '2011/where_is_robert_fisher': '2042712',
    '2011/yellowbrickroad': '1398428',
    '2012/haywire': '1506999',
    '2012/last_call_at_the_oasis': '2043900',
}
|
|
|
|
if __name__ == '__main__':
    # Run as a script: crawl the whole archive, then print the sorted ids
    # followed by how many were found.
    all_ids = get_ids()
    print(sorted(all_ids), len(all_ids))
|