# vi:si:et:sw=4:sts=4:ts=4 # encoding: utf-8 import re from ox.cache import readUrlUnicode from ox.html import stripTags from ox.text import findRe import imdb def getData(id): ''' >>> getData('1991/silence_of_the_lambs')['imdbId'] u'0102926' >>> getData('1991/silence_of_the_lambs')['posters'][0] u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg' >>> getData('1991/silence_of_the_lambs')['url'] u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html' ''' data = { 'url': getUrl(id) } html = readUrlUnicode(data['url']) data['imdbId'] = findRe(html, 'imdb.com/title/tt(\d{7})') data['title'] = stripTags(findRe(html, '
(.*?) \('))
data['year'] = findRe(html, '\((.*?)\)')
data['posters'] = []
poster = findRe(html, '')) + 1
for page in range(pages, 0, -1):
for id in getIdsByPage(page):
if not id in ids:
ids.append(id)
return ids
def getIdsByPage(page):
    """Return the set of poster ids found on one archive page.

    Fetches ``http://www.impawards.com/archives/page<page>.html``, runs the
    link pattern over the returned HTML, turns every match into an
    ``http://impawards.com/...`` URL and maps that URL through ``getId``.

    page -- value interpolated into the archive page URL with ``%s``.

    Duplicate ids collapse because the result is a ``set``.
    """
    archive_url = 'http://www.impawards.com/archives/page%s.html' % page
    # timeout=-1 is handed straight to readUrlUnicode; its exact meaning is
    # defined by ox.cache (presumably "never expire" -- TODO confirm).
    archive_html = readUrlUnicode(archive_url, timeout=-1)
    # NOTE(review): the pattern is an empty string as it stands here, so
    # findall() yields only empty matches.  It looks like the original
    # expression was lost -- confirm against the upstream source before
    # relying on this function.
    link_pattern = re.compile('', re.DOTALL)
    return set(
        getId('http://impawards.com/%s' % link)
        for link in link_pattern.findall(archive_html)
    )
def getUrl(id):
    """Return the impawards.com page URL to use for poster id *id*.

    The plain ``<id>.html`` page is fetched first.  If ``findRe`` reports a
    match for the text "No Movie Posters on This Page" in that page, the
    ``<id>_ver1.html`` URL is returned instead; otherwise the plain URL is
    returned.

    id -- value interpolated into the URL with ``%s`` (the doctests in this
          file use strings such as '1991/silence_of_the_lambs').
    """
    plain_url = u"http://www.impawards.com/%s.html" % id
    page_html = readUrlUnicode(plain_url)
    placeholder = findRe(page_html, "No Movie Posters on This Page")
    if not placeholder:
        # The plain page does not carry the "no posters" marker: use it.
        return plain_url
    # Marker found: fall back to the "_ver1" variant of the page.
    return u"http://www.impawards.com/%s_ver1.html" % id
if __name__ == '__main__':
    # Script entry point: gather every poster id via getIds() and print the
    # sorted ids followed by how many there are, separated by one space.
    ids = getIds()
    # A single pre-formatted string produces the same text as the statement
    # form "print a, b" and also parses where print is a function.
    print('%s %s' % (sorted(ids), len(ids)))