# Scraper helpers for impawards.com poster pages (Python 2 syntax: bare
# `print` statements).
#
# NOTE(review): this text is whitespace-collapsed -- every statement of the
# module sits on two physical lines -- so it does not parse as Python as-is
# and the nesting described below is inferred from statement order only.
#
# Units visible in the text, in order:
#   * imports: re, ox.imdb (as imdb), oxutils.cache.getUrlUnicode,
#     oxutils.html.stripTags, oxutils.text.findRe.
#   * getMovieData(title='', director='', imdbId=''):
#       - when imdbId is empty it is looked up with
#         imdb.getMovieId(title, director);
#       - fetches http://impawards.com/archives/latest.html, reads a page
#         count from it with findRe, then iterates page numbers from
#         pages + 1 down to 1; only numbers <= pages trigger a fetch of
#         archives/page<N>.html, so the first pass presumably reuses the
#         html of latest.html;
#       - every URL returned by parseArchivePage is fetched and parsed with
#         parseMoviePage; d['posterUrl'] is collected when d['imdbId']
#         equals imdbId;
#       - returns {'posterUrls': [...]} after sorting the list in place.
#   * parseArchivePage(html): returns a list with 'http://impawards.com/'
#     prepended to every match of its regex in html.
#   * parseMoviePage(html): sets data['imdbId'] and data['title']; the
#     remainder of its body (including whatever sets 'posterUrl', which the
#     callers read) is not visible here.
#   * an archiving loop whose `def` line is not visible (the main guard
#     calls archivePosters()): for each movie page it builds
#     '<pathname>/<first 4 chars of imdbId>/<imdbId>/<basename of posterUrl>'
#     and, when that file does not exist, downloads the poster with getUrl,
#     creates the directory if needed and writes the data to the file.
#     `pathname`, `pages`, `os` and `getUrl` are not bound anywhere in the
#     visible text.
#   * main guard: calls archivePosters() and then
#     getMovieData('Brick', 'Rian Johnson').
#
# NOTE(review): the patterns passed to findRe(html, '') and
# re.compile('', re.DOTALL) are empty strings, and the string literal in
# parseMoviePage is split across the line break. Presumably markup-like
# pattern text (and the code between parseMoviePage and the archiving loop)
# was stripped during extraction -- restore from the upstream file before
# relying on this module. As written, int(findRe(html, '')) cannot yield a
# meaningful page count.
# NOTE(review): open(filename, 'w') writes downloaded image data through a
# text-mode handle and closes it manually; 'wb' inside a `with` block is
# probably what is wanted -- confirm once the file is restored.
# NOTE(review): the trailing ' |' at the very end is not Python and looks
# like extraction residue.
import re import ox.imdb as imdb from oxutils.cache import getUrlUnicode from oxutils.html import stripTags from oxutils.text import findRe def getMovieData(title = '', director = '', imdbId = ''): data = {'posterUrls': []} if not imdbId: imdbId = imdb.getMovieId(title, director) print imdbId html = getUrlUnicode('http://impawards.com/archives/latest.html', timeout = 0) pages = int(findRe(html, '')) for page in range(pages + 1, 0, -1): print page if page <= pages: html = getUrlUnicode('http://impawards.com/archives/page%s.html' % page, timeout = -1) urls = parseArchivePage(html) print urls for url in urls: html = getUrlUnicode(url) d = parseMoviePage(html) print d if d['imdbId'] == imdbId: data['posterUrls'].append(d['posterUrl']) print d['posterUrl'] data['posterUrls'].sort() return data def parseArchivePage(html): urls = [] results = re.compile('', re.DOTALL).findall(html) for result in results: urls.append('http://impawards.com/%s' % result) return urls def parseMoviePage(html): data = {} data['imdbId'] = findRe(html, 'imdb.com/title/tt(.*?) ') data['title'] = stripTags(findRe(html, '
')) for page in range(pages + 1, 0, -1): if page <= pages: html = getUrlUnicode('http://impawards.com/archives/page%s.html' % page, timeout = -1) urls = parseArchivePage(html) print urls for url in urls: html = getUrlUnicode(url) data = parseMoviePage(html) dirname = '%s/%s/%s' % (pathname, data['imdbId'][:4], data['imdbId']) filename = '%s/%s' % (dirname, os.path.split(data['posterUrl'])[1]) if not os.path.exists(filename): jpg = getUrl(data['posterUrl']) if not os.path.exists(dirname): os.makedirs(dirname) f = open(filename, 'w') f.write(jpg) f.close() if __name__ == '__main__': archivePosters() getMovieData('Brick', 'Rian Johnson') |