diff --git a/ox/impawards.py b/ox/impawards.py index 2f0d79e..4f65569 100644 --- a/ox/impawards.py +++ b/ox/impawards.py @@ -2,6 +2,7 @@ import re import ox.imdb as imdb from oxutils.cache import getUrlUnicode +from oxutils.html import stripTags from oxutils.text import findRe @@ -38,7 +39,7 @@ def parseArchivePage(html): def parseMoviePage(html): data = {} data['imdbId'] = findRe(html, 'imdb.com/title/tt(.*?) ') - data['title'] = findRe(html, '(.*?) \(') + data['title'] = stripTags(findRe(html, '(.*?) \(')) data['year'] = findRe(html, '\((.*?)\)') result = findRe(html, '') if result: @@ -55,5 +56,31 @@ def parsePosterPage(html, year): data['posterUrl'] = 'http://impawards.com/%s/%s' % (year, findRe(html, '')) + for page in range(pages + 1, 0, -1): + if page <= pages: + html = getUrlUnicode('http://impawards.com/archives/page%s.html' % page, timeout = -1) + urls = parseArchivePage(html) + print urls + for url in urls: + html = getUrlUnicode(url) + data = parseMoviePage(html) + dirname = '%s/%s/%s' % (pathname, data['imdbId'][:4], data['imdbId']) + filename = '%s/%s' % (dirname, os.path.split(data['posterUrl'])[1]) + if not os.path.exists(filename): + jpg = getUrl(data['posterUrl']) + if not os.path.exists(dirname): + os.makedirs(dirname) + f = open(filename, 'w') + f.write(jpg) + f.close() + + if __name__ == '__main__': + archivePosters() getMovieData('Brick', 'Rian Johnson') \ No newline at end of file