# NOTE(review): the line below is a whitespace-collapsed, truncated fragment of
# a Python 2 script (it uses print statements). It begins in the middle of an
# expression, and the original indentation is lost, so the statement nesting
# described here is a best reading -- confirm against an intact copy.
#
# Visible pieces, in order:
#   * Tail of a page-parsing routine: sets data['year'] from findRe(html, ...)
#     and assigns `result` from another findRe call whose pattern is empty in
#     this text (presumably stripped during extraction -- TODO confirm).
#   * A download loop over range(pages + 1, 0, -1). It prints progress, fetches
#     'http://impawards.com/archives/page%s.html' when page <= pages, and parses
#     each URL returned by parseArchivePage() with parseMoviePage(). The poster
#     is saved as <pathname>/<imdbId[:1]>/<imdbId[:4]>/<imdbId>/<poster basename>
#     when that file does not exist yet. If data['posterUrl'] contains a double
#     quote, the URL is printed and sys.exit() is called. The enclosing `def`
#     header and the definitions of `pages` and `pathname` are not visible here.
#     NOTE(review): the poster is written with open(filename, 'w'); if getUrl()
#     returns raw JPEG bytes this presumably should be 'wb' -- confirm.
#   * cleanup(): walks the hard-coded directory
#     '/Volumes/Rolux Home/Desktop/Data/impawards.com' and, for file names
#     containing a double quote, prints the name and removes the file.
#   * Script entry point under `if __name__ == '__main__':`. In this collapsed
#     form the '#' that precedes `cleanup()` also swallows the archivePosters()
#     and getMovieData('Brick', 'Rian Johnson') calls that follow it.
(.*?) \(')) data['year'] = findRe(html, '\((.*?)\)') result = findRe(html, '')) for page in range(pages + 1, 0, -1): print "Page %d of %d" % (page, pages) if page <= pages: html = getUrlUnicode('http://impawards.com/archives/page%s.html' % page, timeout = -1) urls = parseArchivePage(html) for url in urls: html = getUrlUnicode(url) data = parseMoviePage(html) print data if '"' in data['posterUrl']: print url sys.exit() dirname = '%s/%s/%s/%s' % (pathname, data['imdbId'][:1], data['imdbId'][:4], data['imdbId']) filename = '%s/%s' % (dirname, os.path.split(data['posterUrl'])[1]) if not os.path.exists(filename): jpg = getUrl(data['posterUrl']) if not os.path.exists(dirname): os.makedirs(dirname) f = open(filename, 'w') f.write(jpg) f.close() def cleanup(): for dirname, dirs, files in os.walk('/Volumes/Rolux Home/Desktop/Data/impawards.com'): for filename in files: if '"' in filename: print filename os.remove(dirname + '/' + filename) if __name__ == '__main__': # cleanup() archivePosters() getMovieData('Brick', 'Rian Johnson')