diff --git a/ox/criterion.py b/ox/criterion.py index 6daf4d7..d1fcb10 100644 --- a/ox/criterion.py +++ b/ox/criterion.py @@ -22,16 +22,16 @@ def getMovieId(title = '', director = '', imdbId = ''): return '' def getMovieData(title = '', director = '', imdbId = ''): + data = {} if not imdbId: imdbId = imdb.getMovieId(title, director) id = getMovieId(imdbId = imdbId) if id: html = getUrlUnicode('http://criterion.com/asp/release.asp?id=%s' % id) - data = {} - data['synopsis'] = stripTags(findRe(html, '

Synopsis

(.*?)')) + data['id'] = id data['posterUrl'] = 'http://criterion.com/content/images/full_boxshot/%s_box_348x490.jpg' % id - return data - return {} + data['synopsis'] = stripTags(findRe(html, '

Synopsis

(.*?)')) + return data if __name__ == '__main__': print getMovieData('Le mepris', 'Jean-Luc Godard') \ No newline at end of file diff --git a/ox/impawards.py b/ox/impawards.py new file mode 100644 index 0000000..71881d5 --- /dev/null +++ b/ox/impawards.py @@ -0,0 +1,45 @@ +import re + +import ox.imdb as imdb +from oxutils.cache import getUrlUnicode +from oxutils.text import findRe + + +def getMovieData(title = '', director = '', imdbId = ''): + data = {'posterUrls': []} + if not imdbId: + imdbId = imdb.getMovieId(title, director) + print imdbId + html = getUrlUnicode('http://impawards.com/archives/latest.html', timeout = 0) + pages = int(findRe(html, '')) + for page in range(pages + 1, 0, -1): + print page + if page <= pages: + html = getUrlUnicode('http://impawards.com/archives/page%s.html' % page, timeout = -1) + urls = parseArchivePage(html) + print urls + for url in urls: + html = getUrlUnicode(url) + d = parseMoviePage(html) + print d + if d['imdbId'] == imdbId: + data['posterUrls'].append(d['posterUrl']) + print d['posterUrl'] + data['posterUrls'].sort() + return data + +def parseArchivePage(html): + urls = [] + results = re.compile('', re.DOTALL).findall(html) + for result in results: + urls.append('http://impawards.com/%s' % result) + return urls + +def parseMoviePage(html): + data = {} + data['imdbId'] = findRe(html, 'imdb.com/title/tt(.*?) ') + data['posterUrl'] = 'http://impawards.com/%s' % findRe(html, '