# -*- coding: UTF-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re

import ox.imdb as imdb
from oxutils.cache import getUrlUnicode
from oxutils.html import stripTags
from oxutils.text import findRe, removeSpecialCharacters

# NOTE(review): several pattern literals in this module contain "\n" where the
# text under review had raw line breaks inside the quotes. They are spelled as
# escapes here so the module parses while matching the very same characters.
# They look like HTML markup that was stripped at some point -- confirm the
# intended patterns against the upstream file before relying on them.


def getData(criterionId):
    '''
    Fetch the release page for criterionId and return a dict describing it.

    The returned dict has the keys 'criterionId', 'posterUrl', 'synopsis',
    'title', 'director' and 'imdbId'; the imdbId is looked up through
    imdb.getMovieId from the scraped title and director.

    Raises IndexError when the title/director pattern finds nothing in the
    page, because the first match is indexed unconditionally.

    >>> getData(348)['imdbId']
    '0068205'
    '''
    data = {}
    html = getUrlUnicode('http://criterion.com/asp/release.asp?id=%s' % criterionId)
    data['criterionId'] = criterionId
    data['posterUrl'] = 'http://criterion.com/content/images/full_boxshot/%s_box_348x490.jpg' % criterionId
    # NOTE(review): pattern kept exactly as found; see the module-level note.
    data['synopsis'] = stripTags(findRe(html, '\n\nSynopsis\n\n(.*?)'))
    matches = re.compile("The Criterion Collection: (.*?) by (.*?)").findall(html)
    # Each match is a (title, director) tuple; only the first one is used.
    data['title'] = stripTags(matches[0][0])
    data['director'] = stripTags(matches[0][1])
    data['imdbId'] = imdb.getMovieId(data['title'], data['director'])
    return data


def getCriterionIds():
    '''
    Return every release id linked from the spine-sorted list page.

    The ids are returned as the strings captured from the
    'release.asp?id=...' links, in the order re.findall yields them.
    '''
    html = getUrlUnicode('http://criterion.com/asp/list.asp?sort=spine')
    # Raw string: "\?" is a regex escape, not a string escape.
    return re.compile(r'release.asp\?id=(.*?)"').findall(html)


def getMovieId(title = '', director = '', imdbId = ''):
    '''
    Return the Criterion release id of a movie, or '' when none matches.

    The movie is identified by imdbId; when imdbId is empty it is first
    resolved with imdb.getMovieId(title, director). Each entry of the
    spine-sorted list page is then resolved through imdb.getMovieId as well,
    and the release id of the first entry that yields the same imdbId is
    returned.
    '''
    if not imdbId:
        imdbId = imdb.getMovieId(title, director)
    # NOTE(review): timeout = -1 is passed through untouched; its meaning is
    # defined by getUrlUnicode and is not visible from this module.
    html = getUrlUnicode('http://criterion.com/asp/list.asp?sort=spine', timeout = -1)
    # NOTE(review): the separator below is empty in the text under review, and
    # an empty separator makes str.split raise ValueError. The original
    # delimiter appears to have been lost together with the surrounding
    # markup -- restore it before using this function.
    entries = findRe(html, '(.*?)\n').split('')
    # The piece in front of the first separator is not an entry; skip it.
    for entry in entries[1:]:
        releaseId = findRe(entry, r'"release.asp\?id=(.*?)"')
        criterionTitle = findRe(entry, 'class="title">(.*?)')
        # A line break between two word characters becomes ' / ' ...
        criterionTitle = re.sub('(?<=\\w)\n(?=\\w)', ' / ', criterionTitle)
        # ... and every remaining line break is dropped.
        criterionTitle = criterionTitle.replace('\n', '')
        criterionDirector = stripTags(findRe(entry, '.*?(.*?)')).strip()
        if imdb.getMovieId(criterionTitle, criterionDirector) == imdbId:
            return releaseId
    return ''


def getMovieData(title = '', director = '', imdbId = ''):
    '''
    Return a dict with Criterion data for a movie.

    The movie is identified by imdbId, or by title and director when imdbId
    is empty. When getMovieId finds a release, the dict has the keys 'id',
    'posterUrl' and 'synopsis'; otherwise it is empty.
    '''
    data = {}
    if not imdbId:
        imdbId = imdb.getMovieId(title, director)
    releaseId = getMovieId(imdbId = imdbId)
    if releaseId:
        html = getUrlUnicode('http://criterion.com/asp/release.asp?id=%s' % releaseId)
        data['id'] = releaseId
        data['posterUrl'] = 'http://criterion.com/content/images/full_boxshot/%s_box_348x490.jpg' % releaseId
        # NOTE(review): pattern kept exactly as found; see the module-level note.
        data['synopsis'] = stripTags(findRe(html, '\n\nSynopsis\n\n(.*?)'))
    # NOTE(review): returned at function level so callers always get a dict;
    # the collapsed text under review does not show the original indentation.
    return data


if __name__ == '__main__':
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print(getMovieData('Le mepris', 'Jean-Luc Godard'))