# -*- coding: UTF-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import ox.imdb as imdb
from oxutils.cache import getUrlUnicode
from oxutils.html import stripTags
from oxutils.text import findRe, removeSpecialCharacters
def getData(criterionId):
    '''
    Scrape the criterion.com release page for criterionId into a dict.

    The returned dict has the keys 'criterionId', 'posterUrl', 'synopsis',
    'title', 'director' and 'imdbId'.  The IMDb id is looked up through
    imdb.getMovieId() from the scraped title and director.

    Raises IndexError when the page contains no
    "The Criterion Collection: TITLE by DIRECTOR" text, because the first
    regex match is indexed unconditionally.

    >>> getData(348)['imdbId']
    '0068205'
    '''
    data = {}
    html = getUrlUnicode('http://criterion.com/asp/release.asp?id=%s' % criterionId)
    data['criterionId'] = criterionId
    data['posterUrl'] = 'http://criterion.com/content/images/full_boxshot/%s_box_348x490.jpg' % criterionId
    # NOTE(review): this literal was split over three physical lines (a
    # SyntaxError); the line breaks are kept here as "\n".  The pattern ends
    # in a lazy group with nothing after it, so as written it can only
    # capture an empty string.  The markup that delimited it looks to have
    # been stripped from this copy of the file -- restore it from upstream.
    data['synopsis'] = stripTags(findRe(html, '\nSynopsis\n(.*?)'))
    # findall() yields (title, director) tuples; only the first hit is used.
    # NOTE(review): the director group is also a trailing lazy group and so
    # captures '' as written -- presumably a closing delimiter was lost too.
    matches = re.compile("The Criterion Collection: (.*?) by (.*?)").findall(html)
    firstMatch = matches[0]
    data['title'] = stripTags(firstMatch[0])
    data['director'] = stripTags(firstMatch[1])
    data['imdbId'] = imdb.getMovieId(data['title'], data['director'])
    return data
def getCriterionIds():
    '''
    Return the release ids linked from criterion.com's spine-sorted list.

    The result is the list of strings captured from every
    release.asp?id=..." link found in the page, in page order.
    '''
    html = getUrlUnicode('http://criterion.com/asp/list.asp?sort=spine')
    # Raw string: "\?" is a regex escape, not a valid Python string escape.
    return re.findall(r'release.asp\?id=(.*?)"', html)
def getMovieId(title='', director='', imdbId=''):
    '''
    Return the Criterion release id of a movie, or '' if none matches.

    The movie is identified by imdbId; when that is empty it is looked up
    through imdb.getMovieId(title, director).  Each entry scraped from
    criterion.com's spine-sorted list page is resolved to an IMDb id in
    turn, and the release id of the first entry whose IMDb id equals the
    wanted one is returned.
    '''
    if not imdbId:
        imdbId = imdb.getMovieId(title, director)
    # NOTE(review): timeout=-1 is passed through to getUrlUnicode unchanged;
    # presumably it controls cache expiry -- confirm against oxutils.cache.
    html = getUrlUnicode('http://criterion.com/asp/list.asp?sort=spine', timeout=-1)
    # NOTE(review): both the pattern and the separator on the next line are
    # empty in this copy of the file, and str.split('') raises ValueError.
    # The markup they contained looks to have been stripped -- restore both
    # literals from upstream before using this function.
    entries = findRe(html, '').split('')
    # The piece preceding the first separator is discarded.
    for entry in entries[1:]:
        # Raw string: "\?" is a regex escape, not a Python string escape.
        criterionId = findRe(entry, r'"release.asp\?id=(.*?)"')
        # NOTE(review): trailing lazy group with no terminator -- captures ''
        # as written; a closing delimiter appears to be missing.
        criterionTitle = findRe(entry, 'class="title">(.*?)')
        # NOTE(review): the next two literals were split across physical
        # lines (a SyntaxError); the line break is kept as "\n".  It probably
        # stood for a line-break tag in the original source -- confirm.
        criterionTitle = re.sub('(?<=\\w)\n(?=\\w)', ' / ', criterionTitle)
        criterionTitle = criterionTitle.replace('\n', '')
        # NOTE(review): this pattern can only match the empty string as
        # written; its delimiting markup appears to have been lost as well.
        criterionDirector = stripTags(findRe(entry, '.*?(.*?)')).strip()
        if imdb.getMovieId(criterionTitle, criterionDirector) == imdbId:
            return criterionId
    return ''
def getMovieData(title='', director='', imdbId=''):
    '''
    Return Criterion data for a movie, or an empty dict if none is found.

    The movie is identified by imdbId; when that is empty it is looked up
    through imdb.getMovieId(title, director).  If getMovieId() finds a
    Criterion release for it, the release page is fetched and the result
    holds the keys 'id', 'posterUrl' and 'synopsis'.
    '''
    data = {}
    if not imdbId:
        imdbId = imdb.getMovieId(title, director)
    criterionId = getMovieId(imdbId=imdbId)
    if criterionId:
        html = getUrlUnicode('http://criterion.com/asp/release.asp?id=%s' % criterionId)
        data['id'] = criterionId
        data['posterUrl'] = 'http://criterion.com/content/images/full_boxshot/%s_box_348x490.jpg' % criterionId
        # NOTE(review): this literal was split over two physical lines (a
        # SyntaxError); the line break is kept here as "\n".  The pattern
        # ends in a lazy group with nothing after it, so as written it can
        # only capture an empty string.  Its delimiting markup looks to have
        # been stripped from this copy of the file -- restore from upstream.
        data['synopsis'] = stripTags(findRe(html, 'Synopsis\n(.*?)'))
    return data
if __name__ == '__main__':
    # Manual smoke test: look up one known title and dump the scraped data.
    # Parenthesised single-argument print behaves the same on Python 2 and
    # also parses on Python 3.
    print(getMovieData('Le mepris', 'Jean-Luc Godard'))