2008-05-10 07:38:14 +00:00
|
|
|
# -*- coding: UTF-8 -*-
|
|
|
|
# vi:si:et:sw=4:sts=4:ts=4
|
2008-05-08 08:25:44 +00:00
|
|
|
import re
|
|
|
|
|
|
|
|
import ox.imdb as imdb
|
2008-05-09 10:21:59 +00:00
|
|
|
from oxutils.cache import getUrlUnicode
|
2008-05-08 08:25:44 +00:00
|
|
|
from oxutils.html import stripTags
|
2008-05-10 07:38:14 +00:00
|
|
|
from oxutils.text import findRe, removeSpecialCharacters
|
|
|
|
|
|
|
|
def getData(criterionId):
    '''
    Scrape the criterion.com release page of one release.

    Downloads http://criterion.com/asp/release.asp?id=<criterionId> and
    returns a dict with the keys:

      criterionId -- the id that was passed in
      posterUrl   -- box-shot image URL built from the id (not fetched here)
      synopsis    -- stripTags() of what findRe extracts between the
                     "<h3>Synopsis</h3>" heading and the next "</div>"
      title       -- first capture of the page's <title> element
      director    -- second capture of the page's <title> element
      imdbId      -- imdb.getMovieId(title, director)

    If the <title> element does not have the form
    "The Criterion Collection: <title> by <director>", then title, director
    and imdbId are set to '' instead of raising IndexError, and no IMDb
    lookup is made.

    >>> getData(348)['imdbId']
    '0068205'
    '''
    data = {}
    html = getUrlUnicode('http://criterion.com/asp/release.asp?id=%s' % criterionId)
    data['criterionId'] = criterionId
    data['posterUrl'] = 'http://criterion.com/content/images/full_boxshot/%s_box_348x490.jpg' % criterionId
    data['synopsis'] = stripTags(findRe(html, '<h3>Synopsis</h3>(.*?)</div>'))
    # Title and director are only available combined in the <title> element.
    match = re.search(r'<title>The Criterion Collection: (.*?) by (.*?)</title>', html)
    if match:
        data['title'] = stripTags(match.group(1))
        data['director'] = stripTags(match.group(2))
        data['imdbId'] = imdb.getMovieId(data['title'], data['director'])
    else:
        # Unexpected page layout (or unknown id): keep the dict shape stable
        # rather than failing on a missing match.
        data['title'] = ''
        data['director'] = ''
        data['imdbId'] = ''
    return data
|
|
|
|
|
|
|
|
def getCriterionIds():
    '''
    Return the release ids linked from criterion.com's spine-sorted list page.

    Every occurrence of release.asp?id=<id>" in the downloaded page yields
    one entry, in page order. Ids are returned as strings, and a release
    that is linked more than once appears more than once.
    '''
    html = getUrlUnicode('http://criterion.com/asp/list.asp?sort=spine')
    # Raw string: '\?' in a plain literal is an invalid string escape.
    return re.findall(r'release.asp\?id=(.*?)"', html)
|
2008-05-08 08:25:44 +00:00
|
|
|
|
2008-05-09 10:39:20 +00:00
|
|
|
def getMovieId(title = '', director = '', imdbId = ''):
    '''
    Return the criterion.com release id of a movie, or '' if none is found.

    The movie is identified by imdbId; when imdbId is empty it is looked up
    from title and director with imdb.getMovieId. Each row of the
    "browse-all-table" on the spine-sorted list page is then resolved to an
    IMDb id with imdb.getMovieId, and the release id of the first row whose
    IMDb id equals the wanted one is returned.

    Returns '' without fetching the list when no IMDb id can be determined.
    '''
    if not imdbId:
        imdbId = imdb.getMovieId(title, director)
    if not imdbId:
        # Nothing to compare against. Without this guard an empty id could
        # "match" any row whose own lookup also comes back empty.
        return ''
    # NOTE(review): timeout = -1 presumably means "never expire the cached
    # copy" -- confirm against oxutils.cache.
    html = getUrlUnicode('http://criterion.com/asp/list.asp?sort=spine', timeout = -1)
    table = findRe(html, '<table cellspacing="0" id="browse-all-table">(.*?)</table>')
    # Whatever precedes the first <tr> is not a row.
    rows = table.split('<tr>')[1:]
    for row in rows:
        criterionId = findRe(row, r'"release.asp\?id=(.*?)"')
        if not criterionId:
            # No release link in this row (presumably findRe yields an empty
            # value then), so there is no id to return and no reason to pay
            # for an IMDb lookup.
            continue
        criterionTitle = findRe(row, 'class="title">(.*?)</a>')
        # A <br> between two word characters separates alternative titles;
        # any other <br> is just layout and is dropped.
        criterionTitle = re.sub(r'(?<=\w)<br>(?=\w)', ' / ', criterionTitle)
        criterionTitle = criterionTitle.replace('<br>', '')
        criterionDirector = stripTags(findRe(row, '</a>.*?</td>(.*?)</td>')).strip()
        if imdb.getMovieId(criterionTitle, criterionDirector) == imdbId:
            return criterionId
    return ''
|
|
|
|
|
2008-05-09 10:39:20 +00:00
|
|
|
def getMovieData(title = '', director = '', imdbId = ''):
    '''
    Return criterion.com data for a movie as a dict.

    The movie is given either by imdbId or by title and director, in which
    case the IMDb id is obtained from imdb.getMovieId. When getMovieId finds
    a release for that IMDb id, the result has the keys 'id', 'posterUrl'
    and 'synopsis'; otherwise an empty dict is returned.
    '''
    if not imdbId:
        imdbId = imdb.getMovieId(title, director)
    releaseId = getMovieId(imdbId = imdbId)
    if not releaseId:
        # No matching release: nothing to download.
        return {}
    page = getUrlUnicode('http://criterion.com/asp/release.asp?id=%s' % releaseId)
    return {
        'id': releaseId,
        'posterUrl': 'http://criterion.com/content/images/full_boxshot/%s_box_348x490.jpg' % releaseId,
        'synopsis': stripTags(findRe(page, '<h3>Synopsis</h3>(.*?)</div>')),
    }
|
2008-05-09 10:39:20 +00:00
|
|
|
|
2008-05-08 08:25:44 +00:00
|
|
|
if __name__ == '__main__':
    # Manual smoke test: look up one known title and print the scraped data.
    # The parenthesised single-argument form prints the same on Python 2
    # and is also valid on Python 3.
    print(getMovieData('Le mepris', 'Jean-Luc Godard'))
|