python-oxweb/ox/criterion.py

37 lines
1.5 KiB
Python
Raw Normal View History

import re
import ox.imdb as imdb
2008-05-09 10:21:59 +00:00
from oxutils.cache import getUrlUnicode
from oxutils.html import stripTags
from oxutils.text import findRe
2008-05-09 10:39:20 +00:00
def getMovieId(title='', director='', imdbId=''):
    """Look up the Criterion release id of a movie.

    The movie is identified by its IMDb id. When imdbId is not given, it is
    resolved from title and director via imdb.getMovieId. Each row of
    Criterion's spine-sorted release list is then resolved to an IMDb id
    the same way (one imdb.getMovieId call per row) and compared.

    Returns the id captured from the row's "release.asp?id=..." link, or ''
    when no IMDb id could be determined or no row matches.
    """
    if not imdbId:
        imdbId = imdb.getMovieId(title, director)
    if not imdbId:
        # Nothing to compare against. Without this guard the empty id would
        # compare equal to the result of any list row whose own IMDb lookup
        # fails the same way, returning an unrelated release.
        return ''

    # NOTE(review): timeout = -1 presumably controls oxutils' cache expiry
    # for this page -- confirm against oxutils.cache.
    html = getUrlUnicode('http://criterion.com/asp/list.asp?sort=spine', timeout = -1)
    table = findRe(html, '<table cellspacing="0" id="browse-all-table">(.*?)</table>')
    # Whatever precedes the first <tr> is not a row, so it is dropped.
    rows = table.split('<tr>')[1:]

    for row in rows:
        criterionId = findRe(row, r'"release.asp\?id=(.*?)"')
        if not criterionId:
            # A row without a release link (e.g. a header row) cannot yield
            # an id, so skip it instead of spending an IMDb lookup on it.
            continue

        criterionTitle = findRe(row, 'class="title">(.*?)</a>')
        # A <br> between two word characters separates title parts and is
        # rendered as ' / '; any remaining <br> is simply removed.
        criterionTitle = re.sub(r'(?<=\w)<br>(?=\w)', ' / ', criterionTitle)
        criterionTitle = criterionTitle.replace('<br>', '')

        # The director is taken from the table cell after the title cell.
        criterionDirector = stripTags(findRe(row, '</a>.*?</td>(.*?)</td>')).strip()

        if imdb.getMovieId(criterionTitle, criterionDirector) == imdbId:
            return criterionId
    return ''
2008-05-09 10:39:20 +00:00
def getMovieData(title='', director='', imdbId=''):
    """Collect Criterion data for a movie.

    The movie is identified by imdbId, or by title and director when
    imdbId is not given (resolved through imdb.getMovieId).

    Returns a dict with the keys 'id' (Criterion release id), 'posterUrl'
    and 'synopsis'. The dict is empty when no IMDb id could be determined
    or getMovieId finds no matching Criterion release.
    """
    data = {}
    if not imdbId:
        imdbId = imdb.getMovieId(title, director)
    if not imdbId:
        # Passing an empty id on would make getMovieId redo the lookup with
        # an empty title and director, which cannot identify this movie.
        return data

    criterionId = getMovieId(imdbId=imdbId)
    if criterionId:
        html = getUrlUnicode('http://criterion.com/asp/release.asp?id=%s' % criterionId)
        data['id'] = criterionId
        # The poster URL is built from the release id; it is not fetched or
        # checked for existence here.
        data['posterUrl'] = 'http://criterion.com/content/images/full_boxshot/%s_box_348x490.jpg' % criterionId
        data['synopsis'] = stripTags(findRe(html, '<h3>Synopsis</h3>(.*?)</div>'))
    return data
2008-05-09 10:39:20 +00:00
if __name__ == '__main__':
    # Manual smoke test: performs live lookups against IMDb and Criterion.
    # The parenthesised single-argument print behaves the same under the
    # Python 2 print statement and the Python 3 print function.
    print(getMovieData('Le mepris', 'Jean-Luc Godard'))