new criterion module

This commit is contained in:
Rolux 2009-07-04 12:25:24 +02:00
parent 10e9f54153
commit e01af3fdd7
2 changed files with 98 additions and 51 deletions

View file

@ -4,62 +4,102 @@ import re
from oxlib.cache import getUrlUnicode from oxlib.cache import getUrlUnicode
from oxlib.html import stripTags from oxlib.html import stripTags
from oxlib.net import getUrl
from oxlib.text import findRe, removeSpecialCharacters from oxlib.text import findRe, removeSpecialCharacters
import imdb import imdb
def getIds():
    '''
    Return the ids of all films listed in the Criterion DVD library.

    Fetches http://www.criterion.com/library/dvd, then every "?page=N"
    page of it, collecting whatever follows "films/" in the markup. Each
    "boxsets/" link found on a page is fetched as well and its "films/"
    ids are collected too.

    Returns a list of id strings, de-duplicated and sorted numerically.
    Raises ValueError if a captured page number or film id is not an
    integer.
    '''
    # Compile once instead of once per fetched page.
    filmRe = re.compile("films/(.*?)\"")
    boxsetRe = re.compile("boxsets/(.*?)\"")
    libraryUrl = "http://www.criterion.com/library/dvd"

    pageLinks = re.compile("page=(.*?)\"").findall(getUrlUnicode(libraryUrl))
    # The second-to-last "page=" link is used as the number of the last page
    # (presumably the final link is a "next" link -- confirm against the
    # site markup). With fewer than two links, fall back to a single page
    # instead of raising IndexError.
    if len(pageLinks) >= 2:
        pages = int(pageLinks[-2])
    else:
        pages = 1

    ids = set()
    for page in range(1, pages + 1):
        pageHtml = getUrlUnicode(libraryUrl + "?page=" + str(page))
        ids.update(filmRe.findall(pageHtml))
        for boxset in boxsetRe.findall(pageHtml):
            # Separate names here: the page markup and its list of box sets
            # are no longer overwritten while they are being iterated.
            boxsetHtml = getUrlUnicode("http://www.criterion.com/boxsets/" + boxset)
            ids.update(filmRe.findall(boxsetHtml))
    # Sort by numeric value, but hand the ids back as strings.
    return [str(number) for number in sorted(set(int(filmId) for filmId in ids))]
def getData(id):
    '''
    Scrape the Criterion film page for ``id`` and return its data as a dict.

    The returned dict has the keys 'id', 'number', 'title', 'director',
    'country', 'year', 'synopsis', 'posterUrl', 'stillUrl', 'trailerUrl'
    and 'imdbId'. ``id`` is interpolated into the URL with %s, so both
    strings and integers are accepted; data['id'] holds it as passed in.

    >>> getData('1333')['imdbId']
    '0060304'

    >>> getData('236')['posterUrl']
    'http://criterion_production.s3.amazonaws.com/release_images/1586/ThirdManReplace.jpg'

    >>> getData('786')['posterUrl']
    'http://criterion_production.s3.amazonaws.com/product_images/185/343_box_348x490.jpg'
    '''
    data = {}
    data['id'] = id
    url = "http://www.criterion.com/films/%s" % id
    try:
        html = getUrlUnicode(url)
    except Exception:
        # Best-effort fallback to the non-unicode fetch (presumably for pages
        # that fail to decode -- confirm). Catching Exception rather than
        # everything keeps KeyboardInterrupt/SystemExit working.
        html = getUrl(url)
    data["number"] = findRe(html, "<p class=\"spinenumber\">(.*?)</p>")
    data["title"] = findRe(html, "<h2 class=\"movietitle\">(.*?)</h2>")
    data["director"] = findRe(html, "<h2 class=\"director\">(.*?)</h2>")
    # The first two bold paragraphs are read as country and year; a page
    # with fewer of them yields '' instead of raising IndexError.
    facts = re.compile("<p><strong>(.*?)</strong></p>").findall(html)
    data["country"] = facts[0] if len(facts) > 0 else ''
    data["year"] = facts[1] if len(facts) > 1 else ''
    synopsisBox = findRe(html, "<div class=\"synopsis contentbox lightgray\">(.*?)</div>")
    data["synopsis"] = findRe(synopsisBox, "<p>(.*?)</p>")
    edition = findRe(html, "<div class=\"editioninfo\">(.*?)</div>")
    if 'Blu-Ray' in edition or 'Essential Art House DVD' in edition:
        # When the first edition box is one of these, use the second box
        # instead -- if the page actually has one.
        editions = re.compile("<div class=\"editioninfo\">(.*?)</div>", re.DOTALL).findall(html)
        if len(editions) > 1:
            edition = editions[1]
    link = findRe(edition, "<a href=\"(.*?)\">")
    if "/boxsets/" not in link:
        data["posterUrl"] = link
    else:
        # The edition links to a box set page: find this film's entry on it
        # and take its image, dropping the "_w100" marker from the URL.
        boxsetHtml = getUrlUnicode(link)
        entry = findRe(boxsetHtml, "<a href=\"http://www.criterion.com/films/%s\">(.*?)</a>" % id)
        data["posterUrl"] = findRe(entry, "src=\"(.*?)\"").replace("_w100", "")
    still = findRe(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
    if still:
        data["stillUrl"] = still
        data["trailerUrl"] = ""
    else:
        # No still image in the markup: read still and trailer from the
        # "thumbnailURL" / "videoURL" values instead.
        data["stillUrl"] = findRe(html, "\"thumbnailURL\", \"(.*?)\"")
        data["trailerUrl"] = findRe(html, "\"videoURL\", \"(.*?)\"")
    data['imdbId'] = imdb.getMovieId(data['title'], data['director'], data['year'])
    return data
def getPosterUrl(id):
    '''
    Return the poster URL of the Criterion film ``id``.

    This is the 'posterUrl' entry of getData(id), so it costs a full
    scrape of the film page.
    '''
    return getData(id)['posterUrl']
def getMovieId(title = '', director = '', year = '', imdbId = ''):
    '''
    Return the Criterion id of a movie, or '' if none is found.

    The movie is identified by ``imdbId``; when that is empty it is looked
    up with imdb.getMovieId(title, director, year). Every id returned by
    getIds() is then scraped with getData() until one has the same IMDb id,
    so a call can trigger a large number of requests.
    '''
    if not imdbId:
        imdbId = imdb.getMovieId(title, director, year)
    for id in getIds():
        # getData() already resolves each film's IMDb id from its title,
        # director and year. Compare that value with the wanted id: the
        # comparison must apply to the lookup result, not to one of the
        # lookup's arguments.
        if getData(id)['imdbId'] == imdbId:
            return id
    return ''
def getMovieData(title = '', director = '', year = '', imdbId = ''):
    '''
    Return Criterion data for a movie given by title/director/year or IMDb id.

    The result is a dict with the keys 'id', 'posterUrl' and 'synopsis',
    or an empty dict when getMovieId() finds no matching Criterion id.

    >>> getMovieData('Pierrot le fou', 'Jean-Luc Godard', '1965')['id']
    '149'
    '''
    data = {}
    if not imdbId:
        imdbId = imdb.getMovieId(title, director, year)
    id = getMovieId(imdbId = imdbId)
    if id:
        # Only a subset of what getData() scrapes is passed on.
        scraped = getData(id)
        for key in ('id', 'posterUrl', 'synopsis'):
            data[key] = scraped[key]
    return data

View file

@ -1,5 +1,6 @@
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8 # encoding: utf-8
import os
import re import re
from oxlib.cache import getUrlUnicode from oxlib.cache import getUrlUnicode
@ -42,21 +43,15 @@ def parseArchivePage(html):
def parseMoviePage(html):
    '''
    Extract movie data from the markup of an impawards.com movie page.

    Returns a dict with the keys 'imdbId', 'title', 'year' and 'posterUrl'.
    When the page links to an "_xlg.html" page, that page is fetched and
    its first <img SRC="..."> is used as the poster; otherwise the
    "posters..." image on the movie page itself is used.
    '''
    data = {}
    data['imdbId'] = findRe(html, 'imdb.com/title/tt(.*?) ')
    # Raw strings keep the regex backslashes literal; the patterns are
    # unchanged.
    data['title'] = stripTags(findRe(html, r'<p class="name white">(.*?) \(<a href="alpha1.html">'))
    data['year'] = findRe(html, r'\(<a href="alpha1.html">(.*?)</a>\)')
    xlgPage = findRe(html, r'<a href = (\w*?_xlg.html)')
    if xlgPage:
        xlgUrl = 'http://impawards.com/%s/%s' % (data['year'], xlgPage)
        # Fetched into its own name so the ``html`` argument stays intact.
        xlgHtml = getUrlUnicode(xlgUrl, timeout = -1)
        poster = findRe(xlgHtml, '<img SRC="(.*?)"')
    else:
        poster = findRe(html, '<img src="(posters.*?)" alt=')
    data['posterUrl'] = 'http://impawards.com/%s/%s' % (data['year'], poster)
    return data
def archivePosters(): def archivePosters():
@ -64,16 +59,20 @@ def archivePosters():
from oxlib.net import getUrl from oxlib.net import getUrl
pathname = '/Volumes/Rolux Home/Desktop/Data/impawards.com' pathname = '/Volumes/Rolux Home/Desktop/Data/impawards.com'
html = getUrlUnicode('http://impawards.com/archives/latest.html', timeout = 0) html = getUrlUnicode('http://impawards.com/archives/latest.html', timeout = 0)
pages = int(findRe(html, '<a href = page(.*?).html>')) pages = int(findRe(html, '<a href= page(.*?).html>'))
for page in range(pages + 1, 0, -1): for page in range(pages + 1, 0, -1):
print "Page %d of %d" % (page, pages)
if page <= pages: if page <= pages:
html = getUrlUnicode('http://impawards.com/archives/page%s.html' % page, timeout = -1) html = getUrlUnicode('http://impawards.com/archives/page%s.html' % page, timeout = -1)
urls = parseArchivePage(html) urls = parseArchivePage(html)
print urls
for url in urls: for url in urls:
html = getUrlUnicode(url) html = getUrlUnicode(url)
data = parseMoviePage(html) data = parseMoviePage(html)
dirname = '%s/%s/%s' % (pathname, data['imdbId'][:4], data['imdbId']) print data
if '"' in data['posterUrl']:
print url
sys.exit()
dirname = '%s/%s/%s/%s' % (pathname, data['imdbId'][:1], data['imdbId'][:4], data['imdbId'])
filename = '%s/%s' % (dirname, os.path.split(data['posterUrl'])[1]) filename = '%s/%s' % (dirname, os.path.split(data['posterUrl'])[1])
if not os.path.exists(filename): if not os.path.exists(filename):
jpg = getUrl(data['posterUrl']) jpg = getUrl(data['posterUrl'])
@ -83,7 +82,15 @@ def archivePosters():
f.write(jpg) f.write(jpg)
f.close() f.close()
def cleanup(pathname = '/Volumes/Rolux Home/Desktop/Data/impawards.com'):
    '''
    Delete every file below ``pathname`` whose name contains a double quote.

    The name of each deleted file is printed. ``pathname`` defaults to the
    poster archive directory, so cleanup() without arguments behaves as
    before.
    '''
    for dirname, dirs, files in os.walk(pathname):
        for filename in files:
            if '"' in filename:
                # Single-argument print(...) is valid on Python 2 and 3.
                print(filename)
                os.remove(os.path.join(dirname, filename))
if __name__ == '__main__': if __name__ == '__main__':
# cleanup()
archivePosters() archivePosters()
getMovieData('Brick', 'Rian Johnson') getMovieData('Brick', 'Rian Johnson')