new criterion module

This commit is contained in:
Rolux 2009-07-04 12:25:24 +02:00
parent 10e9f54153
commit e01af3fdd7
2 changed files with 98 additions and 51 deletions

View file

@ -4,62 +4,102 @@ import re
from oxlib.cache import getUrlUnicode
from oxlib.html import stripTags
from oxlib.net import getUrl
from oxlib.text import findRe, removeSpecialCharacters
import imdb
def getData(criterionId):
def getIds():
    '''
    Collect the ids of all films reachable from the Criterion DVD library.

    Walks every page of http://www.criterion.com/library/dvd, gathers the
    ids from the "films/<id>" links on each page, and follows every
    "boxsets/<id>" link to gather the film ids listed on that box set page.

    Returns a list of id strings, de-duplicated and sorted numerically.
    Raises ValueError if a captured id or page number is not an integer.
    '''
    libraryUrl = "http://www.criterion.com/library/dvd"
    # Compile once instead of on every loop iteration.
    filmRe = re.compile("films/(.*?)\"")
    boxsetRe = re.compile("boxsets/(.*?)\"")

    html = getUrlUnicode(libraryUrl)
    pageLinks = re.compile("page=(.*?)\"").findall(html)
    if pageLinks:
        # The second-to-last "page=" link is used as the page count (with a
        # single link, that link itself, as the original index arithmetic
        # did).
        pages = int(pageLinks[max(len(pageLinks) - 2, 0)])
    else:
        # No pagination links at all: assume a single page rather than
        # failing with IndexError.
        pages = 1

    ids = set()
    for page in range(1, pages + 1):
        pageHtml = getUrlUnicode(libraryUrl + "?page=" + str(page))
        ids.update(filmRe.findall(pageHtml))
        for boxset in boxsetRe.findall(pageHtml):
            boxsetHtml = getUrlUnicode("http://www.criterion.com/boxsets/" + boxset)
            ids.update(filmRe.findall(boxsetHtml))
    # Sort numerically, then hand the ids back as strings.
    return [str(number) for number in sorted(int(filmId) for filmId in ids)]
# Scrape the Criterion page of one film into a dict and return it.
# NOTE(review): this span appears to be a rendered diff: indentation is stripped and two
# revisions of the body are interleaved. The statements that use `criterionId` cannot
# belong to this signature (it only declares `id`) - presumably they are the earlier
# revision; confirm against the repository before editing.
def getData(id):
'''
>>> getData(348)['imdbId']
'0068205'
>>> getData('1333')['imdbId']
'0060304'
>>> getData('236')['posterUrl']
'http://criterion_production.s3.amazonaws.com/release_images/1586/ThirdManReplace.jpg'
>>> getData('786')['posterUrl']
'http://criterion_production.s3.amazonaws.com/product_images/185/343_box_348x490.jpg'
'''
data = {}
# release.asp variant: fetch the release page for criterionId, take the poster URL from
# getPosterUrl, and read synopsis, title and director out of the page (title/director
# come from the <title> element).
html = getUrlUnicode('http://criterion.com/asp/release.asp?id=%s' % criterionId)
data['criterionId'] = criterionId
data['posterUrl'] = getPosterUrl(criterionId)
data['synopsis'] = stripTags(findRe(html, '<h3>Synopsis</h3>(.*?)</div>'))
result = re.compile("<title>The Criterion Collection: (.*?) by (.*?)</title>").findall(html)
data['title'] = stripTags(result[0][0])
data['director'] = stripTags(result[0][1])
data['imdbId'] = imdb.getMovieId(data['title'], data['director'])
# films/<id> variant: everything below scrapes http://www.criterion.com/films/<id>.
data['id'] = id
# NOTE(review): the URL is built with `"..." + id`, which needs a str. The first doctest
# passes the int 348, so the concatenation raises TypeError inside the try, the bare
# except catches it, and the same concatenation raises again in the fallback.
# NOTE(review): the bare except also hides every other failure of getUrlUnicode before
# retrying with getUrl.
try:
html = getUrlUnicode("http://www.criterion.com/films/" + id)
except:
html = getUrl("http://www.criterion.com/films/" + id)
data["number"] = findRe(html, "<p class=\"spinenumber\">(.*?)</p>")
data["title"] = findRe(html, "<h2 class=\"movietitle\">(.*?)</h2>")
data["director"] = findRe(html, "<h2 class=\"director\">(.*?)</h2>")
# The first two <p><strong>...</strong></p> matches are taken as country and year;
# fewer than two matches raises IndexError.
results = re.compile("<p><strong>(.*?)</strong></p>").findall(html)
data["country"] = results[0]
data["year"] = results[1]
# Synopsis: first <p> inside the synopsis content box.
result = findRe(html, "<div class=\"synopsis contentbox lightgray\">(.*?)</div>")
data["synopsis"] = findRe(result, "<p>(.*?)</p>")
# Poster: take the first link inside the "editioninfo" div. If that div mentions
# Blu-Ray or Essential Art House DVD, the second editioninfo div is used instead
# (index [1]; raises IndexError when there is only one).
result = findRe(html, "<div class=\"editioninfo\">(.*?)</div>")
if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
result = re.compile("<div class=\"editioninfo\">(.*?)</div>", re.DOTALL).findall(html)[1]
result = findRe(result, "<a href=\"(.*?)\">")
# A link that does not contain "/boxsets/" is stored as the poster URL as-is.
# Otherwise the linked box set page is fetched, the anchor pointing back to this film
# is located, and the src inside it is used with "_w100" removed.
if not "/boxsets/" in result:
data["posterUrl"] = result
else:
html_ = getUrlUnicode(result)
result = findRe(html_, "<a href=\"http://www.criterion.com/films/%s\">(.*?)</a>" % id)
result = findRe(result, "src=\"(.*?)\"")
data["posterUrl"] = result.replace("_w100", "")
# Still/trailer: a "Film Still" image means there is no trailer; otherwise the
# "thumbnailURL" and "videoURL" values found in the page are used.
result = findRe(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
if result:
data["stillUrl"] = result
data["trailerUrl"] = ""
else:
data["stillUrl"] = findRe(html, "\"thumbnailURL\", \"(.*?)\"")
data["trailerUrl"] = findRe(html, "\"videoURL\", \"(.*?)\"")
# Resolve the IMDb id from the scraped title, director and year.
data['imdbId'] = imdb.getMovieId(data['title'], data['director'], data['year'])
return data
def getCriterionIds():
    '''
    Return the release ids linked from the Criterion spine list page.

    Fetches http://criterion.com/asp/list.asp?sort=spine and returns a list
    with the id of every 'release.asp?id=<id>"' link, in the order the links
    occur in the page (duplicates are kept).
    '''
    html = getUrlUnicode('http://criterion.com/asp/list.asp?sort=spine')
    # Raw string: "\?" is not a valid string escape, so the non-raw literal
    # only worked by accident. The dot is escaped so it matches a literal "."
    # rather than any character.
    return re.findall(r'release\.asp\?id=(.*?)"', html)
def getPosterUrl(id):
    '''
    Return the 'posterUrl' entry of the dict that getData builds for id.
    '''
    return getData(id)['posterUrl']
def getPosterUrl(criterionId):
    '''
    Return the full-size box shot URL for criterionId.

    No request is made; the id is only substituted into the URL template.
    '''
    # Wrap the id in a one-element tuple so that a tuple argument is formatted
    # as one value instead of being unpacked by the % operator.
    return 'http://criterion.com/content/images/full_boxshot/%s_box_348x490.jpg' % (criterionId,)
# Find the Criterion id of a film, given its IMDb id or its title/director(/year).
# Returns the matching id, or '' when nothing matches.
# NOTE(review): this span appears to be a rendered diff: indentation is stripped and two
# revisions are interleaved (two signatures; `year` is only declared by the second) -
# confirm against the repository before editing.
def getMovieId(title = '', director = '', imdbId = ''):
def getMovieId(title = '', director = '', year = '', imdbId = ''):
# Without an explicit imdbId, resolve one from the other arguments first.
if not imdbId:
imdbId = imdb.getMovieId(title, director)
# list.asp variant: fetch the spine list (timeout = 86400 - presumably a cache lifetime
# in seconds, TODO confirm in oxlib.cache), cut the browse table into rows at '<tr>'
# and drop the chunk that precedes the first row.
html = getUrlUnicode('http://criterion.com/asp/list.asp?sort=spine', timeout = 86400)
strings = findRe(html, '<table cellspacing="0" id="browse-all-table">(.*?)</table>').split('<tr>')
strings.pop(0)
for string in strings:
id = findRe(string, '"release.asp\?id=(.*?)"')
criterionTitle = findRe(string, 'class="title">(.*?)</a>')
# A <br> between two word characters becomes ' / '; any remaining <br> is removed.
criterionTitle = re.sub('(?<=\\w)<br>(?=\\w)', ' / ', criterionTitle)
criterionTitle = criterionTitle.replace('<br>', '')
criterionDirector = stripTags(findRe(string, '</a>.*?</td>(.*?)</td>')).strip()
# A row matches when its title/director resolve to the wanted IMDb id.
if imdb.getMovieId(criterionTitle, criterionDirector) == imdbId:
imdbId = imdb.getMovieId(title, director, year)
# getIds variant: call getData for every id from getIds() and compare IMDb ids.
ids = getIds()
for id in ids:
data = getData(id)
# NOTE(review): the closing parenthesis is misplaced. As written, the boolean
# `data['year'] == imdbId` is passed as the third argument and the condition is just the
# truthiness of the lookup result, so the first id with any IMDb match is returned.
# Intended form is presumably
# `imdb.getMovieId(data['title'], data['director'], data['year']) == imdbId`.
if imdb.getMovieId(data['title'], data['director'], data['year'] == imdbId):
return id
return ''
# Look a film up on Criterion and return a dict with 'id', 'posterUrl' and 'synopsis'.
# The dict stays empty when getMovieId finds no id.
# NOTE(review): this span appears to be a rendered diff: indentation is stripped and two
# revisions are interleaved (two signatures, two imdbId lookups, two ways of filling
# `data`) - confirm against the repository before editing.
def getMovieData(title = '', director = '', imdbId = ''):
def getMovieData(title = '', director = '', year = '', imdbId = ''):
'''
>>> getMovieData('Le mepris', 'Jean-Luc Godard')['id']
'171'
>>> getMovieData('Pierrot le fou', 'Jean-Luc Godard', '1965')['id']
'149'
'''
data = {}
# Without an explicit imdbId, resolve one from title/director (and year in the second
# lookup line).
if not imdbId:
imdbId = imdb.getMovieId(title, director)
imdbId = imdb.getMovieId(title, director, year)
# Map the IMDb id to a Criterion id; a falsy result skips the block below.
id = getMovieId(imdbId = imdbId)
if id:
# release.asp variant: fetch the release page and read the synopsis from it; the
# poster URL comes from getPosterUrl.
html = getUrlUnicode('http://criterion.com/asp/release.asp?id=%s' % id)
data['id'] = id
data['posterUrl'] = getPosterUrl(id)
data['synopsis'] = stripTags(findRe(html, '<h3>Synopsis</h3>(.*?)</div>'))
# getData variant: copy the three fields out of the dict built by getData.
data_ = getData(id)
data['id'] = data_['id']
data['posterUrl'] = data_['posterUrl']
data['synopsis'] = data_['synopsis']
return data

View file

@ -1,5 +1,6 @@
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
import os
import re
from oxlib.cache import getUrlUnicode
@ -42,21 +43,15 @@ def parseArchivePage(html):
# Extract 'imdbId', 'title', 'year' and 'posterUrl' from the HTML of a movie page and
# return them as a dict.
# NOTE(review): this span appears to be a rendered diff: indentation is stripped and two
# revisions are interleaved, including what looks like a separate helper
# (parsePosterPage) further down - confirm against the repository before editing.
def parseMoviePage(html):
data = {}
# The IMDb id is whatever follows "imdb.com/title/tt" up to the next space.
data['imdbId'] = findRe(html, 'imdb.com/title/tt(.*?) ')
# Title/year, variant anchored on a <table> header and an "eligible.html" link.
data['title'] = stripTags(findRe(html, '<table WIDTH="400" BGCOLOR="#222222">(.*?) \(<a href="eligible.html">'))
data['year'] = findRe(html, '\(<a href="eligible.html">(.*?)</a>\)')
result = findRe(html, '<a href = (\w*?_xlg.html) target= _blank>')
# Title/year, variant anchored on <p class="name white"> and an "alpha1.html" link; the
# _xlg link pattern here no longer requires the "target= _blank" suffix.
data['title'] = stripTags(findRe(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
data['year'] = findRe(html, '\(<a href="alpha1.html">(.*?)</a>\)')
result = findRe(html, '<a href = (\w*?_xlg.html)')
# When the page links to an "_xlg" page, fetch that page (this rebinds `html`) and take
# the poster image from it; the URL is built under the year directory.
# timeout = -1: meaning not visible here - TODO confirm in oxlib.cache.
if result:
url = 'http://impawards.com/%s/%s' % (data['year'], result)
html = getUrlUnicode(url, timeout = -1)
# Variant that delegates to parsePosterPage.
d = parsePosterPage(html, data['year'])
data['posterUrl'] = d['posterUrl']
# Variant that reads the first <img SRC="..."> of the fetched page inline.
data['posterUrl'] = 'http://impawards.com/%s/%s' % (data['year'], findRe(html, '<img SRC="(.*?)"'))
# No "_xlg" link: take the poster image from the movie page itself (two alternative
# patterns appear below, one directly after `else:` and one after the helper).
else:
data['posterUrl'] = 'http://impawards.com/%s/%s' % (data['year'], findRe(html, '<td align=center><br><img SRC="(.*?)"'))
return data
# Helper: build the poster URL from the first <img SRC="..."> of a poster page.
def parsePosterPage(html, year):
data = {}
data['posterUrl'] = 'http://impawards.com/%s/%s' % (year, findRe(html, '<img SRC="(.*?)"'))
# Alternative else-branch pattern: only matches images whose src starts with "posters".
data['posterUrl'] = 'http://impawards.com/%s/%s' % (data['year'], findRe(html, '<img src="(posters.*?)" alt='))
return data
def archivePosters():
@ -64,16 +59,20 @@ def archivePosters():
from oxlib.net import getUrl
pathname = '/Volumes/Rolux Home/Desktop/Data/impawards.com'
html = getUrlUnicode('http://impawards.com/archives/latest.html', timeout = 0)
pages = int(findRe(html, '<a href = page(.*?).html>'))
pages = int(findRe(html, '<a href= page(.*?).html>'))
for page in range(pages + 1, 0, -1):
print "Page %d of %d" % (page, pages)
if page <= pages:
html = getUrlUnicode('http://impawards.com/archives/page%s.html' % page, timeout = -1)
urls = parseArchivePage(html)
print urls
for url in urls:
html = getUrlUnicode(url)
data = parseMoviePage(html)
dirname = '%s/%s/%s' % (pathname, data['imdbId'][:4], data['imdbId'])
print data
if '"' in data['posterUrl']:
print url
sys.exit()
dirname = '%s/%s/%s/%s' % (pathname, data['imdbId'][:1], data['imdbId'][:4], data['imdbId'])
filename = '%s/%s' % (dirname, os.path.split(data['posterUrl'])[1])
if not os.path.exists(filename):
jpg = getUrl(data['posterUrl'])
@ -83,7 +82,15 @@ def archivePosters():
f.write(jpg)
f.close()
def cleanup(pathname = '/Volumes/Rolux Home/Desktop/Data/impawards.com'):
    '''
    Delete every file below pathname whose name contains a double quote.

    pathname defaults to the directory this function was originally
    hard-wired to, so a bare cleanup() call behaves as before. The name of
    each deleted file is printed. Errors from os.remove are not caught.
    '''
    for dirname, dirs, files in os.walk(pathname):
        for filename in files:
            if '"' in filename:
                # print(...) with a single argument prints the same text on
                # Python 2 and Python 3.
                print(filename)
                os.remove(os.path.join(dirname, filename))
# Script entry point: runs the poster archiver, then a single getMovieData lookup.
# The cleanup() call is left commented out.
# NOTE(review): getMovieData is not defined in the visible part of this file -
# presumably it lives in a part of the module not shown here; confirm.
if __name__ == '__main__':
# cleanup()
archivePosters()
getMovieData('Brick', 'Rian Johnson')