From e01af3fdd7579499e4506f7f386ad05205f88d11 Mon Sep 17 00:00:00 2001 From: Rolux Date: Sat, 4 Jul 2009 12:25:24 +0200 Subject: [PATCH] new criterion module --- oxweb/criterion.py | 114 ++++++++++++++++++++++++++++++--------------- oxweb/impawards.py | 35 ++++++++------ 2 files changed, 98 insertions(+), 51 deletions(-) diff --git a/oxweb/criterion.py b/oxweb/criterion.py index 16deef3..3f2c1c3 100644 --- a/oxweb/criterion.py +++ b/oxweb/criterion.py @@ -4,62 +4,102 @@ import re from oxlib.cache import getUrlUnicode from oxlib.html import stripTags +from oxlib.net import getUrl from oxlib.text import findRe, removeSpecialCharacters import imdb -def getData(criterionId): +def getIds(): + ids = [] + html = getUrlUnicode("http://www.criterion.com/library/dvd") + results = re.compile("page=(.*?)\"").findall(html) + pages = int(results[len(results) - 2]) + for page in range(1, pages + 1): + html = getUrlUnicode("http://www.criterion.com/library/dvd?page=" + str(page)) + results = re.compile("films/(.*?)\"").findall(html) + for result in results: + ids.append(result) + results = re.compile("boxsets/(.*?)\"").findall(html) + for result in results: + html = getUrlUnicode("http://www.criterion.com/boxsets/" + result) + results = re.compile("films/(.*?)\"").findall(html) + for result in results: + ids.append(result) + return map(lambda id: str(id), sorted(map(lambda id: int(id), set(ids)))) + +def getData(id): ''' - >>> getData(348)['imdbId'] - '0068205' + >>> getData('1333')['imdbId'] + '0060304' + + >>> getData('236')['posterUrl'] + 'http://criterion_production.s3.amazonaws.com/release_images/1586/ThirdManReplace.jpg' + + >>> getData('786')['posterUrl'] + 'http://criterion_production.s3.amazonaws.com/product_images/185/343_box_348x490.jpg' ''' data = {} - html = getUrlUnicode('http://criterion.com/asp/release.asp?id=%s' % criterionId) - data['criterionId'] = criterionId - data['posterUrl'] = getPosterUrl(criterionId) - data['synopsis'] = stripTags(findRe(html, '

Synopsis

(.*?)')) - result = re.compile("The Criterion Collection: (.*?) by (.*?)").findall(html) - data['title'] = stripTags(result[0][0]) - data['director'] = stripTags(result[0][1]) - data['imdbId'] = imdb.getMovieId(data['title'], data['director']) + data['id'] = id + try: + html = getUrlUnicode("http://www.criterion.com/films/" + id) + except: + html = getUrl("http://www.criterion.com/films/" + id) + data["number"] = findRe(html, "

(.*?)

") + data["title"] = findRe(html, "

(.*?)

") + data["director"] = findRe(html, "

(.*?)

") + results = re.compile("

(.*?)

").findall(html) + data["country"] = results[0] + data["year"] = results[1] + result = findRe(html, "
(.*?)
") + data["synopsis"] = findRe(result, "

(.*?)

") + result = findRe(html, "
(.*?)
") + if 'Blu-Ray' in result or 'Essential Art House DVD' in result: + result = re.compile("
(.*?)
", re.DOTALL).findall(html)[1] + result = findRe(result, "") + if not "/boxsets/" in result: + data["posterUrl"] = result + else: + html_ = getUrlUnicode(result) + result = findRe(html_, "(.*?)" % id) + result = findRe(result, "src=\"(.*?)\"") + data["posterUrl"] = result.replace("_w100", "") + result = findRe(html, "\"Film(.*?)').split('') - strings.pop(0) - for string in strings: - id = findRe(string, '"release.asp\?id=(.*?)"') - criterionTitle = findRe(string, 'class="title">(.*?)') - criterionTitle = re.sub('(?<=\\w)
(?=\\w)', ' / ', criterionTitle) - criterionTitle = criterionTitle.replace('
', '') - criterionDirector = stripTags(findRe(string, '.*?(.*?)')).strip() - if imdb.getMovieId(criterionTitle, criterionDirector) == imdbId: + imdbId = imdb.getMovieId(title, director, year) + ids = getIds() + for id in ids: + data = getData(id) + if imdb.getMovieId(data['title'], data['director'], data['year'] == imdbId): return id return '' -def getMovieData(title = '', director = '', imdbId = ''): +def getMovieData(title = '', director = '', year = '', imdbId = ''): ''' - >>> getMovieData('Le mepris', 'Jean-Luc Godard')['id'] - '171' + >>> getMovieData('Pierrot le fou', 'Jean-Luc Godard', '1965')['id'] + '149' ''' data = {} if not imdbId: - imdbId = imdb.getMovieId(title, director) + imdbId = imdb.getMovieId(title, director, year) id = getMovieId(imdbId = imdbId) if id: - html = getUrlUnicode('http://criterion.com/asp/release.asp?id=%s' % id) - data['id'] = id - data['posterUrl'] = getPosterUrl(id) - data['synopsis'] = stripTags(findRe(html, '

Synopsis

(.*?)')) + data_ = getData(id) + data['id'] = data_['id'] + data['posterUrl'] = data_['posterUrl'] + data['synopsis'] = data_['synopsis'] return data diff --git a/oxweb/impawards.py b/oxweb/impawards.py index e9e3928..e7a63eb 100644 --- a/oxweb/impawards.py +++ b/oxweb/impawards.py @@ -1,5 +1,6 @@ # vi:si:et:sw=4:sts=4:ts=4 # encoding: utf-8 +import os import re from oxlib.cache import getUrlUnicode @@ -42,21 +43,15 @@ def parseArchivePage(html): def parseMoviePage(html): data = {} data['imdbId'] = findRe(html, 'imdb.com/title/tt(.*?) ') - data['title'] = stripTags(findRe(html, '(.*?) \(')) - data['year'] = findRe(html, '\((.*?)\)') - result = findRe(html, '') + data['title'] = stripTags(findRe(html, '

(.*?) \(')) + data['year'] = findRe(html, '\((.*?)\)') + result = findRe(html, '
))
     return data
 
 def archivePosters():
@@ -64,16 +59,20 @@ def archivePosters():
     from oxlib.net import getUrl
     pathname = ')) + pages = int(findRe(html, '
')) for page in range(pages + 1, 0, -1): + print "Page %d of %d" % (page, pages) if page <= pages: html = getUrlUnicode('http://impawards.com/archives/page%s.html' % page, timeout = -1) urls = parseArchivePage(html) - print urls for url in urls: html = getUrlUnicode(url) data = parseMoviePage(html) - dirname = '%s/%s/%s' % (pathname, data['imdbId'][:4], data['imdbId']) + print data + if '"' in data['posterUrl']: + print url + sys.exit() + dirname = '%s/%s/%s/%s' % (pathname, data['imdbId'][:1], data['imdbId'][:4], data['imdbId']) filename = '%s/%s' % (dirname, os.path.split(data['posterUrl'])[1]) if not os.path.exists(filename): jpg = getUrl(data['posterUrl']) @@ -83,7 +82,15 @@ def archivePosters(): f.write(jpg) f.close() +def cleanup(): + for dirname, dirs, files in os.walk('/Volumes/Rolux Home/Desktop/Data/impawards.com'): + for filename in files: + if '"' in filename: + print filename + os.remove(dirname + '/' + filename) + if __name__ == '__main__': + # cleanup() archivePosters() getMovieData('Brick', 'Rian Johnson')