diff --git a/oxweb/criterion.py b/oxweb/criterion.py
index 16deef3..3f2c1c3 100644
--- a/oxweb/criterion.py
+++ b/oxweb/criterion.py
@@ -4,62 +4,102 @@ import re
from oxlib.cache import getUrlUnicode
from oxlib.html import stripTags
+from oxlib.net import getUrl
from oxlib.text import findRe, removeSpecialCharacters
import imdb
-def getData(criterionId):
+def getIds():
+ ids = []
+ html = getUrlUnicode("http://www.criterion.com/library/dvd")
+ results = re.compile("page=(.*?)\"").findall(html)
+ pages = int(results[len(results) - 2])
+ for page in range(1, pages + 1):
+ html = getUrlUnicode("http://www.criterion.com/library/dvd?page=" + str(page))
+ results = re.compile("films/(.*?)\"").findall(html)
+ for result in results:
+ ids.append(result)
+ results = re.compile("boxsets/(.*?)\"").findall(html)
+ for result in results:
+ html = getUrlUnicode("http://www.criterion.com/boxsets/" + result)
+ results = re.compile("films/(.*?)\"").findall(html)
+ for result in results:
+ ids.append(result)
+ return map(lambda id: str(id), sorted(map(lambda id: int(id), set(ids))))
+
+def getData(id):
'''
- >>> getData(348)['imdbId']
- '0068205'
+ >>> getData('1333')['imdbId']
+ '0060304'
+
+ >>> getData('236')['posterUrl']
+ 'http://criterion_production.s3.amazonaws.com/release_images/1586/ThirdManReplace.jpg'
+
+ >>> getData('786')['posterUrl']
+ 'http://criterion_production.s3.amazonaws.com/product_images/185/343_box_348x490.jpg'
'''
data = {}
- html = getUrlUnicode('http://criterion.com/asp/release.asp?id=%s' % criterionId)
- data['criterionId'] = criterionId
- data['posterUrl'] = getPosterUrl(criterionId)
- data['synopsis'] = stripTags(findRe(html, '
Synopsis
(.*?)'))
- result = re.compile("The Criterion Collection: (.*?) by (.*?)").findall(html)
- data['title'] = stripTags(result[0][0])
- data['director'] = stripTags(result[0][1])
- data['imdbId'] = imdb.getMovieId(data['title'], data['director'])
+ data['id'] = id
+ try:
+ html = getUrlUnicode("http://www.criterion.com/films/" + id)
+ except:
+ html = getUrl("http://www.criterion.com/films/" + id)
+ data["number"] = findRe(html, "(.*?)
")
+ data["title"] = findRe(html, "(.*?)
")
+ data["director"] = findRe(html, "(.*?)
")
+ results = re.compile("(.*?)
").findall(html)
+ data["country"] = results[0]
+ data["year"] = results[1]
+ result = findRe(html, "(.*?)
")
+ data["synopsis"] = findRe(result, "(.*?)
")
+ result = findRe(html, "(.*?)
")
+ if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
+ result = re.compile("(.*?)
", re.DOTALL).findall(html)[1]
+ result = findRe(result, "")
+ if not "/boxsets/" in result:
+ data["posterUrl"] = result
+ else:
+ html_ = getUrlUnicode(result)
+ result = findRe(html_, "(.*?)" % id)
+ result = findRe(result, "src=\"(.*?)\"")
+ data["posterUrl"] = result.replace("_w100", "")
+ result = findRe(html, "(.*?)').split('')
- strings.pop(0)
- for string in strings:
- id = findRe(string, '"release.asp\?id=(.*?)"')
- criterionTitle = findRe(string, 'class="title">(.*?)')
- criterionTitle = re.sub('(?<=\\w)
(?=\\w)', ' / ', criterionTitle)
- criterionTitle = criterionTitle.replace('
', '')
- criterionDirector = stripTags(findRe(string, '.*?(.*?)')).strip()
- if imdb.getMovieId(criterionTitle, criterionDirector) == imdbId:
+ imdbId = imdb.getMovieId(title, director, year)
+ ids = getIds()
+ for id in ids:
+ data = getData(id)
+ if imdb.getMovieId(data['title'], data['director'], data['year']) == imdbId:
return id
return ''
-def getMovieData(title = '', director = '', imdbId = ''):
+def getMovieData(title = '', director = '', year = '', imdbId = ''):
'''
- >>> getMovieData('Le mepris', 'Jean-Luc Godard')['id']
- '171'
+ >>> getMovieData('Pierrot le fou', 'Jean-Luc Godard', '1965')['id']
+ '149'
'''
data = {}
if not imdbId:
- imdbId = imdb.getMovieId(title, director)
+ imdbId = imdb.getMovieId(title, director, year)
id = getMovieId(imdbId = imdbId)
if id:
- html = getUrlUnicode('http://criterion.com/asp/release.asp?id=%s' % id)
- data['id'] = id
- data['posterUrl'] = getPosterUrl(id)
- data['synopsis'] = stripTags(findRe(html, 'Synopsis
(.*?)'))
+ data_ = getData(id)
+ data['id'] = data_['id']
+ data['posterUrl'] = data_['posterUrl']
+ data['synopsis'] = data_['synopsis']
return data
diff --git a/oxweb/impawards.py b/oxweb/impawards.py
index e9e3928..e7a63eb 100644
--- a/oxweb/impawards.py
+++ b/oxweb/impawards.py
@@ -1,5 +1,6 @@
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
+import os
import re
from oxlib.cache import getUrlUnicode
@@ -42,21 +43,15 @@ def parseArchivePage(html):
def parseMoviePage(html):
data = {}
data['imdbId'] = findRe(html, 'imdb.com/title/tt(.*?) ')
- data['title'] = stripTags(findRe(html, '