diff --git a/ox/criterion.py b/ox/criterion.py
index 6daf4d7..d1fcb10 100644
--- a/ox/criterion.py
+++ b/ox/criterion.py
@@ -22,16 +22,16 @@ def getMovieId(title = '', director = '', imdbId = ''):
return ''
def getMovieData(title = '', director = '', imdbId = ''):
+ data = {}
if not imdbId:
imdbId = imdb.getMovieId(title, director)
id = getMovieId(imdbId = imdbId)
if id:
html = getUrlUnicode('http://criterion.com/asp/release.asp?id=%s' % id)
- data = {}
- data['synopsis'] = stripTags(findRe(html, '
Synopsis
(.*?)'))
+ data['id'] = id
data['posterUrl'] = 'http://criterion.com/content/images/full_boxshot/%s_box_348x490.jpg' % id
- return data
- return {}
+ data['synopsis'] = stripTags(findRe(html, 'Synopsis
(.*?)'))
+ return data
if __name__ == '__main__':
print getMovieData('Le mepris', 'Jean-Luc Godard')
\ No newline at end of file
diff --git a/ox/impawards.py b/ox/impawards.py
new file mode 100644
index 0000000..71881d5
--- /dev/null
+++ b/ox/impawards.py
@@ -0,0 +1,45 @@
+import re
+
+import ox.imdb as imdb
+from oxutils.cache import getUrlUnicode
+from oxutils.text import findRe
+
+
+def getMovieData(title = '', director = '', imdbId = ''):
+ data = {'posterUrls': []}
+ if not imdbId:
+ imdbId = imdb.getMovieId(title, director)
+ print imdbId
+ html = getUrlUnicode('http://impawards.com/archives/latest.html', timeout = 0)
+ pages = int(findRe(html, ''))
+ for page in range(pages + 1, 0, -1):
+ print page
+ if page <= pages:
+ html = getUrlUnicode('http://impawards.com/archives/page%s.html' % page, timeout = -1)
+ urls = parseArchivePage(html)
+ print urls
+ for url in urls:
+ html = getUrlUnicode(url)
+ d = parseMoviePage(html)
+ print d
+ if d['imdbId'] == imdbId:
+ data['posterUrls'].append(d['posterUrl'])
+ print d['posterUrl']
+ data['posterUrls'].sort()
+ return data
+
+def parseArchivePage(html):
+ urls = []
+ results = re.compile('', re.DOTALL).findall(html)
+ for result in results:
+ urls.append('http://impawards.com/%s' % result)
+ return urls
+
+def parseMoviePage(html):
+ data = {}
+ data['imdbId'] = findRe(html, 'imdb.com/title/tt(.*?) ')
+ data['posterUrl'] = 'http://impawards.com/%s' % findRe(html, '
![]((.*?)) |