new criterion module
This commit is contained in:
2 changed files with 98 additions and 51 deletions
@ -4,62 +4,102 @@ import re
from oxlib.cache import getUrlUnicode
from oxlib.html import stripTags
from import getUrl
from oxlib.text import findRe, removeSpecialCharacters
import imdb
def getData(criterionId):
def getIds():
    """Collect the ids of every film linked from the paginated listing.

    The first listing page is fetched to find the number of pages. Each page
    is then scanned for ``films/<id>"`` links, and every ``boxsets/<id>"``
    link on it is followed so the films on that boxset page are gathered too.

    Returns:
        A list of unique id strings, sorted by their integer value.

    Raises:
        IndexError: if the first page has fewer than two ``page=`` links.
        ValueError: if a captured page number or id is not an integer.
    """
    # NOTE(review): the URL literals below are empty in this copy of the
    # source (they look stripped); restore them before running.
    film_pattern = re.compile("films/(.*?)\"")
    boxset_pattern = re.compile("boxsets/(.*?)\"")
    ids = set()
    html = getUrlUnicode("")
    page_links = re.compile("page=(.*?)\"").findall(html)
    # The second-to-last "page=" link is used as the page count.
    # NOTE(review): presumably the last link is a "next" link - confirm.
    pages = int(page_links[-2])
    for page in range(1, pages + 1):
        html = getUrlUnicode("" + str(page))
        ids.update(film_pattern.findall(html))
        for boxset in boxset_pattern.findall(html):
            # Separate names keep the boxset page from clobbering the listing
            # page and the list currently being iterated.
            boxset_html = getUrlUnicode("" + boxset)
            ids.update(film_pattern.findall(boxset_html))
    return [str(number) for number in sorted(int(film_id) for film_id in ids)]
# getData: fetch the page for one film id and return a dict with the keys
# id, number, title, director, country, year, synopsis, posterUrl, stillUrl,
# trailerUrl and imdbId.
# NOTE(review): this span is a flattened diff view. Indentation is gone, the
# usage examples below have lost their docstring quotes and expected results,
# and pre-change and post-change statements appear back to back. The notes
# further down mark the places that need confirming against the repository.
def getData(id):
>>> getData(348)['imdbId']
>>> getData('1333')['imdbId']
>>> getData('236')['posterUrl']
>>> getData('786')['posterUrl']
data = {}
# NOTE(review): the next eight statements read `criterionId`, which is not a
# parameter of getData(id); they look like the superseded body - confirm.
html = getUrlUnicode('' % criterionId)
data['criterionId'] = criterionId
data['posterUrl'] = getPosterUrl(criterionId)
data['synopsis'] = stripTags(findRe(html, '<h3>Synopsis</h3>(.*?)</div>'))
result = re.compile("<title>The Criterion Collection: (.*?) by (.*?)</title>").findall(html)
data['title'] = stripTags(result[0][0])
data['director'] = stripTags(result[0][1])
data['imdbId'] = imdb.getMovieId(data['title'], data['director'])
data['id'] = id
# NOTE(review): two consecutive fetches; as written the second result replaces
# the first. Possibly a fallback whose surrounding keywords were lost - confirm.
html = getUrlUnicode("" + id)
html = getUrl("" + id)
data["number"] = findRe(html, "<p class=\"spinenumber\">(.*?)</p>")
data["title"] = findRe(html, "<h2 class=\"movietitle\">(.*?)</h2>")
data["director"] = findRe(html, "<h2 class=\"director\">(.*?)</h2>")
# The first two <p><strong>...</strong></p> matches are stored as country and year.
results = re.compile("<p><strong>(.*?)</strong></p>").findall(html)
data["country"] = results[0]
data["year"] = results[1]
# Synopsis: the first <p> inside the synopsis content box.
result = findRe(html, "<div class=\"synopsis contentbox lightgray\">(.*?)</div>")
data["synopsis"] = findRe(result, "<p>(.*?)</p>")
# Poster: start from the first "editioninfo" block; when it mentions Blu-Ray
# or Essential Art House DVD, the second "editioninfo" block is taken instead
# (presumably only in that case - the statement's indentation is lost).
result = findRe(html, "<div class=\"editioninfo\">(.*?)</div>")
if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
result = re.compile("<div class=\"editioninfo\">(.*?)</div>", re.DOTALL).findall(html)[1]
result = findRe(result, "<a href=\"(.*?)\">")
# NOTE(review): presumably an `else:` separated the direct-link case from the
# boxset lookup that follows; without it posterUrl is always overwritten.
# NOTE(review): the `% id` pattern below has no placeholder in this copy, so
# the link text inside it looks stripped - confirm.
if not "/boxsets/" in result:
data["posterUrl"] = result
html_ = getUrlUnicode(result)
result = findRe(html_, "<a href=\"\">(.*?)</a>" % id)
result = findRe(result, "src=\"(.*?)\"")
data["posterUrl"] = result.replace("_w100", "")
# Still/trailer: use the "Film Still" image when one is found.
# NOTE(review): presumably an `else:` preceded the thumbnailURL/videoURL
# lookups; as written they always overwrite the two values set under `if`.
result = findRe(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
if result:
data["stillUrl"] = result
data["trailerUrl"] = ""
data["stillUrl"] = findRe(html, "\"thumbnailURL\", \"(.*?)\"")
data["trailerUrl"] = findRe(html, "\"videoURL\", \"(.*?)\"")
# The IMDb id is resolved from the scraped title, director and year.
data['imdbId'] = imdb.getMovieId(data['title'], data['director'], data['year'])
return data
def getCriterionIds():
    """Return every id captured from ``release.asp?id=...`` links on the fetched page."""
    # NOTE(review): the URL literal is empty in this copy of the source.
    page = getUrlUnicode('')
    id_pattern = re.compile('release.asp\?id=(.*?)"')
    return id_pattern.findall(page)
def getPosterUrl(id):
    """Return the 'posterUrl' entry of the dict that getData builds for *id*."""
    return getData(id)['posterUrl']
def getPosterUrl(criterionId):
    """Format the poster URL template with *criterionId* and return it.

    NOTE(review): the template literal is empty in this copy of the source,
    so it has no placeholder for the id - confirm the intended template.
    """
    template = ''
    return template % criterionId
def getMovieId(title = '', director = '', year = '', imdbId = ''):
    """Return the site id of the film that matches an IMDb id.

    When *imdbId* is not given it is resolved from *title*, *director* and
    *year* with imdb.getMovieId. Every id from getIds() is then loaded with
    getData() and its own IMDb id is resolved the same way.

    Returns:
        The first id whose resolved IMDb id equals *imdbId*, or '' when no
        film matches.
    """
    if not imdbId:
        imdbId = imdb.getMovieId(title, director, year)
    for film_id in getIds():
        data = getData(film_id)
        candidate = imdb.getMovieId(data['title'], data['director'], data['year'])
        # The comparison must happen on the lookup's result. It used to sit
        # inside the argument list, so a boolean was passed as the year and
        # the first film with any IMDb id was returned.
        if candidate == imdbId:
            return film_id
    return ''
def getMovieData(title = '', director = '', year = '', imdbId = ''):
    """Return id, poster URL and synopsis for a film, or {} when none matches.

    When *imdbId* is not given it is resolved from *title*, *director* and
    *year* with imdb.getMovieId. The film is then located with getMovieId and
    its details are taken from getData.

    Example call (the expected value is not shown in the source):
        getMovieData('Pierrot le fou', 'Jean-Luc Godard', '1965')['id']

    Returns:
        A dict with the keys 'id', 'posterUrl' and 'synopsis', or an empty
        dict when getMovieId finds no matching film.
    """
    data = {}
    if not imdbId:
        imdbId = imdb.getMovieId(title, director, year)
    movie_id = getMovieId(imdbId = imdbId)
    if movie_id:
        # All fields come from getData, so there is one scraping path and the
        # page is not fetched and parsed a second time here.
        details = getData(movie_id)
        for key in ('id', 'posterUrl', 'synopsis'):
            data[key] = details[key]
    return data
@ -1,5 +1,6 @@
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
import os
import re
from oxlib.cache import getUrlUnicode
@ -42,21 +43,15 @@ def parseArchivePage(html):
# parseMoviePage: pull imdbId, title, year and posterUrl out of a movie page's
# HTML and return them as a dict.
# NOTE(review): this span is a flattened diff view. Indentation is lost and
# several statements appear twice with different patterns; presumably the
# earlier one of each pair is the superseded version - confirm.
def parseMoviePage(html):
data = {}
# NOTE(review): the pattern below looks truncated in this copy (it starts with
# a quantifier), so part of the literal seems to have been stripped.
data['imdbId'] = findRe(html, '*?) ')
data['title'] = stripTags(findRe(html, '<table WIDTH="400" BGCOLOR="#222222">(.*?) \(<a href="eligible.html">'))
data['year'] = findRe(html, '\(<a href="eligible.html">(.*?)</a>\)')
result = findRe(html, '<a href = (\w*?_xlg.html) target= _blank>')
data['title'] = stripTags(findRe(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
data['year'] = findRe(html, '\(<a href="alpha1.html">(.*?)</a>\)')
result = findRe(html, '<a href = (\w*?_xlg.html)')
# When a link to an "_xlg.html" page is found, that page is fetched and the
# poster image is read from it.
# NOTE(review): the '' templates are formatted with two values but contain no
# placeholders in this copy; the URL text appears stripped.
# NOTE(review): the final posterUrl assignment presumably sat in a separate
# branch for pages without an "_xlg" link - confirm.
if result:
url = '' % (data['year'], result)
html = getUrlUnicode(url, timeout = -1)
d = parsePosterPage(html, data['year'])
data['posterUrl'] = d['posterUrl']
data['posterUrl'] = '' % (data['year'], findRe(html, '<img SRC="(.*?)"'))
data['posterUrl'] = '' % (data['year'], findRe(html, '<td align=center><br><img SRC="(.*?)"'))
return data
# parsePosterPage: build a dict whose 'posterUrl' is formatted from *year* and
# the first <img SRC="..."> match in *html*.
# NOTE(review): the second posterUrl assignment reads data['year'], which this
# function never sets; in this flattened diff view it presumably belongs to
# parseMoviePage - confirm. The '' templates also appear stripped.
def parsePosterPage(html, year):
data = {}
data['posterUrl'] = '' % (year, findRe(html, '<img SRC="(.*?)"'))
data['posterUrl'] = '' % (data['year'], findRe(html, '<img src="(posters.*?)" alt='))
return data
# archivePosters: walk the archive pages from page number `pages + 1` down to
# 1, parse each page's movie URLs, and for every movie build a target path
# under `pathname` from its imdbId and the poster's file name, fetching the
# poster when that path does not exist yet.
# NOTE(review): this span is a flattened diff view with a hunk header inside
# it. Indentation is lost, the `from import getUrl` line is missing its module
# name, URL literals are empty, and the statements after the final fetch are
# not visible here.
def archivePosters():
@ -64,16 +59,20 @@ def archivePosters():
from import getUrl
pathname = '/Volumes/Rolux Home/Desktop/Data/'
html = getUrlUnicode('', timeout = 0)
# The page count is read from the first "page<N>.html" link; the two lines
# differ only in the spacing of the pattern (presumably old and new versions).
pages = int(findRe(html, '<a href = page(.*?).html>'))
pages = int(findRe(html, '<a href= page(.*?).html>'))
for page in range(pages + 1, 0, -1):
print "Page %d of %d" % (page, pages)
# For page numbers up to `pages` the numbered page is fetched; the first
# iteration (pages + 1) presumably reuses the html fetched above - confirm.
if page <= pages:
html = getUrlUnicode('' % page, timeout = -1)
urls = parseArchivePage(html)
print urls
for url in urls:
html = getUrlUnicode(url)
data = parseMoviePage(html)
dirname = '%s/%s/%s' % (pathname, data['imdbId'][:4], data['imdbId'])
print data
# A double quote in the poster URL is reported by printing the page URL.
if '"' in data['posterUrl']:
print url
# Directory layout: <pathname>/<first char>/<first 4 chars>/<imdbId>.
dirname = '%s/%s/%s/%s' % (pathname, data['imdbId'][:1], data['imdbId'][:4], data['imdbId'])
filename = '%s/%s' % (dirname, os.path.split(data['posterUrl'])[1])
if not os.path.exists(filename):
jpg = getUrl(data['posterUrl'])
@ -83,7 +82,15 @@ def archivePosters():
def cleanup(root = '/Volumes/Rolux Home/Desktop/Data/'):
    """Delete every file below *root* whose name contains a double quote.

    The tree is walked with os.walk; each offending file name is printed and
    the file is removed.

    Args:
        root: directory to scan. Defaults to the archive location this module
            uses, so existing ``cleanup()`` calls behave as before.
    """
    for dirname, _dirs, files in os.walk(root):
        for filename in files:
            if '"' in filename:
                # The single-argument call form prints the same text on
                # Python 2 and also parses on Python 3.
                print(filename)
                os.remove(os.path.join(dirname, filename))
# Script entry point: when the module is run directly it performs a single
# getMovieData lookup; the cleanup() call is left commented out.
if __name__ == '__main__':
# cleanup()
getMovieData('Brick', 'Rian Johnson')
Add table
Reference in a new issue