get more things out of IMDb class
commit 454eefb7cb (parent 55c5cdfa99)

1 changed file with 133 additions and 86 deletions

ox/imdb.py: 219 lines changed (+133, -86)
--- a/ox/imdb.py
+++ b/ox/imdb.py
@@ -57,6 +57,8 @@ def getRawMovieData(imdbId):
     data['media']['images'] = getMovieImages(imdbId)
     data['media']['trailers'] = getMovieTrailers(imdbId)
     data['plotsummary'] = getMoviePlot(imdbId)
+    data['release dates'] = getMovieReleaseDates(imdbId)
+    data['release date'] = getMovieReleaseDate(imdbId)
     return data
 
 def getMovieInfo(imdbId):
@@ -294,7 +296,6 @@ def getMovieKeywords(imdbId):
         keywords.append(k)
     return keywords
 
-
 def getMovieExternalReviews(imdbId):
     url = "%s/externalreviews" % getUrlBase(imdbId)
     data = getUrlUnicode(url)
@@ -314,6 +315,126 @@ def getMovieExternalReviews(imdbId):
         return ret
     return {}
 
+def getMovieReleaseDate(imdbId):
+    releasedates = getMovieReleaseDates(imdbId)
+    first_release = ''
+    for r in releasedates:
+        if not first_release or r[1] < first_release:
+            first_release = r[1]
+    return first_release
+
+def getMovieReleaseDates(imdbId):
+    url = "%s/releaseinfo" % getUrlBase(imdbId)
+    data = getUrlUnicode(url)
+    releasedates = []
+    regexp = '''<tr><td>(.*?)</td>.*?<td align="right">(.*?)</td>.*?<td>(.*?)</td></tr>'''
+
+    def _parse_date(d):
+        try:
+            parsed_date = time.strptime(d, "%d %B %Y")
+            parsed_date = time.strftime('%Y-%m-%d', parsed_date)
+            return parsed_date
+        except:
+            return d
+
+    for r in re.compile(regexp, re.DOTALL).findall(data):
+        r_ = (stripTags(r[0]).strip(),
+              _parse_date(stripTags(r[1]).strip()),
+              decodeHtml(stripTags(r[2]).strip()))
+        releasedates.append(r_)
+    return releasedates
+    soup = BeautifulSoup(data)
+    info = soup('table',{'border': '0', 'cellpadding':'2'})
+    if info:
+        for row in info[0]('tr'):
+            d = row('td', {'align':'right'})
+            if d:
+                try:
+                    possible_date = stripTags(unicode(d[0])).strip()
+                    rdate = time.strptime(possible_date, "%d %B %Y")
+                    rdate = time.strftime('%Y-%m-%d', rdate)
+                    return rdate
+                except:
+                    pass
+    return None
+
+def getMovieBusinessSum(imdbId):
+    business = getMovieBusiness(imdbId)
+    b_ = {'budget': 0, 'gross': 0, 'profit': 0}
+    if 'budget' in business:
+        b_['budget'] = sum([int(intValue(i.replace(',', ''))) for i in business['budget']])
+    if 'gross' in business:
+        b_['gross'] = sum([int(intValue(i.replace(',', ''))) for i in business['gross']])
+    if 'weekend gross' in business:
+        b_['gross'] += sum([int(intValue(i.replace(',', ''))) for i in business['weekend gross']])
+    if b_['budget'] and b_['gross']:
+        b_['profit'] = b_['gross'] - b_['budget']
+    return b_
+
+def getMovieFlimingDates(imdbId):
+    business = getMovieBusiness(imdbId)
+    if 'filming dates' in business and business['filming dates']:
+        return business['filming dates'][0]
+    return ''
+
+def getMovieBusiness(imdbId):
+    url = "%s/business" % getUrlBase(imdbId)
+    data = getUrlUnicode(url)
+    business = {}
+    for r in re.compile('''<h5>(.*?)</h5>(.*?)<br/>.<br/>''', re.DOTALL).findall(data):
+        key = stripTags(r[0]).strip().lower()
+        value = [decodeHtml(stripTags(b).strip()) for b in r[1].split('<br/>')]
+        business[key] = value
+    return business
+    soup = BeautifulSoup(data)
+    business = {'budget': 0, 'gross': 0, 'profit': 0}
+    content = soup('div', {'id': 'tn15content'})[0]
+    blocks = unicode(content).split('<h5>')[1:]
+    for c in blocks:
+        cs = BeautifulSoup(c)
+        line = c.split('</h5>')
+        if line:
+            title = line[0]
+            line = line[1]
+            if title in ['Budget', 'Gross']:
+                values = re.compile('\$(.*?) ').findall(line)
+                values = [int(value.replace(',','')) for value in values]
+                if values:
+                    business[title.lower()] = max(values)
+    if business['budget'] and business['gross']:
+        business['profit'] = business['gross'] - business['budget']
+    return business
+
+def getMovieEpisodes(imdbId):
+    url = "%s/episodes" % getUrlBase(imdbId)
+    data = getUrlUnicode(url)
+    episodes = {}
+    regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
+    for r in re.compile(regexp, re.DOTALL).findall(data):
+        try:
+            episode = "S%02dE%02d" % (int(r[0]), int(r[1]))
+            episodes[episode] = {}
+            episodes[episode]['imdb'] = r[2]
+            episodes[episode]['title'] = r[3].strip()
+            if episodes[episode]['title'].startswith('Episode #%d'%int(r[0])):
+                episodes[episode]['title'] = u''
+            description = decodeHtml(r[5])
+            description = stripTags(description.split('Next US airings:')[0])
+            episodes[episode]['description'] = description.strip()
+            episodes[episode]['date'] = ''
+            try:
+                d = stripTags(r[4])
+                d = d.replace('Original Air Date: ', '')
+                d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
+                episodes[episode]['date'] = d
+            except:
+                pass
+        except:
+            import traceback
+            print traceback.print_exc()
+            pass
+    return episodes
+
 '''the old code below'''
 
 class IMDb:
@@ -321,11 +442,6 @@ class IMDb:
         self.imdb = imdbId
         self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
 
-        self.businessUrl = "%sbusiness" % self.pageUrl
-        self.creditsUrl = "%sfullcredits" % self.pageUrl
-        self.episodesUrl = "%sepisodes" % self.pageUrl
-        self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
-
     def getPage(self):
         return getUrlUnicode(self.pageUrl)
 
@@ -393,13 +509,18 @@ class IMDb:
         title = normalizeTitle(title)
         if title.startswith('"') and title.find('"',1) > 0 and \
            title.find('"',1) == title.rfind('"'):
+            data = self.getPage()
             se = re.compile("Season (\d*), Episode (\d*)\)").findall(data)
             if se:
                 se = se[0]
-                se = ' (S%02dE%02d)' % (int(se[0]), int(se[1]))
-                title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:]
+                se = ' (S%02dE%02d) ' % (int(se[0]), int(se[1]))
+                title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:].strip()
             else:
-                title = normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:]
+                part2 = title[title.rfind('"')+1:]
+                part2 = re.sub("[\d\?-]", "", part2).strip()
+                title = normalizeTitle(title[1:title.rfind('"')])
+                if part2:
+                    title += ':' + part2
         return normalizeTitle(title)
 
     def parseYear(self):
@@ -462,7 +583,7 @@ class IMDb:
             #is episode
             IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')
 
-        IMDbDict['episodes'] = self.parseEpisodes()
+        IMDbDict['episodes'] = getMovieEpisodes(self.imdb)
        if IMDbDict['episodes']:
             IMDbDict['tvshow'] = True
         else:
@@ -474,8 +595,8 @@ class IMDb:
         IMDbDict['trivia'] = getMovieTrivia(self.imdb)
         IMDbDict['connections'] = getMovieConnections(self.imdb)
         IMDbDict['locations'] = getMovieLocations(self.imdb)
-        IMDbDict['release_date'] = self.parseReleaseinfo()
-        IMDbDict['business'] = self.parseBusiness()
+        IMDbDict['release_date'] = getMovieReleaseDate(self.imdb)
+        IMDbDict['business'] = getMovieBusinessSum(self.imdb)
         IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
         IMDbDict['stills'] = getMovieStills(self.imdb)
         #IMDbDict['trailer'] = getMovieTrailer(self.imdb)
@@ -503,80 +624,6 @@ class IMDb:
         self.credits = credits
         return self.credits
 
-    def parseEpisodes(self):
-        episodes = {}
-        data = getUrlUnicode(self.episodesUrl)
-        cdata = data.replace('\r\n', ' ')
-        regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
-        reg = re.compile(regexp, re.IGNORECASE)
-        m = reg.findall(cdata)
-        for match in m:
-            try:
-                episode = "S%02dE%02d" % (int(match[0]), int(match[1]))
-                episodes[episode] = {}
-                episodes[episode]['imdb'] = match[2]
-                episodes[episode]['title'] = match[3].strip()
-                if episodes[episode]['title'].startswith('Episode #%d'%int(match[0])):
-                    episodes[episode]['title'] = u''
-                description = decodeHtml(match[5])
-                description = stripTags(description.split('Next US airings:')[0])
-                episodes[episode]['description'] = description
-                episodes[episode]['date'] = ''
-                try:
-                    d = stripTags(match[4])
-                    d = d.replace('Original Air Date: ', '')
-                    d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
-                    episodes[episode]['date'] = d
-                except:
-                    pass
-            except:
-                import traceback
-                print traceback.print_exc()
-                pass
-        self.episodes = episodes
-        return self.episodes
-
-    def getReleaseinfo(self):
-        return getUrlUnicode(self.releaseinfoUrl)
-
-    def parseReleaseinfo(self):
-        soup = BeautifulSoup(self.getReleaseinfo())
-        info = soup('table',{'border': '0', 'cellpadding':'2'})
-        if info:
-            for row in info[0]('tr'):
-                d = row('td', {'align':'right'})
-                if d:
-                    try:
-                        possible_date = stripTags(unicode(d[0])).strip()
-                        rdate = time.strptime(possible_date, "%d %B %Y")
-                        rdate = time.strftime('%Y-%m-%d', rdate)
-                        return rdate
-                    except:
-                        pass
-        return None
-
-    def getBusiness(self):
-        return getUrlUnicode(self.businessUrl)
-
-    def parseBusiness(self):
-        soup = BeautifulSoup(self.getBusiness())
-        business = {'budget': 0, 'gross': 0, 'profit': 0}
-        content = soup('div', {'id': 'tn15content'})[0]
-        blocks = unicode(content).split('<h5>')[1:]
-        for c in blocks:
-            cs = BeautifulSoup(c)
-            line = c.split('</h5>')
-            if line:
-                title = line[0]
-                line = line[1]
-                if title in ['Budget', 'Gross']:
-                    values = re.compile('\$(.*?) ').findall(line)
-                    values = [int(value.replace(',','')) for value in values]
-                    if values:
-                        business[title.lower()] = max(values)
-        if business['budget'] and business['gross']:
-            business['profit'] = business['gross'] - business['budget']
-        return business
-
 def guess(title, director=''):
     #FIXME: proper file -> title
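
Taken together, the hunks replace the IMDb-class scraping methods (parseEpisodes, parseReleaseinfo, parseBusiness) with module-level functions keyed only by an IMDb id. A minimal usage sketch, assuming ox is importable as a package and imdb.com is reachable; the id '0133093' is an arbitrary example:

    from ox import imdb

    imdbId = '0133093'
    dates = imdb.getMovieReleaseDates(imdbId)   # [(country, 'YYYY-MM-DD', comment), ...]
    first = imdb.getMovieReleaseDate(imdbId)    # earliest of those dates
    money = imdb.getMovieBusinessSum(imdbId)    # {'budget': ..., 'gross': ..., 'profit': ...}
    shows = imdb.getMovieEpisodes(imdbId)       # {} for features; {'S01E01': {...}, ...} for series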
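
getMovieReleaseDate simply keeps the smallest date string returned by getMovieReleaseDates. That works because _parse_date normalizes dates to zero-padded '%Y-%m-%d', which sorts chronologically under plain string comparison; a quick illustration with made-up dates:

    # '%Y-%m-%d' strings order chronologically, so min-by-string is enough
    dates = ['1999-06-11', '1999-03-31', '1999-04-08']
    first = ''
    for d in dates:
        if not first or d < first:
            first = d
    print first   # 1999-03-31
    # Caveat: a date _parse_date could not parse is kept verbatim (e.g.
    # 'April 1999') and then compares as an arbitrary string.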
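
getMovieBusinessSum reduces the string lists from getMovieBusiness to three integers, folding 'weekend gross' into 'gross'. A self-contained sketch of that arithmetic; the intValue stub only mimics what ox's intValue is assumed to do (extract the leading digit run), and the dollar figures are invented:

    import re

    def intValue(s):
        # assumed stand-in for ox's intValue: first run of digits in the string
        m = re.search(r'\d+', s)
        return m.group(0) if m else '0'

    business = {'budget': ['$63,000,000 (estimated)'],
                'gross': ['$171,479,930 (USA)'],
                'weekend gross': ['$27,788,331 (USA)']}
    budget = sum(int(intValue(i.replace(',', ''))) for i in business['budget'])
    gross = sum(int(intValue(i.replace(',', ''))) for i in business['gross'])
    gross += sum(int(intValue(i.replace(',', ''))) for i in business['weekend gross'])
    print budget, gross, gross - budget   # 63000000 199268261 136268261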