diff --git a/ox/imdb.py b/ox/imdb.py
index 0fe5dde..91f3298 100644
--- a/ox/imdb.py
+++ b/ox/imdb.py
@@ -57,6 +57,8 @@ def getRawMovieData(imdbId):
data['media']['images'] = getMovieImages(imdbId)
data['media']['trailers'] = getMovieTrailers(imdbId)
data['plotsummary'] = getMoviePlot(imdbId)
+ data['release dates'] = getMovieReleaseDates(imdbId)
+ data['release date'] = getMovieReleaseDate(imdbId)
return data
def getMovieInfo(imdbId):
@@ -294,7 +296,6 @@ def getMovieKeywords(imdbId):
keywords.append(k)
return keywords
-
def getMovieExternalReviews(imdbId):
url = "%s/externalreviews" % getUrlBase(imdbId)
data = getUrlUnicode(url)
@@ -314,6 +315,126 @@ def getMovieExternalReviews(imdbId):
return ret
return {}
def getMovieReleaseDate(imdbId):
    """Return the earliest release date for the title, or '' if none found.

    Parsed dates are 'YYYY-MM-DD' strings (see getMovieReleaseDates), so a
    plain lexicographic minimum picks the earliest one.
    """
    dates = [entry[1] for entry in getMovieReleaseDates(imdbId)]
    if not dates:
        return ''
    return min(dates)
+
def getMovieReleaseDates(imdbId):
    """Scrape the title's /releaseinfo page.

    Returns a list of (country, date, comment) string tuples.  Dates that
    parse as '%d %B %Y' are normalized to 'YYYY-MM-DD'; anything else
    (e.g. partial dates like 'May 2008') is kept verbatim.
    """
    url = "%s/releaseinfo" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
    # NOTE(review): this pattern appears to have lost its HTML tags in
    # transit -- verify against the original scraper before relying on it.
    regexp = '''
(.*?) | .*?(.*?) | .*?(.*?) |
'''

    def _parse_date(d):
        # Normalize '25 December 2008' -> '2008-12-25'; fall back to the
        # raw string when strptime cannot handle the format.
        try:
            return time.strftime('%Y-%m-%d', time.strptime(d, "%d %B %Y"))
        except ValueError:
            return d

    releasedates = []
    for r in re.compile(regexp, re.DOTALL).findall(data):
        releasedates.append((stripTags(r[0]).strip(),
                             _parse_date(stripTags(r[1]).strip()),
                             decodeHtml(stripTags(r[2]).strip())))
    # The old BeautifulSoup-based implementation that followed the return
    # here was unreachable dead code and has been removed.
    return releasedates
+
def getMovieBusinessSum(imdbId):
    """Condense getMovieBusiness() output into integer totals.

    Returns {'budget': int, 'gross': int, 'profit': int} where gross
    includes any 'weekend gross' entries and profit = gross - budget
    (only when both are non-zero).
    """
    def _total(amounts):
        # amounts are money strings; intValue presumably extracts the
        # numeric part -- TODO confirm against its definition.
        total = 0
        for amount in amounts:
            total += int(intValue(amount.replace(',', '')))
        return total

    business = getMovieBusiness(imdbId)
    result = {'budget': 0, 'gross': 0, 'profit': 0}
    if 'budget' in business:
        result['budget'] = _total(business['budget'])
    if 'gross' in business:
        result['gross'] = _total(business['gross'])
    if 'weekend gross' in business:
        result['gross'] += _total(business['weekend gross'])
    if result['budget'] and result['gross']:
        result['profit'] = result['gross'] - result['budget']
    return result
+
def getMovieFlimingDates(imdbId):
    # NOTE(review): name typo ('Fliming' vs 'Filming') kept so existing
    # callers keep working.
    """Return the first 'filming dates' entry from the business page, or ''."""
    dates = getMovieBusiness(imdbId).get('filming dates')
    if dates:
        return dates[0]
    return ''
+
+def getMovieBusiness(imdbId):
+ url = "%s/business" % getUrlBase(imdbId)
+ data = getUrlUnicode(url)
+ business = {}
+ for r in re.compile('''(.*?)
(.*?)
.
''', re.DOTALL).findall(data):
+ key = stripTags(r[0]).strip().lower()
+ value = [decodeHtml(stripTags(b).strip()) for b in r[1].split('
')]
+ business[key] = value
+ return business
+ soup = BeautifulSoup(data)
+ business = {'budget': 0, 'gross': 0, 'profit': 0}
+ content = soup('div', {'id': 'tn15content'})[0]
+ blocks = unicode(content).split('')[1:]
+ for c in blocks:
+ cs = BeautifulSoup(c)
+ line = c.split('
')
+ if line:
+ title = line[0]
+ line = line[1]
+ if title in ['Budget', 'Gross']:
+ values = re.compile('\$(.*?) ').findall(line)
+ values = [int(value.replace(',','')) for value in values]
+ if values:
+ business[title.lower()] = max(values)
+ if business['budget'] and business['gross']:
+ business['profit'] = business['gross'] - business['budget']
+ return business
+
def getMovieEpisodes(imdbId):
    """Scrape the title's /episodes page.

    Returns a dict keyed 'SxxEyy' whose values carry 'imdb', 'title',
    'description' and 'date' ('YYYY-MM-DD' or '') fields.  Parsing is
    best-effort: a broken row is logged and skipped, not fatal.
    """
    url = "%s/episodes" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
    episodes = {}
    # NOTE(review): this pattern appears HTML-stripped in transit; as shown
    # it captures fewer groups than the code reads (r[5]) -- restore the
    # original pattern before use.
    regexp = r'''Season (.*?), Episode (.*?): (.*?)
(.*?)
(.*?)
'''
    for r in re.compile(regexp, re.DOTALL).findall(data):
        try:
            episode = "S%02dE%02d" % (int(r[0]), int(r[1]))
            episodes[episode] = {}
            episodes[episode]['imdb'] = r[2]
            episodes[episode]['title'] = r[3].strip()
            # Blank out auto-generated titles like 'Episode #4...'.
            if episodes[episode]['title'].startswith('Episode #%d' % int(r[0])):
                episodes[episode]['title'] = u''
            description = decodeHtml(r[5])
            description = stripTags(description.split('Next US airings:')[0])
            episodes[episode]['description'] = description.strip()
            episodes[episode]['date'] = ''
            try:
                d = stripTags(r[4])
                d = d.replace('Original Air Date: ', '')
                episodes[episode]['date'] = time.strftime(
                    "%Y-%m-%d", time.strptime(d, '%d %B %Y'))
            except ValueError:
                # Unparseable air date: leave the empty default.
                pass
        except Exception:
            # Fixed: the original `print traceback.print_exc()` printed the
            # call's None return value (and is Python-2-only syntax).
            import traceback
            traceback.print_exc()
    return episodes
+
'''the old code below'''
class IMDb:
@@ -321,11 +442,6 @@ class IMDb:
self.imdb = imdbId
self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
- self.businessUrl = "%sbusiness" % self.pageUrl
- self.creditsUrl = "%sfullcredits" % self.pageUrl
- self.episodesUrl = "%sepisodes" % self.pageUrl
- self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
-
def getPage(self):
return getUrlUnicode(self.pageUrl)
@@ -393,13 +509,18 @@ class IMDb:
title = normalizeTitle(title)
if title.startswith('"') and title.find('"',1) > 0 and \
title.find('"',1) == title.rfind('"'):
+ data = self.getPage()
se = re.compile("Season (\d*), Episode (\d*)\)").findall(data)
if se:
se = se[0]
- se = ' (S%02dE%02d)' % (int(se[0]), int(se[1]))
- title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:]
+ se = ' (S%02dE%02d) ' % (int(se[0]), int(se[1]))
+ title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:].strip()
else:
- title = normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:]
+ part2 = title[title.rfind('"')+1:]
+ part2 = re.sub("[\d\?-]", "", part2).strip()
+ title = normalizeTitle(title[1:title.rfind('"')])
+ if part2:
+ title += ':' + part2
return normalizeTitle(title)
def parseYear(self):
@@ -462,7 +583,7 @@ class IMDb:
#is episode
IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')
- IMDbDict['episodes'] = self.parseEpisodes()
+ IMDbDict['episodes'] = getMovieEpisodes(self.imdb)
if IMDbDict['episodes']:
IMDbDict['tvshow'] = True
else:
@@ -474,8 +595,8 @@ class IMDb:
IMDbDict['trivia'] = getMovieTrivia(self.imdb)
IMDbDict['connections'] = getMovieConnections(self.imdb)
IMDbDict['locations'] = getMovieLocations(self.imdb)
- IMDbDict['release_date'] = self.parseReleaseinfo()
- IMDbDict['business'] = self.parseBusiness()
+ IMDbDict['release_date'] = getMovieReleaseDate(self.imdb)
+ IMDbDict['business'] = getMovieBusinessSum(self.imdb)
IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
IMDbDict['stills'] = getMovieStills(self.imdb)
#IMDbDict['trailer'] = getMovieTrailer(self.imdb)
@@ -503,80 +624,6 @@ class IMDb:
self.credits = credits
return self.credits
- def parseEpisodes(self):
- episodes = {}
- data = getUrlUnicode(self.episodesUrl)
- cdata = data.replace('\r\n', ' ')
- regexp = r'''Season (.*?), Episode (.*?): (.*?)
(.*?)
(.*?)
'''
- reg = re.compile(regexp, re.IGNORECASE)
- m = reg.findall(cdata)
- for match in m:
- try:
- episode = "S%02dE%02d" % (int(match[0]), int(match[1]))
- episodes[episode] = {}
- episodes[episode]['imdb'] = match[2]
- episodes[episode]['title'] = match[3].strip()
- if episodes[episode]['title'].startswith('Episode #%d'%int(match[0])):
- episodes[episode]['title'] = u''
- description = decodeHtml(match[5])
- description = stripTags(description.split('Next US airings:')[0])
- episodes[episode]['description'] = description
- episodes[episode]['date'] = ''
- try:
- d = stripTags(match[4])
- d = d.replace('Original Air Date: ', '')
- d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
- episodes[episode]['date'] = d
- except:
- pass
- except:
- import traceback
- print traceback.print_exc()
- pass
- self.episodes = episodes
- return self.episodes
-
- def getReleaseinfo(self):
- return getUrlUnicode(self.releaseinfoUrl)
-
- def parseReleaseinfo(self):
- soup = BeautifulSoup(self.getReleaseinfo())
- info = soup('table',{'border': '0', 'cellpadding':'2'})
- if info:
- for row in info[0]('tr'):
- d = row('td', {'align':'right'})
- if d:
- try:
- possible_date = stripTags(unicode(d[0])).strip()
- rdate = time.strptime(possible_date, "%d %B %Y")
- rdate = time.strftime('%Y-%m-%d', rdate)
- return rdate
- except:
- pass
- return None
-
- def getBusiness(self):
- return getUrlUnicode(self.businessUrl)
-
- def parseBusiness(self):
- soup = BeautifulSoup(self.getBusiness())
- business = {'budget': 0, 'gross': 0, 'profit': 0}
- content = soup('div', {'id': 'tn15content'})[0]
- blocks = unicode(content).split('')[1:]
- for c in blocks:
- cs = BeautifulSoup(c)
- line = c.split('
')
- if line:
- title = line[0]
- line = line[1]
- if title in ['Budget', 'Gross']:
- values = re.compile('\$(.*?) ').findall(line)
- values = [int(value.replace(',','')) for value in values]
- if values:
- business[title.lower()] = max(values)
- if business['budget'] and business['gross']:
- business['profit'] = business['gross'] - business['budget']
- return business
def guess(title, director=''):
#FIXME: proper file -> title