diff --git a/ox/imdb.py b/ox/imdb.py index 0fe5dde..91f3298 100644 --- a/ox/imdb.py +++ b/ox/imdb.py @@ -57,6 +57,8 @@ def getRawMovieData(imdbId): data['media']['images'] = getMovieImages(imdbId) data['media']['trailers'] = getMovieTrailers(imdbId) data['plotsummary'] = getMoviePlot(imdbId) + data['release dates'] = getMovieReleaseDates(imdbId) + data['release date'] = getMovieReleaseDate(imdbId) return data def getMovieInfo(imdbId): @@ -294,7 +296,6 @@ def getMovieKeywords(imdbId): keywords.append(k) return keywords - def getMovieExternalReviews(imdbId): url = "%s/externalreviews" % getUrlBase(imdbId) data = getUrlUnicode(url) @@ -314,6 +315,126 @@ def getMovieExternalReviews(imdbId): return ret return {} +def getMovieReleaseDate(imdbId): + releasedates = getMovieReleaseDates(imdbId) + first_release = '' + for r in releasedates: + if not first_release or r[1] < first_release: + first_release = r[1] + return first_release + +def getMovieReleaseDates(imdbId): + url = "%s/releaseinfo" % getUrlBase(imdbId) + data = getUrlUnicode(url) + releasedates = [] + regexp = '''(.*?).*?(.*?).*?(.*?)''' + + def _parse_date(d): + try: + parsed_date = time.strptime(d, "%d %B %Y") + parsed_date = time.strftime('%Y-%m-%d', parsed_date) + return parsed_date + except: + return d + + for r in re.compile(regexp, re.DOTALL).findall(data): + r_ = (stripTags(r[0]).strip(), + _parse_date(stripTags(r[1]).strip()), + decodeHtml(stripTags(r[2]).strip())) + releasedates.append(r_) + return releasedates + soup = BeautifulSoup(data) + info = soup('table',{'border': '0', 'cellpadding':'2'}) + if info: + for row in info[0]('tr'): + d = row('td', {'align':'right'}) + if d: + try: + possible_date = stripTags(unicode(d[0])).strip() + rdate = time.strptime(possible_date, "%d %B %Y") + rdate = time.strftime('%Y-%m-%d', rdate) + return rdate + except: + pass + return None + +def getMovieBusinessSum(imdbId): + business = getMovieBusiness(imdbId) + b_ = {'budget': 0, 'gross': 0, 'profit': 0} + if 'budget' in business: + b_['budget'] = sum([int(intValue(i.replace(',', ''))) for i in business['budget']]) + if 'gross' in business: + b_['gross'] = sum([int(intValue(i.replace(',', ''))) for i in business['gross']]) + if 'weekend gross' in business: + b_['gross'] += sum([int(intValue(i.replace(',', ''))) for i in business['weekend gross']]) + if b_['budget'] and b_['gross']: + b_['profit'] = b_['gross'] - b_['budget'] + return b_ + +def getMovieFlimingDates(imdbId): + business = getMovieBusiness(imdbId) + if 'filming dates' in business and business['filming dates']: + return business['filming dates'][0] + return '' + +def getMovieBusiness(imdbId): + url = "%s/business" % getUrlBase(imdbId) + data = getUrlUnicode(url) + business = {} + for r in re.compile('''
(.*?)
(.*?)
.
''', re.DOTALL).findall(data): + key = stripTags(r[0]).strip().lower() + value = [decodeHtml(stripTags(b).strip()) for b in r[1].split('
')] + business[key] = value + return business + soup = BeautifulSoup(data) + business = {'budget': 0, 'gross': 0, 'profit': 0} + content = soup('div', {'id': 'tn15content'})[0] + blocks = unicode(content).split('
')[1:] + for c in blocks: + cs = BeautifulSoup(c) + line = c.split('
') + if line: + title = line[0] + line = line[1] + if title in ['Budget', 'Gross']: + values = re.compile('\$(.*?) ').findall(line) + values = [int(value.replace(',','')) for value in values] + if values: + business[title.lower()] = max(values) + if business['budget'] and business['gross']: + business['profit'] = business['gross'] - business['budget'] + return business + +def getMovieEpisodes(imdbId): + url = "%s/episodes" % getUrlBase(imdbId) + data = getUrlUnicode(url) + episodes = {} + regexp = r'''

Season (.*?), Episode (.*?): (.*?)

(.*?)
(.*?)
''' + for r in re.compile(regexp, re.DOTALL).findall(data): + try: + episode = "S%02dE%02d" % (int(r[0]), int(r[1])) + episodes[episode] = {} + episodes[episode]['imdb'] = r[2] + episodes[episode]['title'] = r[3].strip() + if episodes[episode]['title'].startswith('Episode #%d'%int(r[0])): + episodes[episode]['title'] = u'' + description = decodeHtml(r[5]) + description = stripTags(description.split('Next US airings:')[0]) + episodes[episode]['description'] = description.strip() + episodes[episode]['date'] = '' + try: + d = stripTags(r[4]) + d = d.replace('Original Air Date: ', '') + d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y')) + episodes[episode]['date'] = d + except: + pass + except: + import traceback + print traceback.print_exc() + pass + return episodes + '''the old code below''' class IMDb: @@ -321,11 +442,6 @@ class IMDb: self.imdb = imdbId self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb - self.businessUrl = "%sbusiness" % self.pageUrl - self.creditsUrl = "%sfullcredits" % self.pageUrl - self.episodesUrl = "%sepisodes" % self.pageUrl - self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl - def getPage(self): return getUrlUnicode(self.pageUrl) @@ -393,13 +509,18 @@ class IMDb: title = normalizeTitle(title) if title.startswith('"') and title.find('"',1) > 0 and \ title.find('"',1) == title.rfind('"'): + data = self.getPage() se = re.compile("Season (\d*), Episode (\d*)\)").findall(data) if se: se = se[0] - se = ' (S%02dE%02d)' % (int(se[0]), int(se[1])) - title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:] + se = ' (S%02dE%02d) ' % (int(se[0]), int(se[1])) + title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:].strip() else: - title = normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:] + part2 = title[title.rfind('"')+1:] + part2 = re.sub("[\d\?-]", "", part2).strip() + title = normalizeTitle(title[1:title.rfind('"')]) + if part2: + title += ':' + part2 return normalizeTitle(title) def parseYear(self): @@ -462,7 +583,7 @@ class IMDb: #is episode IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '') - IMDbDict['episodes'] = self.parseEpisodes() + IMDbDict['episodes'] = getMovieEpisodes(self.imdb) if IMDbDict['episodes']: IMDbDict['tvshow'] = True else: @@ -474,8 +595,8 @@ class IMDb: IMDbDict['trivia'] = getMovieTrivia(self.imdb) IMDbDict['connections'] = getMovieConnections(self.imdb) IMDbDict['locations'] = getMovieLocations(self.imdb) - IMDbDict['release_date'] = self.parseReleaseinfo() - IMDbDict['business'] = self.parseBusiness() + IMDbDict['release_date'] = getMovieReleaseDate(self.imdb) + IMDbDict['business'] = getMovieBusinessSum(self.imdb) IMDbDict['reviews'] = getMovieExternalReviews(self.imdb) IMDbDict['stills'] = getMovieStills(self.imdb) #IMDbDict['trailer'] = getMovieTrailer(self.imdb) @@ -503,80 +624,6 @@ class IMDb: self.credits = credits return self.credits - def parseEpisodes(self): - episodes = {} - data = getUrlUnicode(self.episodesUrl) - cdata = data.replace('\r\n', ' ') - regexp = r'''

Season (.*?), Episode (.*?): (.*?)

(.*?)
(.*?)
''' - reg = re.compile(regexp, re.IGNORECASE) - m = reg.findall(cdata) - for match in m: - try: - episode = "S%02dE%02d" % (int(match[0]), int(match[1])) - episodes[episode] = {} - episodes[episode]['imdb'] = match[2] - episodes[episode]['title'] = match[3].strip() - if episodes[episode]['title'].startswith('Episode #%d'%int(match[0])): - episodes[episode]['title'] = u'' - description = decodeHtml(match[5]) - description = stripTags(description.split('Next US airings:')[0]) - episodes[episode]['description'] = description - episodes[episode]['date'] = '' - try: - d = stripTags(match[4]) - d = d.replace('Original Air Date: ', '') - d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y')) - episodes[episode]['date'] = d - except: - pass - except: - import traceback - print traceback.print_exc() - pass - self.episodes = episodes - return self.episodes - - def getReleaseinfo(self): - return getUrlUnicode(self.releaseinfoUrl) - - def parseReleaseinfo(self): - soup = BeautifulSoup(self.getReleaseinfo()) - info = soup('table',{'border': '0', 'cellpadding':'2'}) - if info: - for row in info[0]('tr'): - d = row('td', {'align':'right'}) - if d: - try: - possible_date = stripTags(unicode(d[0])).strip() - rdate = time.strptime(possible_date, "%d %B %Y") - rdate = time.strftime('%Y-%m-%d', rdate) - return rdate - except: - pass - return None - - def getBusiness(self): - return getUrlUnicode(self.businessUrl) - - def parseBusiness(self): - soup = BeautifulSoup(self.getBusiness()) - business = {'budget': 0, 'gross': 0, 'profit': 0} - content = soup('div', {'id': 'tn15content'})[0] - blocks = unicode(content).split('
')[1:] - for c in blocks: - cs = BeautifulSoup(c) - line = c.split('
') - if line: - title = line[0] - line = line[1] - if title in ['Budget', 'Gross']: - values = re.compile('\$(.*?) ').findall(line) - values = [int(value.replace(',','')) for value in values] - if values: - business[title.lower()] = max(values) - if business['budget'] and business['gross']: - business['profit'] = business['gross'] - business['budget'] - return business def guess(title, director=''): #FIXME: proper file -> title