# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import urllib2
from urllib import quote, unquote
import re
import os
import time

import chardet

import oxlib
from oxlib import stripTags, decodeHtml, findRe, findString
import oxlib.cache
from oxlib.normalize import normalizeTitle, normalizeImdbId
from oxlib import *

import google

'''
Never time out IMDb data; to update the cache, remove the data from the cache folder.
'''
def readUrlUnicode(url, data=None, headers=oxlib.cache.DEFAULT_HEADERS, timeout=-1):
    return oxlib.cache.readUrlUnicode(url, data, headers, timeout)

'''
check if result is valid while updating
def validate(result, header):
    return header['status'] == u'200'

try:
    d = oxlib.cache.readUrlUnicode(url, data, headers, timeout=0, valid=validate)
except oxlib.cache.InvalidResult, e:
    print e.headers
'''

def getMovieId(title, director='', year=''):
    '''
    >>> getMovieId('The Matrix')
    '0133093'
    '''
    if year:
        title = "%s (%s)" % (title, year)
    if director:
        query = 'site:imdb.com %s "%s"' % (director, title)
    else:
        query = 'site:imdb.com "%s"' % title
    for (name, url, desc) in google.find(query, 3, timeout=-1):
        if url.startswith('http://www.imdb.com/title/tt'):
            return url[28:35]
    return ''

def getMovieData(imdbId):
    return IMDb(imdbId).parse()

# internal functions below

def getUrlBase(imdbId):
    return "http://www.imdb.com/title/tt%s/" % imdbId

def getRawMovieData(imdbId):
    imdbId = normalizeImdbId(imdbId)
    data = getMovieInfo(imdbId)
    data['credits'] = getMovieCredits(imdbId)
    data['poster'] = getMoviePoster(imdbId)
    data['company credits'] = getMovieCompanyCredits(imdbId)
    data['filming locations'] = getMovieLocations(imdbId)
    data['movie connections'] = getMovieConnections(imdbId)
    data['external reviews'] = getMovieExternalReviews(imdbId)
    data['trivia'] = getMovieTrivia(imdbId)
    data['keywords'] = getMovieKeywords(imdbId)
    data['media'] = {}
    data['media']['images'] = getMovieImages(imdbId)
    data['media']['trailers'] = getMovieTrailers(imdbId)
    data['plotsummary'] = getMoviePlot(imdbId)
    data['release dates'] = getMovieReleaseDates(imdbId)
    data['release date'] = getMovieReleaseDate(imdbId)
    return data
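# Usage sketch (illustrative helper, not referenced elsewhere in the module):
# resolve a title to an IMDb id via the google helper, then fetch the parsed
# info dict. The title/year values are arbitrary examples and the calls go
# through the oxlib cache, so they may hit the network on first use.
def _example_lookup():
    imdbId = getMovieId('The Matrix', year='1999')
    if imdbId:
        info = getMovieInfo(imdbId)
        print info['title'], info['year']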
def getMovieInfo(imdbId, timeout=-1):
    data = readUrlUnicode(getUrlBase(imdbId), timeout=timeout)
    info = dict()
    info['poster'] = findRe(data, 'name="poster".*?(.*?):(.*?)')[0]
    for i in re.compile('(.*?):(.*?)', re.DOTALL).findall(data):
        title = stripTags(i[0]).strip().lower()
        txt = i[1]
        txt = stripTags(txt).strip()

        def cleanUp(k):
            k = decodeHtml(k).replace(u'\xa0', ' ').strip()
            if k.endswith('more'):
                k = k[:-len('more')].strip()
            return k
        txt = cleanUp(txt)
        if title not in ('plot', 'trivia', 'filming locations', 'mpaa', 'tagline', 'original air date'):
            if '|' in txt:
                txt = [cleanUp(k) for k in txt.split('|')]
            elif ', ' in txt:
                txt = [cleanUp(k) for k in txt.split(', ')]
            elif title in ('country', 'language', 'genre'):
                txt = [cleanUp(txt), ]
        if title == 'tv series':
            info['series_imdb'] = findRe(i[1], 'tt(\d{7})')
        if title == 'original air date':
            info['series_episode_info'] = txt.split('\n')[-1].strip()
            txt = txt.split('\n')[0].strip()
        if not title.startswith('moviemeter'):
            info[title] = txt

    for key in ('user comments', 'writers (wga)', 'plot keywords'):
        if key in info:
            del info[key]
    if 'release date' in info:
        if isinstance(info['release date'], list):
            info['release date'] = info['release date'][0]
        info['release date'] = info['release date'].split('\n')[0]
    if 'plot' in info:
        info['plot'] = info['plot'].split('| add synopsis')[0].strip()
        info['plot'] = info['plot'].split('| full synopsis')[0].strip()
        if info['plot'] in ('add synopsis', 'full synopsis'):
            info['plot'] = ''

    #get Title
    title = ''
    year = ''
    html_title = findRe(data, '(.*?)')
    if not html_title:
        html_title = findRe(data, '(.*?)')
    else:
        html_title = html_title.split('')[0]
    if html_title:
        html_title = html_title.replace('', ' ').replace(' ', ' ')
        title = stripTags(html_title)
        title = decodeHtml(title)
        year = findRe(title, '\((\d{4})\)')
        if not year:
            year = findRe(title, '\((\d{4})')
        _y = findRe(title, r'(\([0-9\?]{4}[/IVXLCDM]*?\))')
        if _y:
            title = title.replace(_y, '')
        for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
            title = title.replace(t, '')
        title = title.strip()
        if title.find(u'\xa0') > -1:
            title = title[:title.find(u'\xa0')].strip()
        if title.startswith('"') and title.endswith('"'):
            title = title[1:-1]
    info['title'] = normalizeTitle(title)
    info['year'] = year

    #Series
    if title.startswith('"') and title.find('"', 1) > 0 and \
       title.find('"', 1) == title.rfind('"'):
        episode_title = title[title.rfind('"')+1:]
        episode_title = re.sub("\?{4}", "", episode_title).strip()
        episode_title = re.sub("\d{4}", "", episode_title).strip()
        if episode_title == '-':
            episode_title = ''
        title = normalizeTitle(title[1:title.rfind('"')])
        if episode_title:
            info['episode title'] = episode_title
            info['series title'] = title
            info['title'] = "%s: %s" % (title, episode_title)
        else:
            info['title'] = title
        se = re.compile("Season (\d*), Episode (\d*)\)").findall(info.get('series_episode_info', ''))
        if se:
            info['season'] = int(se[0][0])
            info['episode'] = int(se[0][1])
            info['title'] = "%s (S%02dE%02d) %s" % (
                info['series title'], info['season'], info['episode'], info['episode title'])
            info['title'] = info['title'].strip()
            del info['series_episode_info']

    #Rating
    rating = findRe(data, '([\d\.]*?)/10')
    if rating:
        info['rating'] = float(rating)
    else:
        info['rating'] = -1

    #Votes
    info['votes'] = -1
    if "user rating" in info:
        if isinstance(info['user rating'], list):
            info['user rating'] = ' '.join(info['user rating'])
        votes = findRe(info['user rating'], '([\d,]*?) votes')
        if votes:
            info['votes'] = int(votes.replace(',', ''))
    return info
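# Sketch of how a getMovieInfo() result is typically consumed (illustrative
# helper only). Which keys exist depends on the IMDb page; 'rating' and
# 'votes' fall back to -1 when they could not be parsed, so check before use.
def _example_print_info(imdbId):
    info = getMovieInfo(imdbId)
    for key in ('title', 'year', 'genre', 'runtime'):
        if key in info:
            print '%s: %s' % (key, info[key])
    if info['rating'] != -1:
        print 'rating: %s/10' % info['rating']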
def getMovieRuntimeSeconds(imdbId):
    info = getMovieInfo(imdbId)
    if 'runtime' in info:
        value = info['runtime'][0]
        parsed_value = findRe(value, '(.*?) min')
        parsed_value = findRe(parsed_value, '([0-9]+)')
        if not parsed_value:
            parsed_value = findRe(value, '(.*?) sec')
            parsed_value = findRe(parsed_value, '([0-9]+)')
            if not parsed_value:
                parsed_value = 0
            else:
                parsed_value = int(parsed_value)
        else:
            parsed_value = int(parsed_value) * 60
    else:
        parsed_value = -1
    return parsed_value

def getMoviePoster(imdbId):
    info = getMovieInfo(imdbId)
    return info['poster']

def getMovieYear(imdbId):
    '''
    >>> getMovieYear('0315404')
    u'1964'
    >>> getMovieYear('0734840')
    u'1990'
    >>> getMovieYear('0815352')
    u'1964'
    '''
    info = getMovieInfo(imdbId)
    return info['year']

def getMovieTitle(imdbId):
    '''
    >>> getMovieTitle('0306414')
    u'The Wire'
    >>> getMovieTitle('0734840')
    u'Twin Peaks (S01E02) Episode #1.2'
    >>> getMovieTitle('0734840')
    u'Twin Peaks (S01E02) Episode #1.2'
    >>> getMovieTitle('0749451')
    u'The Wire (S01E01) The Target'
    '''
    info = getMovieInfo(imdbId)
    return info['title']

def getMovieAKATitles(imdbId):
    '''
    >>> getMovieAKATitles('0040980')
    [(u'Frauen der Nacht', u'Germany'), (u'Les femmes de la nuit', u'France'), (u'Women of the Night', u'(undefined)')]
    '''
    url = "%sreleaseinfo" % getUrlBase(imdbId)
    data = readUrlUnicode(url)
    titles = findRe(data, 'name="akas".*?(.*?)')
    titles = re.compile("td>(.*?)\n\n(.*)").findall(titles)
    return titles

def creditList(data, section=None):
    if section == 'cast':
        credits_ = re.compile('''(.*?).*?(.*?)''').findall(data)
    else:
        credits_ = re.compile('''.*?(.*?)(.*?)''').findall(data)
    credits = []
    for c_ in credits_:
        c = [stripTags(decodeHtml(c_[0]).strip()), stripTags(decodeHtml(c_[1]).strip())]
        if section == 'writers':
            c[1] = c[1].replace('', '').strip().replace(')', '').replace('(', '')
            if c[1].endswith(' and'):
                c[1] = c[1][:-4]
        credits.append(c)
    return credits

def getMovieCredits(imdbId):
    credits = dict()
    url = "%sfullcredits" % getUrlBase(imdbId)
    data = readUrlUnicode(url)
    groups = data.split('')
    for g in groups:
        section = re.compile('''name="(.*?)".*? href="/Glossary''').findall(g)
        if section:
            credits[section[0]] = creditList(g, section[0])
    return credits
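# Sketch of reading the dict returned by getMovieCredits() (illustrative helper
# only). Keys are the section names found on the fullcredits page, e.g.
# 'directors' or 'cast'; each entry is a [name, role/note] pair from creditList().
def _example_print_directors(imdbId):
    credits = getMovieCredits(imdbId)
    for name, note in credits.get('directors', []):
        print name, note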
def getMovieTrailers(imdbId):
    from BeautifulSoup import BeautifulSoup
    url = "%strailers" % getUrlBase(imdbId)
    data = readUrlUnicode(url)
    soup = BeautifulSoup(data)
    videos = soup('div', {'class': "video-gallery"})
    trailers = []
    if videos:
        for a in videos[0]('a'):
            title = stripTags(unicode(a)).strip()
            url = 'http://www.imdb.com' + a['href']
            videoId = findRe(url, '/(vi\d*?)/')
            iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
            iframe = readUrlUnicode(iframeUrl)
            videoUrl = unquote(findRe(iframe, 'addVariable\("file", "(.*?)"'))
            trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv': videoUrl})
    return trailers

def getMovieQuotes(imdbId):
    url = "%squotes" % getUrlBase(imdbId)
    data = readUrlUnicode(url)
    quotes = re.compile('(.*?):(.*?)', re.DOTALL).findall(findString(data, ''))
    return quotes

def getMoviePlot(imdbId):
    url = "%splotsummary" % getUrlBase(imdbId)
    data = readUrlUnicode(url)
    plot = findRe(data, '(.*?)').split('')[0]
    return plot.strip()
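# Sketch of using the trailer list (illustrative helper only): each entry is a
# dict with 'title', 'url', 'iframe' and 'flv' keys, exactly as assembled in
# getMovieTrailers() above.
def _example_list_trailers(imdbId):
    for trailer in getMovieTrailers(imdbId):
        print trailer['title']
        print '  page:', trailer['url']
        print '  flv: ', trailer['flv']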
def getMovieTechnical(imdbId):
    url = "%stechnical" % getUrlBase(imdbId)
    data = readUrlUnicode(url)
    results = {}
    for t in re.compile('(.*?)(.*?)', re.DOTALL).findall(data):
        results[t[0].strip()] = t[1].strip()
    return results

def getMovieCompanyCredits(imdbId):
    url = "%scompanycredits" % getUrlBase(imdbId)
    data = readUrlUnicode(url)
    results = {}
    for field, c in re.compile('(.*?)(.*?)').findall(data):
        results[field.strip()] = []
        for company in re.compile('(.*?)').findall(c):
            results[field.strip()].append(company)
    return results
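# Sketch for the technical/company-credits helpers above (illustrative helper
# only): both return plain dicts keyed by the labels found on the page.
def _example_company_credits(imdbId):
    for field, companies in getMovieCompanyCredits(imdbId).items():
        print field
        for company in companies:
            print ' -', company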
def getMovieLocations(imdbId):
    url = "%slocations" % getUrlBase(imdbId)
    data = readUrlUnicode(url)
    locations = re.compile('(.*?)').findall(data)
    return locations

def getMovieTrivia(imdbId):
    url = "%strivia" % getUrlBase(imdbId)
    data = readUrlUnicode(url)
    trivia = re.compile('(.*?)', re.DOTALL).findall(data)

    def clean(t):
        t = decodeHtml(t)
        t = t.replace(u'”', '"')
        if t.endswith('\n'):
            t = t[:-8]
        if t.endswith('\n'):
            t = t[:-len('\n')]
        return t.strip()

    trivia = [clean(t) for t in trivia]
    return trivia
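# Sketch for the location and trivia helpers above (illustrative helper only):
# both return plain lists of cleaned-up strings.
def _example_locations_and_trivia(imdbId):
    for location in getMovieLocations(imdbId):
        print location
    for fact in getMovieTrivia(imdbId)[:5]:
        print fact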
def getMovieConnections(imdbId):
    url = "%smovieconnections" % getUrlBase(imdbId)
    data = readUrlUnicode(url)
    connections = {}
    for c in re.compile('''(.*?)(.*?)\n\n''', re.DOTALL).findall(data):
        connections[unicode(c[0])] = re.compile('''''').findall(c[1])
    return connections

def getMovieKeywords(imdbId):
    url = "%skeywords" % getUrlBase(imdbId)
    data = readUrlUnicode(url)
    keywords = []
    for keyword in re.compile('''(.*?)''').findall(data):
        keyword = decodeHtml(keyword)
        keyword = keyword.replace(u'\xa0', ' ')
        keywords.append(keyword)
    return keywords

def getMovieExternalReviews(imdbId):
    url = "%sexternalreviews" % getUrlBase(imdbId)
    data = readUrlUnicode(url)
    data = findRe(data, '(.*?)')
    _reviews = re.compile('(.*?)').findall(data)
    reviews = {}
    for r in _reviews:
        reviews[r[0]] = r[1]
    return reviews
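# Sketch for the keyword and external-review helpers above (illustrative helper
# only): getMovieKeywords() yields a list of strings, getMovieExternalReviews()
# a dict built from the regex groups of each review entry.
def _example_keywords_and_reviews(imdbId):
    print u', '.join(getMovieKeywords(imdbId))
    for key, value in getMovieExternalReviews(imdbId).items():
        print key, value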
def getMovieReleaseDate(imdbId):
    releasedates = getMovieReleaseDates(imdbId)
    first_release = None
    for r in releasedates:
        if not first_release or r[1] < first_release:
            first_release = r[1]
    return first_release

def _parseDate(d):
    '''
    >>> _parseDate('3 March 1972')
    '1972-03-03'
    '''
    try:
        parsed_date = time.strptime(d, "%d %B %Y")
        parsed_date = '%s-%02d-%02d' % (parsed_date.tm_year, parsed_date.tm_mon, parsed_date.tm_mday)
        return parsed_date
    except:
        try:
            parsed_date = time.strptime(d, "%B %Y")
            parsed_date = '%s-%02d-01' % (parsed_date.tm_year, parsed_date.tm_mon)
            return parsed_date
        except:
            pass
        try:
            parsed_date = time.strptime(d, "%Y")
            parsed_date = '%s-01-01' % (parsed_date.tm_year)
            return parsed_date
        except:
            pass
    return d

def getMovieReleaseDates(imdbId):
    url = "%sreleaseinfo" % getUrlBase(imdbId)
    data = readUrlUnicode(url)
    releasedates = []
    regexp = '''(.*?).*?(.*?).*?(.*?)'''
    for r in re.compile(regexp, re.DOTALL).findall(data):
        r_ = (stripTags(r[0]).strip(),
              _parseDate(stripTags(r[1]).strip()),
              decodeHtml(stripTags(r[2]).strip()))
        releasedates.append(r_)
    return releasedates

def getMovieBusinessSum(imdbId):
    business = getMovieBusiness(imdbId)
    b_ = {'budget': 0, 'gross': 0, 'profit': 0}
    if 'budget' in business:
        #b_['budget'] = sum([int(intValue(i.replace(',', ''))) for i in business['budget']])
        budget = filter(lambda x: x.startswith('$'), business['budget'])
        if not budget:
            budget = business['budget']
        b_['budget'] = int(intValue(budget[0].replace(',', '')))
    if 'gross' in business:
        gross = filter(lambda x: x.startswith('$'), business['gross'])
        if gross:
            b_['gross'] = int(intValue(gross[0].replace(',', '')))
        #b_['gross'] = sum([int(intValue(i.replace(',', ''))) for i in business['gross']])
    #if 'weekend gross' in business:
    #    b_['gross'] += sum([int(intValue(i.replace(',', ''))) for i in business['weekend gross']])
    if b_['budget'] and b_['gross']:
        b_['profit'] = b_['gross'] - b_['budget']
    return b_

def getMovieFlimingDates(imdbId):
    business = getMovieBusiness(imdbId)
    if 'filming dates' in business and business['filming dates']:
        return business['filming dates'][0]
    return ''

def getMovieBusiness(imdbId):
    url = "%sbusiness" % getUrlBase(imdbId)
    data = readUrlUnicode(url)
    business = {}
    for r in re.compile('''(.*?)(.*?).''', re.DOTALL).findall(data):
        key = stripTags(r[0]).strip().lower()
        value = [decodeHtml(stripTags(b).strip()) for b in r[1].split('')]
        business[key] = value
    return business
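# Sketch for the business helpers above (illustrative helper only):
# getMovieBusiness() keeps the raw strings per section, getMovieBusinessSum()
# reduces them to integer budget/gross figures plus a derived profit.
def _example_business(imdbId):
    b = getMovieBusinessSum(imdbId)
    print 'budget: %(budget)d  gross: %(gross)d  profit: %(profit)d' % b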
def getMovieEpisodes(imdbId):
    url = "%sepisodes" % getUrlBase(imdbId)
    data = readUrlUnicode(url)
    episodes = {}
    regexp = r'''Season (.*?), Episode (.*?): (.*?)(.*?)(.*?)'''
    for r in re.compile(regexp, re.DOTALL).findall(data):
        try:
            episode = "S%02dE%02d" % (int(r[0]), int(r[1]))
            episodes[episode] = {}
            episodes[episode]['imdb'] = r[2]
            episodes[episode]['title'] = r[3].strip()
            if episodes[episode]['title'].startswith('Episode #%d' % int(r[0])):
                episodes[episode]['title'] = u''
            description = decodeHtml(r[5])
            description = stripTags(description.split('Next US airings:')[0])
            episodes[episode]['description'] = description.strip()
            episodes[episode]['date'] = ''
            try:
                d = stripTags(r[4])
                d = d.replace('Original Air Date: ', '')
                d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
                episodes[episode]['date'] = d
            except:
                pass
        except:
            import traceback
            traceback.print_exc()
    return episodes
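# Sketch of iterating the episode dict (illustrative helper only): keys are
# 'S01E02'-style codes, values carry 'imdb', 'title', 'description' and 'date'
# as parsed in getMovieEpisodes() above.
def _example_episodes(imdbId):
    episodes = getMovieEpisodes(imdbId)
    for code in sorted(episodes):
        print code, episodes[code]['date'], episodes[code]['title']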
'''the old code below'''

class IMDb:
    def __init__(self, imdbId):
        self.imdb = imdbId
        self.pageUrl = getUrlBase(imdbId)

    def getPage(self):
        return readUrlUnicode(self.pageUrl)

    def parse_raw_value(self, key, value):
        if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
            value = stripTags(value).strip()
        if key == 'runtime':
            parsed_value = getMovieRuntimeSeconds(self.imdb)
        elif key in ('country', 'language'):
            parsed_value = value.split(' / ')
            if len(parsed_value) == 1:
                parsed_value = parsed_value[0].split(' | ')
            parsed_value = [v.strip() for v in parsed_value]
        elif key == 'genre':
            parsed_value = value.replace('more', '').strip().split(' / ')
            if len(parsed_value) == 1:
                parsed_value = parsed_value[0].split(' | ')
            parsed_value = [v.strip() for v in parsed_value]
        elif key == 'tagline':
            parsed_value = value.replace('more', '').strip()
        elif key == 'plot_outline':
            parsed_value = value.replace('(view trailer)', '').strip()
            if parsed_value.endswith('more'):
                parsed_value = parsed_value[:-4].strip()
        elif key == 'tv_series':
            m = re.compile('(.*?)').findall(value)
            if m:
                parsed_value = m[0][0]
            else:
                parsed_value = ''
        elif key == 'also_known_as':
            parsed_value = ''
            m = re.compile('(.*) \(International: English title').findall(value)
            if m:
                parsed_value = m[0]
            else:
                m = re.compile('(.*) \(USA').findall(value)
                if m:
                    parsed_value = m[0]
            parsed_value = parsed_value.split('')[-1].split('(')[0]
            director = self.getCredits().get('director', None)
            if director:
                director = director[0]
                parsed_value = parsed_value.replace(director, '')
                if parsed_value.startswith("'s"):
                    parsed_value = parsed_value[2:].strip()
            parsed_value = decodeHtml(parsed_value.strip())
        else:
            print value
            parsed_value = value
        return parsed_value

    def parseYear(self):
        return getMovieYear(self.imdb)

    def parse(self):
        from BeautifulSoup import BeautifulSoup
        data = self.getPage()
        IMDbDict = {}
        info = getMovieInfo(self.imdb)

        #Poster
        IMDbDict['poster'] = getMoviePoster(self.imdb)
        if not IMDbDict['poster']:
            IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'

        #Title, Year
        IMDbDict['year'] = self.parseYear()
        IMDbDict['title'] = getMovieTitle(self.imdb)

        #Rating
        #FIXME: in the future this could be just:
        #m = findRe(data, '(.*?)')
        m = re.compile('(.*?)/10', re.IGNORECASE).search(data)
        if m:
            r = stripTags(m.group(1))
            if r:
                IMDbDict['rating'] = int(float(r) * 1000)
            else:
                IMDbDict['rating'] = -1
        else:
            IMDbDict['rating'] = -1

        #Votes
        IMDbDict['votes'] = info['votes']

        data = data.replace('\n', ' ')
        #some values
        keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
        for key in keys:
            IMDbDict[key] = ''
        IMDbDict['runtime'] = 0
        soup = BeautifulSoup(data)
        for info in soup('div', {'class': 'info'}):
            key = unicode(info).split('')[0].split('')
            if len(key) > 1:
                raw_value = unicode(info).split('')[1]
                key = key[1][:-1].lower().replace(' ', '_')
                if key in keys:
                    IMDbDict[key] = self.parse_raw_value(key, raw_value)
        IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title'])

        #is episode
        IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')

        IMDbDict['episodes'] = getMovieEpisodes(self.imdb)
        if IMDbDict['episodes']:
            IMDbDict['tvshow'] = True
        else:
            IMDbDict['tvshow'] = False
        IMDbDict['credits'] = self.getCredits()
        IMDbDict['plot'] = getMoviePlot(self.imdb)
        IMDbDict['keywords'] = getMovieKeywords(self.imdb)
        IMDbDict['trivia'] = getMovieTrivia(self.imdb)
        IMDbDict['connections'] = getMovieConnections(self.imdb)
        IMDbDict['locations'] = getMovieLocations(self.imdb)
        IMDbDict['release_date'] = getMovieReleaseDate(self.imdb)
        IMDbDict['business'] = getMovieBusinessSum(self.imdb)
        IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
        IMDbDict['stills'] = getMovieStills(self.imdb)
        #IMDbDict['trailer'] = getMovieTrailer(self.imdb)
        self.IMDbDict = IMDbDict

        if IMDbDict['episode_of']:
            episode_of = getMovieInfo(IMDbDict['episode_of'])
            for key in ('country', 'language'):
                if not IMDbDict[key]:
                    IMDbDict[key] = episode_of[key]
        return self.IMDbDict

    def getCredits(self):
        raw_credits = getMovieCredits(self.imdb)
        credits = {}

        def getNames(creditList):
            return [stripTags(decodeHtml(c[0])) for c in creditList]

        credits['director'] = getNames(raw_credits.get('directors', ''))
        credits['writer'] = getNames(raw_credits.get('writers', ''))
        credits['producer'] = getNames(raw_credits.get('producers', ''))
        credits['cinematographer'] = getNames(raw_credits.get('cinematographers', ''))
        credits['editor'] = getNames(raw_credits.get('editors', ''))
        credits['cast'] = [(stripTags(decodeHtml(c[0])), stripTags(decodeHtml(c[1])))
                           for c in raw_credits.get('cast', [])]
        self.credits = credits
        return self.credits
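# Sketch of the object-style interface kept above (illustrative helper only):
# IMDb(id).getCredits() normalizes the raw credit sections into 'director',
# 'writer', 'cast' and so on; IMDb(id).parse() aggregates most module helpers
# and stores the 0-10 rating multiplied by 1000.
def _example_credits_object(imdbId):
    movie = IMDb(imdbId)
    credits = movie.getCredits()
    print credits['director']
    print len(credits['cast']), 'cast members'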
def guess(title, director='', timeout=google.DEFAULT_TIMEOUT):
    #FIXME: proper file -> title
    title = title.split('-')[0]
    title = title.split('(')[0]
    title = title.split('.')[0]
    title = title.strip()
    imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))
    return_url = ''

    #let's first try google
    #i.e. site:imdb.com Michael Stevens Sin
    if director:
        search = 'site:imdb.com %s "%s"' % (director, title)
    else:
        search = 'site:imdb.com "%s"' % title
    for (name, url, desc) in google.find(search, 2, timeout=timeout):
        if url.startswith('http://www.imdb.com/title/tt'):
            return normalizeImdbId(int(oxlib.intValue(url)))

    try:
        req = urllib2.Request(imdb_url, None, oxlib.net.DEFAULT_HEADERS)
        u = urllib2.urlopen(req)
        data = u.read()
        return_url = u.url
        u.close()
    except:
        return None
    if return_url.startswith('http://www.imdb.com/title/tt'):
        return return_url[28:35]
    if data:
        imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?1. .*?(.*?)')
        filmo = data.split(u'Additional Details')[0]
        movies = {}
        for part in filmo.split(u''):
            section = findRe(part, u'(.*?):')
            section = decodeHtml(section)
            movies[section] = re.compile(u'href="/title/tt(\d{7})/"').findall(part)
        info['movies'] = movies
        return info


if __name__ == '__main__':
    import sys
    #print parse(sys.argv[1])
    print "imdb:", guess(sys.argv[1])
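# Usage sketch for guess() (illustrative helper only): it expects a rough,
# filename-like title, optionally a director, and returns an IMDb id string
# when the google or imdb.com lookup succeeds. The sample strings are arbitrary.
def _example_guess():
    print guess('The Matrix')
    print guess('Sin', director='Michael Stevens')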