', ' ').replace(' ', ' ') title = htmldecode(html_title) title = stripTags(title) title = re.sub('$\d\d\d\d$', '', title) title = re.sub('$\d\d\d\d/I*$', '', title) for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'): title = title.replace(t, '') title = title.strip() if title.find(u'\xa0') > -1: title = title[:title.find(u'\xa0')].strip() if title.startswith('"') and title.endswith('"'): title = title[1:-1] info['title'] = title return info def getPoster(imdbId): info = parseBase(imdbId) return info['poster'] def getTitle(imdbId): info = parseBase(imdbId) return info['title'] def creditList(data, section=None): if section == 'cast': credits_ = re.compile('''(.*?).*?(.*?)''').findall(data) else: credits_ = re.compile('''.*?(.*?)(.*?)''').findall(data) credits = [] for c_ in credits_: c = [c_[0].strip(), c_[1].strip()] if section=='writers': c[1] = c[1].replace('
', '').strip().replace(')', '').replace('(','') if c[1].endswith(' and'): c[1] = c[1][:-4] credits.append(c) return credits def getCredits(imdbId): credits = dict() url = "%s/fullcredits" % getUrlBase(imdbId) data = getUrlUnicode(url) groups = data.split('

') for g in groups: section = re.compile('''name="(.?)".? href="/Glossary''').findall(g) if section: credits[section[0]] = creditList(g, section[0]) return credits def getMovieTrailers(imdbId): url = "%s/trailers" % getUrlBase(imdbId) data = getUrlUnicode(url) soup = BeautifulSoup(data) videos = soup('div', {'class':"video-gallery"}) trailers = [] if videos: for a in videos[0]('a'): title = stripTags(unicode(a)).strip() url = 'http://www.imdb.com' + a['href'] videoId = findRegexp(url, '/(vi\d?)/') iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId iframe = getUrlUnicode(iframeUrl) videoUrl = unquote(findRegexp(iframe, 'addVariable\("file", "(.?)"')) trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl}) return trailers def getMovieQuotes(imdbId): url = "%s/quotes" % getUrlBase(imdbId) data = getUrlUnicode(url) quotes = re.compile('**(.*?)**:(.*?)
', re.DOTALL).findall(data) quotes = [(q[0].strip(),q[1].strip()) for q in quotes] return quotes def getMovieTechnical(imdbId): url = "%s/technical" % getUrlBase(imdbId) data = getUrlUnicode(url) results = {} for t in re.compile('

(.*?)

(.*?)
', re.DOTALL).findall(data): results[t[0].strip()] = t[1].strip() return results def getMovieCompanyCredits(imdbId): url = "%s/companycredits" % getUrlBase(imdbId) data = getUrlUnicode(url) results = {} for field, c in re.compile('

(.*?)

(.*?)').findall(data): results[field.strip()] = [] for company in re.compile('

(.*?)

').findall(c): results[field.strip()].append(company) return results def getMovieLocations(imdbId): url = "%s/locations" % getUrlBase(imdbId) data = getUrlUnicode(url) soup = BeautifulSoup(data) locations = [] for key in soup('a', {'href': re.compile('^/List')}): locations.append(htmldecode(key.string)) return locations def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')): photos = {} for key in keys: url = "%s/mediaindex?refine=%s" % (getUrlBase(imdbId), key) data = getUrlUnicode(url) photos[key] = {} for s in re.compile(''' (.*?)

', '').strip() if t.startswith('

') and t.endswith('

'): t = t[4:-5].strip() trivia.append(t) return trivia '''the old code below''' class IMDb: def __init__(self, imdbId): self.imdb = imdbId self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb self.businessUrl = "%sbusiness" % self.pageUrl self.connectionsUrl = "%smovieconnections" % self.pageUrl self.creditsUrl = "%sfullcredits" % self.pageUrl self.episodesUrl = "%sepisodes" % self.pageUrl self.keywordUrl = "%skeywords" % self.pageUrl self.plotUrl = "%splotsummary" % self.pageUrl self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl self.locationUrl = "%slocations" % self.pageUrl self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl def getPage(self): return getUrlUnicode(self.pageUrl) def parse_raw_value(self, key, value): if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'): value = unicode(value, 'utf-8') value = stripTags(value).strip() if key == 'runtime': parsed_value = findRegexp(value, '(.*?) min') parsed_value = findRegexp(parsed_value, '([0-9]+)') if not parsed_value: parsed_value = findRegexp(value, '(.*?) sec') parsed_value = findRegexp(parsed_value, '([0-9]+)') if not parsed_value: parsed_value = 0 else: parsed_value = int(parsed_value) else: parsed_value = int(parsed_value) * 60 elif key in ('country', 'language'): parsed_value = value.split(' / ') parsed_value = [v.strip() for v in parsed_value] elif key == 'genre': parsed_value = value.replace('more', '').strip().split(' / ') parsed_value = [v.strip() for v in parsed_value] elif key == 'tagline': parsed_value = value.replace('more', '').strip() elif key == 'plot_outline': parsed_value = value.replace('(view trailer)', '').strip() if parsed_value.endswith('more'): parsed_value = parsed_value[:-4].strip() elif key == 'tv_series': m = re.compile('(.*?)').findall(value) if m: parsed_value = m[0][0] else: parsed_value = '' elif key == 'also_known_as': parsed_value = '' m = re.compile('(.*) $International: English title').findall(value) if m: parsed_value = m[0] else: m = re.compile('(.*) \(USA').findall(value) if m: parsed_value = m[0] parsed_value = parsed_value.split('
')[-1].split('(')[0] director = self.getCredits().get('director', None) if director: director = director[0] parsed_value = parsed_value.replace(director, '') if parsed_value.startswith("'s"): parsed_value = parsed_value[2:].strip() parsed_value = parsed_value.strip() else: print value parsed_value = value return parsed_value def parseTitle(self): title = getTitle(self.imdb) title = normalizeTitle(title) if title.startswith('"') and title.find('"',1) > 0 and \ title.find('"',1) == title.rfind('"'): se = re.compile("Season (\d*), Episode (\d*)$").findall(data) if se: se = se[0] se = ' (S%02dE%02d)' % (int(se[0]), int(se[1])) title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:] else: title = normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:] return normalizeTitle(title) def parseYear(self): year = '' data = self.getPage() soup = BeautifulSoup(data) html_title = soup('div', {'id': 'tn15title'}) if not html_title: html_title = soup('title') if html_title: html_title = str(html_title[0]) html_title = stripTags(html_title) year = re.compile('$(\d{4})$').findall(html_title) if not year: year = re.compile('$(\d{4})/').findall(html_title) if year: year = year[0] else: year = '' return year def parse(self): data = self.getPage() IMDbDict ={} #Poster IMDbDict['poster'] = getPoster(self.imdb) if not IMDbDict['poster']: IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif' #Title, Year IMDbDict['year'] = self.parseYear() IMDbDict['title'] = self.parseTitle() #Rating m = re.compile('(.*?)/10', re.IGNORECASE).search(data) if m: IMDbDict['rating'] = int(float(m.group(1)) * 1000) else: IMDbDict['rating'] = -1 #Votes m = re.compile('\((.*?) votes$', re.IGNORECASE).findall(data) if m: IMDbDict['votes'] = int(m[0].replace(',', '')) else: IMDbDict['votes'] = -1 data = data.replace('\n',' ') #some values keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as') for key in keys: IMDbDict[key] = '' IMDbDict['runtime'] = 0 soup = BeautifulSoup(data) for info in soup('div', {'class': 'info'}): key = str(info).split('')[0].split('

') if len(key) > 1: raw_value = str(info).split('

')[1] key = key[1][:-1].lower().replace(' ', '_') if key in keys: IMDbDict[key] = self.parse_raw_value(key, raw_value) IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title']) #is episode IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '') IMDbDict['episodes'] = self.parseEpisodes() if IMDbDict['episodes']: IMDbDict['tvshow'] = True else: IMDbDict['tvshow'] = False IMDbDict['credits'] = self.getCredits() IMDbDict['plot'] = self.parsePlot() IMDbDict['keywords'] = self.parseKeywords() IMDbDict['trivia'] = getMovieTrivia(self.imdb) IMDbDict['connections'] = self.parseConnections() IMDbDict['locations'] = self.parseLocations() IMDbDict['release_date'] = self.parseReleaseinfo() IMDbDict['business'] = self.parseBusiness() IMDbDict['reviews'] = self.parseExternalreviews() IMDbDict['stills'] = getMovieStills(self.imdb) #IMDbDict['trailer'] = self.parseTrailer() self.IMDbDict = IMDbDict if IMDbDict['episode_of']: episode_of =IMDb(IMDbDict['episode_of']).parse() for key in ('country', 'language'): if not IMDbDict[key]: IMDbDict[key] = episode_of[key] return self.IMDbDict def getCredits(self): raw_credits = getCredits(self.imdb) credits = {} def getNames(creditList): return [stripTags(c[0]) for c in creditList] credits['director'] = getNames(raw_credits['directors']) credits['writer'] = getNames(raw_credits['writers']) credits['producer'] = getNames(raw_credits['producers']) credits['cast'] = [(stripTags(c[0]),stripTags(c[1])) for c in raw_credits['cast']] self.credits = credits return self.credits def parsePlot(self): data = getUrlUnicode(self.plotUrl) soup = BeautifulSoup(data) plot = soup('p', {'class':'plotpar'}) if plot: plot = unicode(plot[0]).split('')[0] else: plot = u'' plot = stripTags(plot).strip() self.plot = plot return plot def parseEpisodes(self): episodes = {} data = getUrlUnicode(self.episodesUrl) cdata = data.replace('\r\n', ' ') regexp = r'''
Season (.*?), Episode (.*?): (.*?)
(.*?)
(.*?)
''' reg = re.compile(regexp, re.IGNORECASE) m = reg.findall(cdata) for match in m: try: episode = "S%02dE%02d" % (int(match[0]), int(match[1])) episodes[episode] = {} episodes[episode]['imdb'] = match[2] episodes[episode]['title'] = match[3].strip() if episodes[episode]['title'].startswith('Episode #%d'%int(match[0])): episodes[episode]['title'] = u'' description = htmldecode(match[5]) description = stripTags(description.split('Next US airings:')[0]) episodes[episode]['description'] = description episodes[episode]['date'] = '' try: d = stripTags(match[4]) d = d.replace('Original Air Date: ', '') d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y')) episodes[episode]['date'] = d except: pass except: import traceback print traceback.print_exc() pass self.episodes = episodes return self.episodes def parseLocations(self): data = getUrlUnicode(self.locationUrl) soup = BeautifulSoup(data) locations = [] for key in soup('a', {'href': re.compile('^/List')}): locations.append(htmldecode(key.string)) self.locations = locations return self.locations def parseKeywords(self): data = getUrlUnicode(self.keywordUrl) soup = BeautifulSoup(data) keywords = [] for key in soup('a', {'href': re.compile('^/keyword/')}): k = htmldecode(key.string) k = k.replace(u'\xa0', ' ') keywords.append(k) self.keywords = keywords return self.keywords def getConnections(self): return getUrlUnicode(self.connectionsUrl) def parseConnections(self): connections = {} soup = BeautifulSoup(self.getConnections()) content = soup('div', {'id': 'tn15content'})[0] blocks = str(content).split('
')[1:] for c in blocks: connection = c.split('
')[0] cs = BeautifulSoup(c) if connection: #relation -> list of imdb ids connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})] return connections def getReleaseinfo(self): return getUrlUnicode(self.releaseinfoUrl) def parseReleaseinfo(self): soup = BeautifulSoup(self.getReleaseinfo()) info = soup('table',{'border': '0', 'cellpadding':'2'}) if info: for row in info[0]('tr'): d = row('td', {'align':'right'}) if d: try: possible_date = stripTags(str(d[0])).strip() rdate = time.strptime(possible_date, "%d %B %Y") rdate = time.strftime('%Y-%m-%d', rdate) return rdate except: pass return None def getBusiness(self): return getUrlUnicode(self.businessUrl) def parseBusiness(self): soup = BeautifulSoup(self.getBusiness()) business = {'budget': 0, 'gross': 0, 'profit': 0} content = soup('div', {'id': 'tn15content'})[0] blocks = str(content).split('
')[1:] for c in blocks: cs = BeautifulSoup(c) line = c.split('
') if line: title = line[0] line = line[1] if title in ['Budget', 'Gross']: values = re.compile('\$(.*?) ').findall(line) values = [int(value.replace(',','')) for value in values] if values: business[title.lower()] = max(values) if business['budget'] and business['gross']: business['profit'] = business['gross'] - business['budget'] return business def getExternalreviews(self): return getUrlUnicode(self.externalreviewsUrl) def parseExternalreviews(self): soup = BeautifulSoup(self.getExternalreviews()) ol = soup('ol') if ol: ol = ol[0] ret = {} for li in ol('li'): try: a = li('a')[0] href = a.get('href') txt = a.contents[0] ret[href] = txt except: pass return ret return {} def guess(title, director=''): #FIXME: proper file -> title title = title.split('-')[0] title = title.split('(')[0] title = title.split('.')[0] title = title.strip() imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8')) return_url = '' #lest first try google #i.e. site:imdb.com Michael Stevens Sin if director: search = 'site:imdb.com %s "%s"' % (director, title) else: search = 'site:imdb.com "%s"' % title for (name, url, desc) in google.find(search, 2): if url.startswith('http://www.imdb.com/title/tt'): return url[28:35] try: req = urllib2.Request(imdb_url, None, oxutils.net.DEFAULT_HEADERS) u = urllib2.urlopen(req) data = u.read() return_url = u.url u.close() except: return None if return_url.startswith('http://www.imdb.com/title/tt'): return return_url[28:35] if data: imdb_id = findRegexp(data.replace('\n', ' '), 'Popular Results.*?
.*?