# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import re
import time
import urllib2
from urllib import quote

from BeautifulSoup import BeautifulSoup

import oxutils
from oxutils import stripTags, htmldecode, findRegexp
from oxutils.cache import getUrl, getUrlUnicode
from oxutils.normalize import normalizeTitle, normalizeImdbId

import google


def getMovieId(title, director='', year=''):
    if year:
        title = "%s (%s)" % (title, year)
    if director:
        query = 'site:imdb.com %s "%s"' % (director, title)
    else:
        query = 'site:imdb.com "%s"' % title
    for (name, url, desc) in google.find(query, 3):
        if url.startswith('http://www.imdb.com/title/tt'):
            return url[28:35]

def getMovieData(imdbId):
    return IMDb(imdbId).parse()

# internal functions below

def getUrlBase(imdbId):
    return "http://www.imdb.com/title/tt%s" % imdbId

def getRawMovieData(imdbId):
    imdbId = normalizeImdbId(imdbId)
    data = dict()
    data['credits'] = parseCredits(imdbId)
    # fetch the title page before matching against it (the old code matched
    # against the dict itself, which could never succeed)
    html = getUrlUnicode(getUrlBase(imdbId))
    data['poster'] = findRegexp(html, 'name="poster".*?<img .*?src="(.*?)"')
    return data

def getTitle(imdbId):
    # NOTE: parts of this helper were lost when the file was mangled; it is
    # reconstructed after the IMDb.parseTitle method below.
    title = ''
    data = getUrlUnicode(getUrlBase(imdbId))
    soup = BeautifulSoup(data)
    html_title = soup('div', {'id': 'tn15title'})
    if not html_title:
        html_title = soup('title')
    if html_title:
        html_title = str(html_title[0])
        html_title = html_title.replace('<br />', ' ').replace('&nbsp;', ' ')
        title = htmldecode(html_title)
        title = stripTags(title)
        title = re.sub('\(\d\d\d\d\)', '', title)
        title = re.sub('\(\d\d\d\d/I*\)', '', title)
        for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
            title = title.replace(t, '')
        title = title.strip()
        if title.find(u'\xa0') > -1:
            title = title[:title.find(u'\xa0')]
        if title.startswith('"') and title.endswith('"'):
            title = title[1:-1]
    return title

def creditList(data, section=None):
    # NOTE: the markup inside these two patterns was lost in transit; the
    # expressions below are plausible reconstructions for the 2008-era
    # fullcredits table, not the original regexps.
    if section == 'cast':
        credits_ = re.compile('''<td class="nm">(.*?)</td>.*?<td class="char">(.*?)</td>''').findall(data)
    else:
        credits_ = re.compile('''<td valign="top">(.*?)</td><td.*?</td><td valign="top">(.*?)</td>''').findall(data)
    credits = []
    for c_ in credits_:
        c = [c_[0].strip(), c_[1].strip()]
        if section == 'writers':
            c[1] = c[1].replace('<br>', '').strip().replace(')', '').replace('(', '')
            if c[1].endswith(' and'):
                c[1] = c[1][:-4]
        credits.append(c)
    return credits

def parseCredits(imdbId):
    credits = dict()
    url = "%s/fullcredits" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
    groups = data.split('<h5>')
    for g in groups:
        section = re.compile('''name="(.*?)".*? href="/Glossary''').findall(g)
        if section:
            credits[section[0]] = creditList(g, section[0])
    return credits
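
# Usage sketch (requires network access and assumes the 2008-era IMDb layout
# this module targets; the title, director and id are only examples):
#   imdbId = getMovieId('The Matrix', 'Andy Wachowski', 1999)
#   credits = parseCredits(imdbId)
#   credits.get('directors', [])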

'''the old code below'''

def get_image(url):
    return getUrl(url)

def _castList(data, regexp):
    soup = re.compile(regexp).findall(data)
    if soup:
        soup = BeautifulSoup(soup[0])
        names = []
        for i in soup('a', {'href': re.compile('/name/nm')}):
            if i.string:
                cast = stripTags(i.string)
                if cast not in names:
                    names.append(cast)
        return names
    return []


class IMDb:
    def __init__(self, imdbId):
        self.imdb = imdbId
        self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
        self.businessUrl = "%sbusiness" % self.pageUrl
        self.connectionsUrl = "%smovieconnections" % self.pageUrl
        self.creditsUrl = "%sfullcredits" % self.pageUrl
        self.episodesUrl = "%sepisodes" % self.pageUrl
        self.keywordUrl = "%skeywords" % self.pageUrl
        self.plotUrl = "%splotsummary" % self.pageUrl
        self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
        self.triviaUrl = "%strivia" % self.pageUrl
        self.locationUrl = "%slocations" % self.pageUrl
        self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
        self.trailerUrl = "%strailers" % self.pageUrl

    def getPage(self):
        return getUrlUnicode(self.pageUrl)

    def parse_raw_value(self, key, value):
        if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
            value = unicode(value, 'utf-8')
        value = stripTags(value).strip()
        if key == 'runtime':
            # minutes are converted to seconds; a bare seconds value is kept
            parsed_value = findRegexp(value, '(.*?) min')
            parsed_value = findRegexp(parsed_value, '([0-9]+)')
            if not parsed_value:
                parsed_value = findRegexp(value, '(.*?) sec')
                parsed_value = findRegexp(parsed_value, '([0-9]+)')
                if not parsed_value:
                    parsed_value = 0
                else:
                    parsed_value = int(parsed_value)
            else:
                parsed_value = int(parsed_value) * 60
        elif key in ('country', 'language'):
            parsed_value = value.split(' / ')
        elif key == 'genre':
            parsed_value = value.replace('more', '').strip().split(' / ')
        elif key == 'tagline':
            parsed_value = value.replace('more', '').strip()
        elif key == 'plot_outline':
            parsed_value = value.replace('(view trailer)', '').strip()
            if parsed_value.endswith('more'):
                parsed_value = parsed_value[:-4].strip()
        elif key == 'tv_series':
            # NOTE: the link markup here is reconstructed; the first group is
            # the imdb id of the show this episode belongs to.
            m = re.compile('<a href="/title/tt(.*?)/">(.*?)</a>').findall(value)
            if m:
                parsed_value = m[0][0]
            else:
                parsed_value = ''
        elif key == 'also_known_as':
            parsed_value = ''
            m = re.compile('(.*) \(International: English title').findall(value)
            if m:
                parsed_value = m[0]
            else:
                m = re.compile('(.*) \(USA').findall(value)
                if m:
                    parsed_value = m[0]
            parsed_value = parsed_value.split('<br />')[-1].split('(')[0]
            director = self.parseCredits().get('director', None)
            if director:
                director = director[0]
                parsed_value = parsed_value.replace(director, '')
                if parsed_value.startswith("'s"):
                    parsed_value = parsed_value[2:].strip()
            parsed_value = parsed_value.strip()
        else:
            print value
            parsed_value = value
        return parsed_value
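
    # Worked examples for parse_raw_value (inputs are illustrative strings in
    # the shape the pages used; outputs follow from the code above):
    #   parse_raw_value('runtime', '136 min (USA)')      -> 8160 (seconds)
    #   parse_raw_value('genre', 'Action / Sci-Fi more') -> ['Action', 'Sci-Fi']
    #   parse_raw_value('country', 'USA / Australia')    -> ['USA', 'Australia']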

    def parseTitle(self):
        title = ''
        data = self.getPage()
        soup = BeautifulSoup(data)
        html_title = soup('div', {'id': 'tn15title'})
        if not html_title:
            html_title = soup('title')
        if html_title:
            html_title = str(html_title[0])
            html_title = html_title.replace('<br />', ' ').replace('&nbsp;', ' ')
            title = stripTags(html_title)
            title = re.sub('\(\d{4}\)', '', title)
            title = re.sub('\(\d{4}/I*\)', '', title)
            for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
                title = title.replace(t, '')
            if title.find(u'\xa0') > -1:
                title = title[:title.find(u'\xa0')]
            title = normalizeTitle(title.strip())
            if title.startswith('"') and title.endswith('"'):
                title = normalizeTitle(title[1:-1])
            elif title.startswith('"') and title.find('"', 1) > 0 and \
                    title.find('"', 1) == title.rfind('"'):
                se = re.compile("Season (\d*), Episode (\d*)\)").findall(data)
                if se:
                    se = se[0]
                    se = ' (S%02dE%02d)' % (int(se[0]), int(se[1]))
                    title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:]
                else:
                    title = normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:]
        return normalizeTitle(title)

    def parseYear(self):
        year = ''
        data = self.getPage()
        soup = BeautifulSoup(data)
        html_title = soup('div', {'id': 'tn15title'})
        if not html_title:
            html_title = soup('title')
        if html_title:
            html_title = str(html_title[0])
            html_title = stripTags(html_title)
            year = re.compile('\((\d{4})\)').findall(html_title)
            if not year:
                year = re.compile('\((\d{4})/').findall(html_title)
            if year:
                year = year[0]
            else:
                year = ''
        return year

    def parse(self):
        data = self.getPage()
        IMDbDict = {}
        #Poster
        # NOTE: the image markup in this pattern is a reconstruction; the
        # original expression was lost in transit.
        IMDbDict['poster'] = findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')
        #Rating
        m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
        if m:
            IMDbDict['rating'] = int(float(m.group(1)) * 1000)
        else:
            IMDbDict['rating'] = -1
        #Votes
        m = re.compile('\((.*?) votes\)', re.IGNORECASE).findall(data)
        if m:
            IMDbDict['votes'] = int(m[0].replace(',', ''))
        else:
            IMDbDict['votes'] = -1
        data = data.replace('\n', ' ')
        #some values
        keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
        for key in keys:
            IMDbDict[key] = ''
        IMDbDict['runtime'] = 0
        soup = BeautifulSoup(data)
        for info in soup('div', {'class': 'info'}):
            key = str(info).split('</h5>')[0].split('<h5>')
            if len(key) > 1:
                raw_value = str(info).split('</h5>')[1]
                key = key[1][:-1].lower().replace(' ', '_')
                if key in keys:
                    IMDbDict[key] = self.parse_raw_value(key, raw_value)
        # NOTE: 'title' is used below but its assignment was lost in transit;
        # restored here from the parseTitle/parseYear methods above.
        IMDbDict['title'] = self.parseTitle()
        IMDbDict['year'] = self.parseYear()
        IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title'])
        #is episode
        IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')
        IMDbDict['episodes'] = self.parseEpisodes()
        if IMDbDict['episodes']:
            IMDbDict['tvshow'] = True
        else:
            IMDbDict['tvshow'] = False
        IMDbDict['credits'] = self.parseCredits()
        IMDbDict['plot'] = self.parsePlot()
        IMDbDict['keywords'] = self.parseKeywords()
        IMDbDict['trivia'] = self.parseTrivia()
        IMDbDict['connections'] = self.parseConnections()
        IMDbDict['locations'] = self.parseLocations()
        IMDbDict['release_date'] = self.parseReleaseinfo()
        IMDbDict['business'] = self.parseBusiness()
        IMDbDict['reviews'] = self.parseExternalreviews()
        IMDbDict['stills'] = getMovieStills(self.imdb)
        #IMDbDict['trailer'] = self.parseTrailer()
        self.IMDbDict = IMDbDict
        if IMDbDict['episode_of']:
            # inherit country/language from the show if the episode lacks them
            episode_of = IMDb(IMDbDict['episode_of']).parse()
            for key in ('country', 'language'):
                if not IMDbDict[key]:
                    IMDbDict[key] = episode_of[key]
        return self.IMDbDict

    def parseCredits(self):
        raw_credits = parseCredits(self.imdb)
        credits = {}

        def getNames(creditList):
            return [stripTags(c[0]) for c in creditList]

        credits['director'] = getNames(raw_credits.get('directors', []))
        credits['writer'] = getNames(raw_credits.get('writers', []))
        credits['producer'] = getNames(raw_credits.get('producers', []))
        credits['cast'] = [(stripTags(c[0]), stripTags(c[1])) for c in raw_credits.get('cast', [])]
        self.credits = credits
        return self.credits

    def parsePlot(self):
        data = getUrlUnicode(self.plotUrl)
        soup = BeautifulSoup(data)
        plot = soup('p', {'class': 'plotpar'})
        if plot:
            # cut the trailing "Written by ..." credit, which is set in italics
            plot = unicode(plot[0]).split('<i>')[0]
        else:
            plot = u''
        plot = stripTags(plot).strip()
        self.plot = plot
        return plot
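
    # Sketch of the dict that parse() above assembles (keys as set in the
    # code; the id and values are purely illustrative):
    #   d = IMDb('0133093').parse()
    #   d['title'], d['year'], d['rating']  # rating is the 0-10 score x 1000
    #   d['credits']['cast'][0]             # -> (name, character)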

    def parseEpisodes(self):
        episodes = {}
        data = getUrlUnicode(self.episodesUrl)
        cdata = data.replace('\r\n', ' ')
        # NOTE: the HTML fragments in this pattern were lost in transit; the
        # markup below is a plausible reconstruction of the 2008-era episodes
        # page. Groups: season, episode, imdb id, title, air date, description.
        regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4><b>(.*?)</b><br>(.*?)<br/>'''
        reg = re.compile(regexp, re.IGNORECASE)
        m = reg.findall(cdata)
        for match in m:
            try:
                episode = "S%02dE%02d" % (int(match[0]), int(match[1]))
                episodes[episode] = {}
                episodes[episode]['imdb'] = match[2]
                episodes[episode]['title'] = match[3].strip()
                if episodes[episode]['title'].startswith('Episode #%d' % int(match[0])):
                    episodes[episode]['title'] = u''
                description = htmldecode(match[5])
                description = stripTags(description.split('Next US airings:')[0])
                episodes[episode]['description'] = description
                episodes[episode]['date'] = ''
                try:
                    d = stripTags(match[4])
                    d = d.replace('Original Air Date: ', '')
                    d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
                    episodes[episode]['date'] = d
                except:
                    pass
            except:
                import traceback
                traceback.print_exc()
        self.episodes = episodes
        return self.episodes

    def parseLocations(self):
        data = getUrlUnicode(self.locationUrl)
        soup = BeautifulSoup(data)
        locations = []
        for key in soup('a', {'href': re.compile('^/List')}):
            locations.append(htmldecode(key.string))
        self.locations = locations
        return self.locations

    def parseKeywords(self):
        data = getUrlUnicode(self.keywordUrl)
        soup = BeautifulSoup(data)
        keywords = []
        for key in soup('a', {'href': re.compile('^/keyword/')}):
            k = htmldecode(key.string)
            k = k.replace(u'\xa0', ' ')
            keywords.append(k)
        self.keywords = keywords
        return self.keywords
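
    # parseKeywords() above returns plain keyword strings scraped from the
    # /keywords page, e.g. [u'based-on-novel', u'dystopia', ...] (illustrative
    # values, not fetched).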

    def parseTrivia(self):
        data = getUrlUnicode(self.triviaUrl)
        soup = BeautifulSoup(data)
        trivia = []
        for i in soup('ul', {'class': "trivia"}):
            for t in i('li'):
                t = str(t).replace('<br />', '').strip()
                if t.startswith('<li>') and t.endswith('</li>'):
                    t = t[4:-5].strip()
                trivia.append(t)
        self.trivia = trivia
        return self.trivia

    def getConnections(self):
        return getUrlUnicode(self.connectionsUrl)
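
    # The /movieconnections page groups related titles under headings such as
    # 'Follows' or 'Referenced in'; parseConnections() below maps each heading
    # to the imdb ids found in its block.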

    def parseConnections(self):
        connections = {}
        soup = BeautifulSoup(self.getConnections())
        content = soup('div', {'id': 'tn15content'})[0]
        blocks = str(content).split('<h5>')[1:]
        for c in blocks:
            connection = c.split('</h5>')[0]
            cs = BeautifulSoup(c)
            if connection:
                #relation -> list of imdb ids
                connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})]
        return connections

    def getReleaseinfo(self):
        return getUrlUnicode(self.releaseinfoUrl)

    def parseReleaseinfo(self):
        soup = BeautifulSoup(self.getReleaseinfo())
        info = soup('table', {'border': '0', 'cellpadding': '2'})
        if info:
            for row in info[0]('tr'):
                d = row('td', {'align': 'right'})
                if d:
                    try:
                        possible_date = stripTags(str(d[0])).strip()
                        rdate = time.strptime(possible_date, "%d %B %Y")
                        rdate = time.strftime('%Y-%m-%d', rdate)
                        return rdate
                    except:
                        pass
        return None

    def getBusiness(self):
        return getUrlUnicode(self.businessUrl)
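
    # Figures on the /business page are dollar amounts; since IMDb often
    # lists several estimates per heading, parseBusiness() below keeps the
    # largest value for 'budget' and 'gross' and derives 'profit' from them.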

    def parseBusiness(self):
        soup = BeautifulSoup(self.getBusiness())
        business = {'budget': 0, 'gross': 0, 'profit': 0}
        content = soup('div', {'id': 'tn15content'})[0]
        blocks = str(content).split('<h5>')[1:]
        for c in blocks:
            line = c.split('</h5>')
            if len(line) > 1:
                title = line[0]
                line = line[1]
                if title in ['Budget', 'Gross']:
                    values = re.compile('\$(.*?) ').findall(line)
                    values = [int(value.replace(',', '')) for value in values]
                    if values:
                        business[title.lower()] = max(values)
        if business['budget'] and business['gross']:
            business['profit'] = business['gross'] - business['budget']
        return business

    def getExternalreviews(self):
        return getUrlUnicode(self.externalreviewsUrl)

    def parseExternalreviews(self):
        soup = BeautifulSoup(self.getExternalreviews())
        ol = soup('ol')
        if ol:
            ol = ol[0]
            ret = {}
            for li in ol('li'):
                try:
                    a = li('a')[0]
                    href = a.get('href')
                    txt = a.contents[0]
                    ret[href] = txt
                except:
                    pass
            return ret
        return {}

    def getTrailer(self):
        return getUrlUnicode(self.trailerUrl)

    def parseTrailer(self):
        ret = {}
        soup = BeautifulSoup(self.getTrailer())
        for p in soup('p'):
            if p('a') and p.firstText():
                a = p('a')[0]
                href = a['href']
                if href and href.startswith('http'):
                    title = a.string
                    title = title.replace('www.', '')
                    ret[href] = title
        return ret


def guess(title, director=''):
    #FIXME: proper file -> title
    title = title.split('-')[0]
    title = title.split('(')[0]
    title = title.split('.')[0]
    title = title.strip()
    imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))
    return_url = ''
    #let's try google first
    #i.e. site:imdb.com Michael Stevens "Sin"
    if director:
        search = 'site:imdb.com %s "%s"' % (director, title)
    else:
        search = 'site:imdb.com "%s"' % title
    for (name, url, desc) in google.find(search, 2):
        if url.startswith('http://www.imdb.com/title/tt'):
            return url[28:35]

    try:
        req = urllib2.Request(imdb_url, None, oxutils.net.DEFAULT_HEADERS)
        u = urllib2.urlopen(req)
        data = u.read()
        return_url = u.url
        u.close()
    except:
        return None
    if return_url.startswith('http://www.imdb.com/title/tt'):
        return return_url[28:35]
    if data:
        # NOTE: the tail of this pattern was lost in transit; the link markup
        # is a reconstruction that captures the seven-digit title id of the
        # first entry under "Popular Results".
        imdb_id = findRegexp(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(\d{7})')
        if imdb_id:
            return imdb_id
    return None
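
# guess() above tries Google first, then falls back to IMDb's own find page.
# Filename-ish titles are crudely cleaned before searching, e.g.:
#   guess('Sin City - 2005.avi')  # searches for "Sin City"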

def getMovieStills(imdbId):
    # NOTE: most of this function was lost in transit; the gallery URL and
    # the two patterns are plausible reconstructions for the old photo
    # gallery pages, kept consistent with the surviving fragments.
    data = getUrlUnicode('http://imdb.com/gallery/ss/%s' % imdbId)
    s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://i.imdb.com/Photos/Ss/%s/th-(.*?)\.jpg"''' % imdbId).findall(data)
    stills = []
    for s in s_:
        # keep landscape stills only (width > height)
        if int(s[0]) > int(s[1]):
            stills.append("http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (imdbId, s[2]))
    if not stills:
        s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://(.*?)f\.jpg"''').findall(data)
        for s in s_:
            if int(s[0]) > int(s[1]):
                stills.append("http://%sf.jpg" % s[2])
    return stills

if __name__ == '__main__':
    import sys
    #print parse(sys.argv[1])
    print "imdb:", guess(sys.argv[1])
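
# Example invocation (requires network access; the result depends on live
# search results -- 0133093 is the known id for The Matrix):
#   $ python imdb.py "The Matrix"
#   imdb: 0133093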