# -*- Mode: Python; -*- # -*- coding: utf-8 -*- # vi:si:et:sw=2:sts=2:ts=2 from oxutils import * import urllib2 from urllib import quote, unquote import re, time import os import time from BeautifulSoup import BeautifulSoup import chardet import oxutils from oxutils import stripTags, decodeHtml, findRe, findString from oxutils.cache import getUrl, getUrlUnicode from oxutils.normalize import normalizeTitle, normalizeImdbId import google def getMovieId(title, director='', year=''): ''' >>> getMovieId('The Matrix') '0133093' ''' if year: title = "%s (%s)" % (title, year) if director: query = 'site:imdb.com %s "%s"' % (director, title) else: query = 'site:imdb.com "%s"' % title for (name, url, desc) in google.find(query, 3): if url.startswith('http://www.imdb.com/title/tt'): return url[28:35] return '' def getMovieData(imdbId): return IMDb(imdbId).parse() # internal functions below def getUrlBase(imdbId): return "http://www.imdb.com/title/tt%s" % imdbId def getRawMovieData(imdbId): imdbId = normalizeImdbId(imdbId) data = getMovieInfo(imdbId) data['credits'] = getMovieCredits(imdbId) data['poster'] = getMoviePoster(imdbId) data['connections'] = getMovieConnections(imdbId) data['company credits'] = getMovieCompanyCredits(imdbId) data['filming locations'] = getMovieLocations(imdbId) data['movie connections'] = getMovieConnections(imdbId) data['external reviews'] = getMovieExternalReviews(imdbId) data['trivia'] = getMovieTrivia(imdbId) data['keywords'] = getMovieKeywords(imdbId) data['media'] = {} data['media']['images'] = getMovieImages(imdbId) data['media']['trailers'] = getMovieTrailers(imdbId) data['plotsummary'] = getMoviePlot(imdbId) return data def getMovieInfo(imdbId): data = getUrlUnicode(getUrlBase(imdbId)) info = dict() info['poster'] = findRe(data, 'name="poster".*?(.*?):(.*?)
(.*?)
') print html_title if not html_title: html_title = findRe(data, '(.*?)') if html_title: html_title = html_title.replace('
', ' ').replace(' ', ' ') title = decodeHtml(html_title) title = stripTags(title) year = findRe(title, '\((\d{4})\)') if not year: year = findRe(title, '\((\d{4})') _y = findRe(title, r'(\([0-9\?]{4}[/IVXLCDM]*?\))') if _y: title = title.replace(_y, '') for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'): title = title.replace(t, '') title = title.strip() if title.find(u'\xa0') > -1: title = title[:title.find(u'\xa0')].strip() if title.startswith('"') and title.endswith('"'): title = title[1:-1] info['title'] = title info['year'] = year #Rating rating = findRe(data, '([\d\.]*?)/10') if rating: info['rating'] = float(rating) else: info['rating'] = -1 #Votes votes = findRe(data, '\((.*?) votes\)') if votes: info['votes'] = int(votes.replace(',', '')) else: info['votes'] = -1 return info def getMoviePoster(imdbId): info = getMovieInfo(imdbId) return info['poster'] def getMovieYear(imdbId): info = getMovieInfo(imdbId) return info['year'] def getMovieTitle(imdbId): info = getMovieInfo(imdbId) return info['title'] def creditList(data, section=None): if section == 'cast': credits_ = re.compile('''(.*?).*?(.*?)''').findall(data) else: credits_ = re.compile('''.*?(.*?)(.*?)''').findall(data) credits = [] for c_ in credits_: c = [decodeHtml(c_[0]).strip(), decodeHtml(c_[1]).strip()] if section=='writers': c[1] = c[1].replace('
', '').strip().replace(')', '').replace('(','') if c[1].endswith(' and'): c[1] = c[1][:-4] credits.append(c) return credits def getMovieCredits(imdbId): credits = dict() url = "%s/fullcredits" % getUrlBase(imdbId) data = getUrlUnicode(url) groups = data.split('
') for g in groups: section = re.compile('''name="(.*?)".*? href="/Glossary''').findall(g) if section: credits[section[0]] = creditList(g, section[0]) return credits def getMovieTrailers(imdbId): url = "%s/trailers" % getUrlBase(imdbId) data = getUrlUnicode(url) soup = BeautifulSoup(data) videos = soup('div', {'class':"video-gallery"}) trailers = [] if videos: for a in videos[0]('a'): title = stripTags(unicode(a)).strip() url = 'http://www.imdb.com' + a['href'] videoId = findRe(url, '/(vi\d*?)/') iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId iframe = getUrlUnicode(iframeUrl) videoUrl = unquote(findRe(iframe, 'addVariable\("file", "(.*?)"')) trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl}) return trailers def getMovieQuotes(imdbId): url = "%s/quotes" % getUrlBase(imdbId) data = getUrlUnicode(url) quotes = re.compile('(.*?):(.*?)
', re.DOTALL).findall(findString(data, '(.*?)') return plot def getMovieTechnical(imdbId): url = "%s/technical" % getUrlBase(imdbId) data = getUrlUnicode(url) results = {} for t in re.compile('
(.*?)
(.*?)
', re.DOTALL).findall(data): results[t[0].strip()] = t[1].strip() return results def getMovieCompanyCredits(imdbId): url = "%s/companycredits" % getUrlBase(imdbId) data = getUrlUnicode(url) results = {} for field, c in re.compile('

(.*?)

    (.*?)
').findall(data): results[field.strip()] = [] for company in re.compile('
  • (.*?)
  • ').findall(c): results[field.strip()].append(company) return results def getMovieLocations(imdbId): url = "%s/locations" % getUrlBase(imdbId) data = getUrlUnicode(url) soup = BeautifulSoup(data) locations = [] for key in soup('a', {'href': re.compile('^/List')}): locations.append(decodeHtml(key.string)) return locations def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')): photos = {} for key in keys: url = "%s/mediaindex?refine=%s" % (getUrlBase(imdbId), key) data = getUrlUnicode(url) photos[key] = {} for s in re.compile('''(.*?)', '').strip() if t.startswith('
  • ') and t.endswith('
  • '): t = t[4:-5].strip() t=decodeHtml(t) trivia.append(t) return trivia def getMovieConnections(imdbId): url = "%s/movieconnections" % getUrlBase(imdbId) data = getUrlUnicode(url) soup = BeautifulSoup(data) connections = {} content = soup('div', {'id': 'tn15content'})[0] blocks = unicode(content).split('
    ')[1:] for c in blocks: connection = c.split('
    ')[0] cs = BeautifulSoup(c) if connection: #relation -> list of imdb ids connections[connection] = [findRe(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})] return connections def getMovieKeywords(imdbId): url = "%s/keywords" % getUrlBase(imdbId) data = getUrlUnicode(url) soup = BeautifulSoup(data) keywords = [] for key in soup('a', {'href': re.compile('^/keyword/')}): k = decodeHtml(key.string) k = k.replace(u'\xa0', ' ') keywords.append(k) return keywords def getMovieExternalReviews(imdbId): url = "%s/externalreviews" % getUrlBase(imdbId) data = getUrlUnicode(url) soup = BeautifulSoup(data) ol = soup('ol') if ol: ol = ol[0] ret = {} for li in ol('li'): try: a = li('a')[0] href = a.get('href') txt = a.contents[0] ret[href] = txt except: pass return ret return {} '''the old code below''' class IMDb: def __init__(self, imdbId): self.imdb = imdbId self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb self.businessUrl = "%sbusiness" % self.pageUrl self.creditsUrl = "%sfullcredits" % self.pageUrl self.episodesUrl = "%sepisodes" % self.pageUrl self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl def getPage(self): return getUrlUnicode(self.pageUrl) def parse_raw_value(self, key, value): if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'): value = stripTags(value).strip() if key == 'runtime': parsed_value = findRe(value, '(.*?) min') parsed_value = findRe(parsed_value, '([0-9]+)') if not parsed_value: parsed_value = findRe(value, '(.*?) sec') parsed_value = findRe(parsed_value, '([0-9]+)') if not parsed_value: parsed_value = 0 else: parsed_value = int(parsed_value) else: parsed_value = int(parsed_value) * 60 elif key in ('country', 'language'): parsed_value = value.split(' / ') if len(parsed_value) == 1: parsed_value = parsed_value[0].split(' | ') parsed_value = [v.strip() for v in parsed_value] elif key == 'genre': parsed_value = value.replace('more', '').strip().split(' / ') if len(parsed_value) == 1: parsed_value = parsed_value[0].split(' | ') parsed_value = [v.strip() for v in parsed_value] elif key == 'tagline': parsed_value = value.replace('more', '').strip() elif key == 'plot_outline': parsed_value = value.replace('(view trailer)', '').strip() if parsed_value.endswith('more'): parsed_value = parsed_value[:-4].strip() elif key == 'tv_series': m = re.compile('
    (.*?)').findall(value) if m: parsed_value = m[0][0] else: parsed_value = '' elif key == 'also_known_as': parsed_value = '' m = re.compile('(.*) \(International: English title').findall(value) if m: parsed_value = m[0] else: m = re.compile('(.*) \(USA').findall(value) if m: parsed_value = m[0] parsed_value = parsed_value.split('
    ')[-1].split('(')[0] director = self.getCredits().get('director', None) if director: director = director[0] parsed_value = parsed_value.replace(director, '') if parsed_value.startswith("'s"): parsed_value = parsed_value[2:].strip() parsed_value = parsed_value.strip() else: print value parsed_value = value return parsed_value def parseTitle(self): title = getMovieTitle(self.imdb) title = normalizeTitle(title) if title.startswith('"') and title.find('"',1) > 0 and \ title.find('"',1) == title.rfind('"'): se = re.compile("Season (\d*), Episode (\d*)\)").findall(data) if se: se = se[0] se = ' (S%02dE%02d)' % (int(se[0]), int(se[1])) title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:] else: title = normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:] return normalizeTitle(title) def parseYear(self): year = '' data = self.getPage() soup = BeautifulSoup(data) html_title = soup('div', {'id': 'tn15title'}) if not html_title: html_title = soup('title') if html_title: html_title = unicode(html_title[0]) html_title = stripTags(html_title) year = re.compile('\((\d{4})\)').findall(html_title) if not year: year = re.compile('\((\d{4})/').findall(html_title) if year: year = year[0] else: year = '' return year def parse(self): data = self.getPage() IMDbDict ={} #Poster IMDbDict['poster'] = getMoviePoster(self.imdb) if not IMDbDict['poster']: IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif' #Title, Year IMDbDict['year'] = self.parseYear() IMDbDict['title'] = self.parseTitle() #Rating m = re.compile('(.*?)/10', re.IGNORECASE).search(data) if m: IMDbDict['rating'] = int(float(m.group(1)) * 1000) else: IMDbDict['rating'] = -1 #Votes m = re.compile('\((.*?) votes\)', re.IGNORECASE).findall(data) if m: IMDbDict['votes'] = int(m[0].replace(',', '')) else: IMDbDict['votes'] = -1 data = data.replace('\n',' ') #some values keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as') for key in keys: IMDbDict[key] = '' IMDbDict['runtime'] = 0 soup = BeautifulSoup(data) for info in soup('div', {'class': 'info'}): key = unicode(info).split('
    ')[0].split('
    ') if len(key) > 1: raw_value = unicode(info).split('
    ')[1] key = key[1][:-1].lower().replace(' ', '_') if key in keys: IMDbDict[key] = self.parse_raw_value(key, raw_value) IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title']) #is episode IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '') IMDbDict['episodes'] = self.parseEpisodes() if IMDbDict['episodes']: IMDbDict['tvshow'] = True else: IMDbDict['tvshow'] = False IMDbDict['credits'] = self.getCredits() IMDbDict['plot'] = getMoviePlot(self.imdb) IMDbDict['keywords'] = getMovieKeywords(self.imdb) IMDbDict['trivia'] = getMovieTrivia(self.imdb) IMDbDict['connections'] = getMovieConnections(self.imdb) IMDbDict['locations'] = getMovieLocations(self.imdb) IMDbDict['release_date'] = self.parseReleaseinfo() IMDbDict['business'] = self.parseBusiness() IMDbDict['reviews'] = getMovieExternalReviews(self.imdb) IMDbDict['stills'] = getMovieStills(self.imdb) #IMDbDict['trailer'] = getMovieTrailer(self.imdb) self.IMDbDict = IMDbDict if IMDbDict['episode_of']: episode_of =IMDb(IMDbDict['episode_of']).parse() for key in ('country', 'language'): if not IMDbDict[key]: IMDbDict[key] = episode_of[key] return self.IMDbDict def getCredits(self): raw_credits = getMovieCredits(self.imdb) credits = {} def getNames(creditList): return [stripTags(c[0]) for c in creditList] credits['director'] = getNames(raw_credits.get('directors', '')) credits['writer'] = getNames(raw_credits.get('writers', '')) credits['producer'] = getNames(raw_credits.get('producers', '')) credits['cast'] = [(stripTags(c[0]),stripTags(c[1])) for c in raw_credits.get('cast', [])] self.credits = credits return self.credits def parseEpisodes(self): episodes = {} data = getUrlUnicode(self.episodesUrl) cdata = data.replace('\r\n', ' ') regexp = r'''

    Season (.*?), Episode (.*?): (.*?)

    (.*?)
    (.*?)
    ''' reg = re.compile(regexp, re.IGNORECASE) m = reg.findall(cdata) for match in m: try: episode = "S%02dE%02d" % (int(match[0]), int(match[1])) episodes[episode] = {} episodes[episode]['imdb'] = match[2] episodes[episode]['title'] = match[3].strip() if episodes[episode]['title'].startswith('Episode #%d'%int(match[0])): episodes[episode]['title'] = u'' description = decodeHtml(match[5]) description = stripTags(description.split('Next US airings:')[0]) episodes[episode]['description'] = description episodes[episode]['date'] = '' try: d = stripTags(match[4]) d = d.replace('Original Air Date: ', '') d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y')) episodes[episode]['date'] = d except: pass except: import traceback print traceback.print_exc() pass self.episodes = episodes return self.episodes def getReleaseinfo(self): return getUrlUnicode(self.releaseinfoUrl) def parseReleaseinfo(self): soup = BeautifulSoup(self.getReleaseinfo()) info = soup('table',{'border': '0', 'cellpadding':'2'}) if info: for row in info[0]('tr'): d = row('td', {'align':'right'}) if d: try: possible_date = stripTags(unicode(d[0])).strip() rdate = time.strptime(possible_date, "%d %B %Y") rdate = time.strftime('%Y-%m-%d', rdate) return rdate except: pass return None def getBusiness(self): return getUrlUnicode(self.businessUrl) def parseBusiness(self): soup = BeautifulSoup(self.getBusiness()) business = {'budget': 0, 'gross': 0, 'profit': 0} content = soup('div', {'id': 'tn15content'})[0] blocks = unicode(content).split('
    ')[1:] for c in blocks: cs = BeautifulSoup(c) line = c.split('
    ') if line: title = line[0] line = line[1] if title in ['Budget', 'Gross']: values = re.compile('\$(.*?) ').findall(line) values = [int(value.replace(',','')) for value in values] if values: business[title.lower()] = max(values) if business['budget'] and business['gross']: business['profit'] = business['gross'] - business['budget'] return business def guess(title, director=''): #FIXME: proper file -> title title = title.split('-')[0] title = title.split('(')[0] title = title.split('.')[0] title = title.strip() imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8')) return_url = '' #lest first try google #i.e. site:imdb.com Michael Stevens Sin if director: search = 'site:imdb.com %s "%s"' % (director, title) else: search = 'site:imdb.com "%s"' % title for (name, url, desc) in google.find(search, 2): if url.startswith('http://www.imdb.com/title/tt'): return url[28:35] try: req = urllib2.Request(imdb_url, None, oxutils.net.DEFAULT_HEADERS) u = urllib2.urlopen(req) data = u.read() return_url = u.url u.close() except: return None if return_url.startswith('http://www.imdb.com/title/tt'): return return_url[28:35] if data: imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?
    1. .*?