# -*- Mode: Python; -*- # -*- coding: utf-8 -*- # vi:si:et:sw=2:sts=2:ts=2 from oxutils import * import urllib2 from urllib import quote, unquote import re, time import os import time from BeautifulSoup import BeautifulSoup import chardet import oxutils from oxutils import stripTags, htmldecode, findRegexp from oxutils.cache import getUrl, getUrlUnicode from oxutils.normalize import normalizeTitle, normalizeImdbId import google def getMovieId(title, director='', year=''): if year: title = "%s (%s)" % (title, year) if director: query = 'site:imdb.com %s "%s"' % (director, title) else: query = 'site:imdb.com "%s"' % title for (name, url, desc) in google.find(query, 3): if url.startswith('http://www.imdb.com/title/tt'): return url[28:35] def getMovieData(imdbId): return IMDb(imdbId).parse() # internal functions below def getUrlBase(imdbId): return "http://www.imdb.com/title/tt%s" % imdbId def getRawMovieData(imdbId): imdbId = normalizeImdbId(imdbId) data = dict() data['title'] = getTitle(imdbId) data['credits'] = getCredits(imdbId) data['poster'] = getPoster(imdbId) data['trailers'] = getMovieTrailers(imdbId) def parseBase(imdbId): data = getUrl(getUrlBase(imdbId)) soup = BeautifulSoup(data) info = dict() for i in soup('div', {'class':'info'}): title = i('h5') if title: title=title[0] txt = title.findNext() title = stripTags(unicode(title)) if title.endswith(':'): title = title[:-1] info[title] = htmldecode(stripTags(unicode(txt))) return info return soup def getPoster(imdbId): data = getUrl(getUrlBase(imdbId)) return findRegexp(data, 'name="poster".*?', ' ').replace(' ', ' ') title = htmldecode(html_title) title = stripTags(title) title = re.sub('\(\d\d\d\d\)', '', title) title = re.sub('\(\d\d\d\d/I*\)', '', title) for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'): title = title.replace(t, '') title = title.strip() if title.find(u'\xa0') > -1: title = title[:title.find(u'\xa0')] if title.startswith('"') and title.endswith('"'): title = title[1:-1] return title def creditList(data, section=None): if section == 'cast': credits_ = re.compile('''(.*?).*?(.*?)''').findall(data) else: credits_ = re.compile('''.*?(.*?)(.*?)''').findall(data) credits = [] for c_ in credits_: c = [c_[0].strip(), c_[1].strip()] if section=='writers': c[1] = c[1].replace('
', '').strip().replace(')', '').replace('(','') if c[1].endswith(' and'): c[1] = c[1][:-4] credits.append(c) return credits def getCredits(imdbId): credits = dict() url = "%s/fullcredits" % getUrlBase(imdbId) data = getUrlUnicode(url) groups = data.split('
') for g in groups: section = re.compile('''name="(.*?)".*? href="/Glossary''').findall(g) if section: credits[section[0]] = creditList(g, section[0]) return credits def getMovieTrailers(imdbId): url = "%s/trailers" % getUrlBase(imdbId) data = getUrlUnicode(url) soup = BeautifulSoup(data) videos = soup('div', {'class':"video-gallery"}) trailers = [] if videos: for a in videos[0]('a'): title = stripTags(unicode(a)).strip() url = 'http://www.imdb.com' + a['href'] videoId = findRegexp(url, '/(vi\d*?)/') iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId iframe = getUrlUnicode(iframeUrl) videoUrl = unquote(findRegexp(iframe, 'addVariable\("file", "(.*?)"')) trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl}) return trailers def getMovieStills(imdbId): url = "http://www.imdb.com/gallery/ss/%s" % imdbId data = getUrlUnicode(url) s_ = re.compile(''' int(s[1]): stills.append("http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (imdbId, s[2])) if not stills: s_ = re.compile(''' int(s[1]): stills.append("http://%sf.jpg" % s[2]) return stills '''the old code below''' class IMDb: def __init__(self, imdbId): self.imdb = imdbId self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb self.businessUrl = "%sbusiness" % self.pageUrl self.connectionsUrl = "%smovieconnections" % self.pageUrl self.creditsUrl = "%sfullcredits" % self.pageUrl self.episodesUrl = "%sepisodes" % self.pageUrl self.keywordUrl = "%skeywords" % self.pageUrl self.plotUrl = "%splotsummary" % self.pageUrl self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl self.triviaUrl = "%strivia" % self.pageUrl self.locationUrl = "%slocations" % self.pageUrl self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl def getPage(self): return getUrlUnicode(self.pageUrl) def parse_raw_value(self, key, value): if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'): value = unicode(value, 'utf-8') value = stripTags(value).strip() if key == 'runtime': parsed_value = findRegexp(value, '(.*?) min') parsed_value = findRegexp(parsed_value, '([0-9]+)') if not parsed_value: parsed_value = findRegexp(value, '(.*?) sec') parsed_value = findRegexp(parsed_value, '([0-9]+)') if not parsed_value: parsed_value = 0 else: parsed_value = int(parsed_value) else: parsed_value = int(parsed_value) * 60 elif key in ('country', 'language'): parsed_value = value.split(' / ') parsed_value = [v.strip() for v in parsed_value] elif key == 'genre': parsed_value = value.replace('more', '').strip().split(' / ') parsed_value = [v.strip() for v in parsed_value] elif key == 'tagline': parsed_value = value.replace('more', '').strip() elif key == 'plot_outline': parsed_value = value.replace('(view trailer)', '').strip() if parsed_value.endswith('more'): parsed_value = parsed_value[:-4].strip() elif key == 'tv_series': m = re.compile('(.*?)').findall(value) if m: parsed_value = m[0][0] else: parsed_value = '' elif key == 'also_known_as': parsed_value = '' m = re.compile('(.*) \(International: English title').findall(value) if m: parsed_value = m[0] else: m = re.compile('(.*) \(USA').findall(value) if m: parsed_value = m[0] parsed_value = parsed_value.split('
')[-1].split('(')[0] director = self.getCredits().get('director', None) if director: director = director[0] parsed_value = parsed_value.replace(director, '') if parsed_value.startswith("'s"): parsed_value = parsed_value[2:].strip() parsed_value = parsed_value.strip() else: print value parsed_value = value return parsed_value def parseTitle(self): title = getTitle(self.imdb) title = normalizeTitle(title) if title.startswith('"') and title.find('"',1) > 0 and \ title.find('"',1) == title.rfind('"'): se = re.compile("Season (\d*), Episode (\d*)\)").findall(data) if se: se = se[0] se = ' (S%02dE%02d)' % (int(se[0]), int(se[1])) title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:] else: title = normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:] return normalizeTitle(title) def parseYear(self): year = '' data = self.getPage() soup = BeautifulSoup(data) html_title = soup('div', {'id': 'tn15title'}) if not html_title: html_title = soup('title') if html_title: html_title = str(html_title[0]) html_title = stripTags(html_title) year = re.compile('\((\d{4})\)').findall(html_title) if not year: year = re.compile('\((\d{4})/').findall(html_title) if year: year = year[0] else: year = '' return year def parse(self): data = self.getPage() IMDbDict ={} #Poster IMDbDict['poster'] = getPoster(self.imdb) if not IMDbDict['poster']: IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif' #Title, Year IMDbDict['year'] = self.parseYear() IMDbDict['title'] = self.parseTitle() #Rating m = re.compile('(.*?)/10', re.IGNORECASE).search(data) if m: IMDbDict['rating'] = int(float(m.group(1)) * 1000) else: IMDbDict['rating'] = -1 #Votes m = re.compile('\((.*?) votes\)', re.IGNORECASE).findall(data) if m: IMDbDict['votes'] = int(m[0].replace(',', '')) else: IMDbDict['votes'] = -1 data = data.replace('\n',' ') #some values keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as') for key in keys: IMDbDict[key] = '' IMDbDict['runtime'] = 0 soup = BeautifulSoup(data) for info in soup('div', {'class': 'info'}): key = str(info).split('
')[0].split('
') if len(key) > 1: raw_value = str(info).split('
')[1] key = key[1][:-1].lower().replace(' ', '_') if key in keys: IMDbDict[key] = self.parse_raw_value(key, raw_value) IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title']) #is episode IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '') IMDbDict['episodes'] = self.parseEpisodes() if IMDbDict['episodes']: IMDbDict['tvshow'] = True else: IMDbDict['tvshow'] = False IMDbDict['credits'] = self.getCredits() IMDbDict['plot'] = self.parsePlot() IMDbDict['keywords'] = self.parseKeywords() IMDbDict['trivia'] = self.parseTrivia() IMDbDict['connections'] = self.parseConnections() IMDbDict['locations'] = self.parseLocations() IMDbDict['release_date'] = self.parseReleaseinfo() IMDbDict['business'] = self.parseBusiness() IMDbDict['reviews'] = self.parseExternalreviews() IMDbDict['stills'] = getMovieStills(self.imdb) #IMDbDict['trailer'] = self.parseTrailer() self.IMDbDict = IMDbDict if IMDbDict['episode_of']: episode_of =IMDb(IMDbDict['episode_of']).parse() for key in ('country', 'language'): if not IMDbDict[key]: IMDbDict[key] = episode_of[key] return self.IMDbDict def getCredits(self): raw_credits = getCredits(self.imdb) credits = {} def getNames(creditList): return [stripTags(c[0]) for c in creditList] credits['director'] = getNames(raw_credits['directors']) credits['writer'] = getNames(raw_credits['writers']) credits['producer'] = getNames(raw_credits['producers']) credits['cast'] = [(stripTags(c[0]),stripTags(c[1])) for c in raw_credits['cast']] self.credits = credits return self.credits def parsePlot(self): data = getUrlUnicode(self.plotUrl) soup = BeautifulSoup(data) plot = soup('p', {'class':'plotpar'}) if plot: plot = unicode(plot[0]).split('')[0] else: plot = u'' plot = stripTags(plot).strip() self.plot = plot return plot def parseEpisodes(self): episodes = {} data = getUrlUnicode(self.episodesUrl) cdata = data.replace('\r\n', ' ') regexp = r'''

Season (.*?), Episode (.*?): (.*?)

(.*?)
(.*?)
''' reg = re.compile(regexp, re.IGNORECASE) m = reg.findall(cdata) for match in m: try: episode = "S%02dE%02d" % (int(match[0]), int(match[1])) episodes[episode] = {} episodes[episode]['imdb'] = match[2] episodes[episode]['title'] = match[3].strip() if episodes[episode]['title'].startswith('Episode #%d'%int(match[0])): episodes[episode]['title'] = u'' description = htmldecode(match[5]) description = stripTags(description.split('Next US airings:')[0]) episodes[episode]['description'] = description episodes[episode]['date'] = '' try: d = stripTags(match[4]) d = d.replace('Original Air Date: ', '') d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y')) episodes[episode]['date'] = d except: pass except: import traceback print traceback.print_exc() pass self.episodes = episodes return self.episodes def parseLocations(self): data = getUrlUnicode(self.locationUrl) soup = BeautifulSoup(data) locations = [] for key in soup('a', {'href': re.compile('^/List')}): locations.append(htmldecode(key.string)) self.locations = locations return self.locations def parseKeywords(self): data = getUrlUnicode(self.keywordUrl) soup = BeautifulSoup(data) keywords = [] for key in soup('a', {'href': re.compile('^/keyword/')}): k = htmldecode(key.string) k = k.replace(u'\xa0', ' ') keywords.append(k) self.keywords = keywords return self.keywords def parseTrivia(self): data = getUrlUnicode(self.triviaUrl) soup = BeautifulSoup(data) trivia = [] triviaList = [] for i in soup('ul', {'class': "trivia"}): for t in i('li'): t = str(t).replace('
', '').strip() if t.startswith('
  • ') and t.endswith('
  • '): t = t[4:-5].strip() trivia.append(t) self.trivia = trivia return self.trivia def getConnections(self): return getUrlUnicode(self.connectionsUrl) def parseConnections(self): connections = {} soup = BeautifulSoup(self.getConnections()) content = soup('div', {'id': 'tn15content'})[0] blocks = str(content).split('
    ')[1:] for c in blocks: connection = c.split('
    ')[0] cs = BeautifulSoup(c) if connection: #relation -> list of imdb ids connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})] return connections def getReleaseinfo(self): return getUrlUnicode(self.releaseinfoUrl) def parseReleaseinfo(self): soup = BeautifulSoup(self.getReleaseinfo()) info = soup('table',{'border': '0', 'cellpadding':'2'}) if info: for row in info[0]('tr'): d = row('td', {'align':'right'}) if d: try: possible_date = stripTags(str(d[0])).strip() rdate = time.strptime(possible_date, "%d %B %Y") rdate = time.strftime('%Y-%m-%d', rdate) return rdate except: pass return None def getBusiness(self): return getUrlUnicode(self.businessUrl) def parseBusiness(self): soup = BeautifulSoup(self.getBusiness()) business = {'budget': 0, 'gross': 0, 'profit': 0} content = soup('div', {'id': 'tn15content'})[0] blocks = str(content).split('
    ')[1:] for c in blocks: cs = BeautifulSoup(c) line = c.split('
    ') if line: title = line[0] line = line[1] if title in ['Budget', 'Gross']: values = re.compile('\$(.*?) ').findall(line) values = [int(value.replace(',','')) for value in values] if values: business[title.lower()] = max(values) if business['budget'] and business['gross']: business['profit'] = business['gross'] - business['budget'] return business def getExternalreviews(self): return getUrlUnicode(self.externalreviewsUrl) def parseExternalreviews(self): soup = BeautifulSoup(self.getExternalreviews()) ol = soup('ol') if ol: ol = ol[0] ret = {} for li in ol('li'): try: a = li('a')[0] href = a.get('href') txt = a.contents[0] ret[href] = txt except: pass return ret return {} def guess(title, director=''): #FIXME: proper file -> title title = title.split('-')[0] title = title.split('(')[0] title = title.split('.')[0] title = title.strip() imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8')) return_url = '' #lest first try google #i.e. site:imdb.com Michael Stevens Sin if director: search = 'site:imdb.com %s "%s"' % (director, title) else: search = 'site:imdb.com "%s"' % title for (name, url, desc) in google.find(search, 2): if url.startswith('http://www.imdb.com/title/tt'): return url[28:35] try: req = urllib2.Request(imdb_url, None, oxutils.net.DEFAULT_HEADERS) u = urllib2.urlopen(req) data = u.read() return_url = u.url u.close() except: return None if return_url.startswith('http://www.imdb.com/title/tt'): return return_url[28:35] if data: imdb_id = findRegexp(data.replace('\n', ' '), 'Popular Results.*?
    1. .*?