# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
"""Scrape title data from imdb.com pages.

The IMDb class downloads the pages that belong to one title (main page,
credits, plot summary, ...) through oxutils.cache and extracts values from
them with regular expressions and BeautifulSoup.

NOTE(review): this file was restored from a copy in which line breaks were
collapsed and HTML markup inside string literals had been stripped. Every
literal that had to be inferred is marked with NOTE(review) below and must
be checked against the upstream file / the live markup before relying on it.
"""
from oxutils import *
import urllib2
from urllib import quote
import re
import os
import time

from BeautifulSoup import BeautifulSoup
import chardet
import oxutils
from oxutils import stripTags, htmldecode
from oxutils.cache import getUrl, getUrlUnicode
from oxutils.normalize import normalizeTitle
import google


def _get_data(url):
  """Return the content of url via getUrl, or None if fetching fails."""
  data = None
  try:
    data = getUrl(url)
  except Exception:
    # best effort: report the failure and let the caller deal with None
    print("error reading data from %s" % url)
  return data


def get_image(url):
  """Return the raw data found at url."""
  return getUrl(url)


def _castList(data, regexp):
  """Return the unique person names linked inside the first regexp match.

  regexp must contain one group; the first match of that group in data is
  parsed as HTML and the text of every link to /name/nm... is collected,
  in document order and without duplicates.
  """
  found = re.compile(regexp).findall(data)
  if not found:
    return []
  soup = BeautifulSoup(found[0])
  names = []
  for link in soup('a', {'href': re.compile('/name/nm')}):
    if link.string:
      cast = stripTags(link.string)
      if cast not in names:
        names.append(cast)
  return names


def _getTerm(data, regexp):
  """Return group 1 of the first case-insensitive match of regexp in data.

  Tags are stripped from the match and the result is returned utf-8
  encoded. An empty string is returned if nothing matches or parsing fails.
  """
  term = ''
  try:
    m = re.compile(regexp, re.IGNORECASE).search(data)
    if m:
      term = stripTags(m.group(1)).strip()
  except Exception:
    print("warning, parsing failed for %s" % regexp)
  return term.encode('utf8')


class IMDb:
  """Lazy loader and parser for the pages of a single IMDb title.

  imdb is the numeric part of the title id; it is inserted into
  http://www.imdb.com/title/tt%s/.
  """

  def __init__(self, imdb):
    self.imdb = imdb
    # each <name>Source caches the page downloaded from <name>Url
    self.pageSource = None
    self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb

    self.businessSource = None
    self.businessUrl = "%sbusiness" % self.pageUrl
    self.connectionsSource = None
    self.connectionsUrl = "%smovieconnections" % self.pageUrl
    self.creditsSource = None
    self.creditsUrl = "%sfullcredits" % self.pageUrl
    self.episodesSource = None
    self.episodesUrl = "%sepisodes" % self.pageUrl
    self.keywordSource = None
    self.keywordUrl = "%skeywords" % self.pageUrl
    self.plotSource = None
    self.plotUrl = "%splotsummary" % self.pageUrl
    self.releaseinfoSource = None
    self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
    self.triviaSource = None
    self.triviaUrl = "%strivia" % self.pageUrl
    self.locationSource = None
    self.locationUrl = "%slocations" % self.pageUrl
    self.externalreviewsSource = None
    self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
    self.trailerSource = None
    self.trailerUrl = "%strailers" % self.pageUrl

  def _getSource(self, name, forcereload=False):
    """Return self.<name>Source, downloading self.<name>Url when needed."""
    attr = '%sSource' % name
    if forcereload or not getattr(self, attr):
      setattr(self, attr, getUrlUnicode(getattr(self, '%sUrl' % name)))
    return getattr(self, attr)

  def getPage(self, forcereload = False):
    """Return the (cached) source of the main title page."""
    return self._getSource('page', forcereload)

  def parse_raw_value(self, key, value):
    """Convert the raw HTML value of one info block into its parsed form.

    runtime becomes an int (minutes are multiplied by 60, a value given in
    seconds is used as is, 0 if neither is found); country, language and
    genre become lists; tagline, plot_outline, tv_series and also_known_as
    become strings.
    """
    if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
      value = unicode(value, 'utf-8')
      value = stripTags(value).strip()
    if key == 'runtime':
      parsed_value = _getTerm(value, '(.*?) min')
      parsed_value = _getTerm(parsed_value, '([0-9]+)')
      if not parsed_value:
        parsed_value = _getTerm(value, '(.*?) sec')
        parsed_value = _getTerm(parsed_value, '([0-9]+)')
        if not parsed_value:
          parsed_value = 0
        else:
          parsed_value = int(parsed_value)
      else:
        parsed_value = int(parsed_value) * 60
    elif key in ('country', 'language'):
      parsed_value = value.split(' / ')
    elif key == 'genre':
      parsed_value = value.replace('more', '').strip().split(' / ')
    elif key == 'tagline':
      parsed_value = value.replace('more', '').strip()
    elif key == 'plot_outline':
      parsed_value = value.replace('(view trailer)', '').strip()
      if parsed_value.endswith('more'):
        parsed_value = parsed_value[:-4].strip()
    elif key == 'tv_series':
      # NOTE(review): markup restored; two groups are required because the
      # code reads m[0][0] (the id of the series) - confirm the pattern.
      m = re.compile('<a href="/title/tt(.*?)/">(.*?)</a>').findall(value)
      if m:
        parsed_value = m[0][0]
      else:
        parsed_value = ''
    elif key == 'also_known_as':
      parsed_value = ''
      m = re.compile(r'(.*) \(International: English title').findall(value)
      if m:
        parsed_value = m[0]
      else:
        m = re.compile(r'(.*) \(USA').findall(value)
        if m:
          parsed_value = m[0]
      # NOTE(review): separator restored as '<br />' - confirm.
      parsed_value = parsed_value.split('<br />')[-1].split('(')[0]
      director = self.parseCredits().get('director', None)
      if director:
        # drop a leading "<Director>'s" from the title
        director = director[0]
        parsed_value = parsed_value.replace(director, '')
        if parsed_value.startswith("'s"):
          parsed_value = parsed_value[2:].strip()
      parsed_value = parsed_value.strip()
    else:
      print(value)
      parsed_value = value
    return parsed_value

  def parseTitle(self):
    """Return the normalized title found on the main page.

    Year and type markers such as (TV) are removed. A title that starts
    with a quoted series name is rewritten to "Series (SxxEyy) episode"
    when season and episode are found on the page, "Series: episode"
    otherwise.
    """
    title = ''
    data = self.getPage()
    soup = BeautifulSoup(data)
    html_title = soup('div', {'id': 'tn15title'})
    if not html_title:
      html_title = soup('title')
    if html_title:
      html_title = str(html_title[0])
      # NOTE(review): both replace() arguments restored ('<br />' and a
      # double space) - confirm.
      html_title = html_title.replace('<br />', ' ').replace('  ', ' ')
      title = stripTags(html_title)
      title = re.sub(r'\(\d\d\d\d\)', '', title)
      title = re.sub(r'\(\d\d\d\d/I*\)', '', title)
      for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
        title = title.replace(t, '')
      if title.find(u'\xa0') > -1:
        title = title[:title.find(u'\xa0')]
      title = normalizeTitle(title.strip())
      if title.startswith('"') and title.endswith('"'):
        title = normalizeTitle(title[1:-1])
      elif title.startswith('"') and title.find('"', 1) > 0 and \
          title.find('"', 1) == title.rfind('"'):
        # exactly one closing quote: "Series" episode title
        closing = title.rfind('"')
        se = re.compile(r"Season (\d*), Episode (\d*)\)").findall(data)
        if se:
          se = se[0]
          se = ' (S%02dE%02d)' % (int(se[0]), int(se[1]))
          title = normalizeTitle(title[1:closing]) + se + title[closing+1:]
        else:
          title = normalizeTitle(title[1:closing]) + ':' + title[closing+1:]
    return normalizeTitle(title)

  def parseYear(self):
    """Return the four digit year from the page title, '' if not found."""
    year = ''
    data = self.getPage()
    soup = BeautifulSoup(data)
    html_title = soup('div', {'id': 'tn15title'})
    if not html_title:
      html_title = soup('title')
    if html_title:
      html_title = stripTags(str(html_title[0]))
      # "(1999)" first, then the "(1999/I)" form
      year = re.compile(r'\((\d\d\d\d)\)').findall(html_title)
      if not year:
        year = re.compile(r'\((\d\d\d\d)/').findall(html_title)
      if year:
        year = year[0]
      else:
        year = ''
    return year

  def parse(self):
    """Parse all pages of the title and return the result as a dict.

    The dict is also stored in self.IMDbDict. For an episode, empty country
    and language values are taken from the series it belongs to.
    """
    data = self.getPage()
    IMDbDict = {}
    # NOTE(review): the statements from the poster pattern up to the rating
    # pattern were lost in the damaged copy. The poster pattern, the
    # title/year assignments ('title' is read further down) and the rating
    # pattern are reconstructions; other statements may be missing here.
    #Poster
    IMDbDict['poster'] = _getTerm(data, 'name="poster".*?<img .*?src="(.*?)"')
    #Title, Year
    IMDbDict['year'] = self.parseYear()
    IMDbDict['title'] = self.parseTitle()

    #Rating
    m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
    if m:
      IMDbDict['rating'] = int(float(m.group(1)) * 1000)
    else:
      IMDbDict['rating'] = -1
    #Votes
    # NOTE(review): markup in this pattern restored - confirm.
    m = re.compile(r'<small>\(<a href="ratings">(.*?) votes</a>\)</small>', re.IGNORECASE).findall(data)
    if m:
      IMDbDict['votes'] = int(m[0].replace(',', ''))
    else:
      IMDbDict['votes'] = -1

    data = data.replace('\n', ' ')
    #some values
    keys = ('runtime', 'language', 'genre', 'country', 'tagline',
            'plot_outline', 'tv_series', 'also_known_as')
    for key in keys:
      IMDbDict[key] = ''
    IMDbDict['runtime'] = 0
    soup = BeautifulSoup(data)
    for info in soup('div', {'class': 'info'}):
      # NOTE(review): '<h5>' / '</h5>' separators restored - confirm.
      key = str(info).split('</h5>')[0].split('<h5>')
      if len(key) > 1:
        raw_value = str(info).split('</h5>')[1]
        # "Also Known As:" -> "also_known_as"
        key = key[1][:-1].lower().replace(' ', '_')
        if key in keys:
          IMDbDict[key] = self.parse_raw_value(key, raw_value)
    IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title'])
    #is episode
    IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')

    IMDbDict['episodes'] = self.parseEpisodes()
    IMDbDict['tvshow'] = bool(IMDbDict['episodes'])
    IMDbDict['credits'] = self.parseCredits()
    IMDbDict['plot'] = self.parsePlot()
    IMDbDict['keywords'] = self.parseKeywords()

    IMDbDict['trivia'] = self.parseTrivia()
    IMDbDict['connections'] = self.parseConnections()
    IMDbDict['locations'] = self.parseLocations()
    IMDbDict['release_date'] = self.parseReleaseinfo()
    IMDbDict['business'] = self.parseBusiness()
    IMDbDict['reviews'] = self.parseExternalreviews()
    IMDbDict['stills'] = getMovieStills(self.imdb)
    #IMDbDict['trailer'] = self.parseTrailer()
    self.IMDbDict = IMDbDict

    if IMDbDict['episode_of']:
      episode_of = IMDb(IMDbDict['episode_of']).parse()
      for key in ('country', 'language'):
        if not IMDbDict[key]:
          IMDbDict[key] = episode_of[key]
    return self.IMDbDict

  def getCredits(self, forcereload = False):
    """Return the (cached) source of the full credits page."""
    return self._getSource('credits', forcereload)

  def parseCredits(self):
    """Return director, writer, producer and cast of the title.

    director, writer and producer are lists of names, cast is a list of
    (name, role) tuples. The result is also stored in self.credits.
    """
    data = self.getCredits()
    credits = {}
    # NOTE(review): the '<tr>' / '</table>' parts of these three patterns
    # are restored - confirm.
    credits['director'] = _castList(data, 'Directed by.*?(<tr>.*?)</table>')
    credits['writer'] = _castList(data, 'Writing credits.*?(<tr>.*?)</table>')
    credits['producer'] = _castList(data, 'Produced by.*?(<tr>.*?)</table>')
    credits['cast'] = []
    soup = BeautifulSoup(data)
    cast = soup('table', {'class': 'cast'})
    if cast:
      cast = str(cast[0]).replace(u'\xa0', ' ')
      # NOTE(review): markup in this pattern restored - confirm.
      names = re.compile('<a href="/name/nm.*?/">(.*?)</a>.*?</td><td class="char">(.*?)</td></tr>').findall(cast)
      for real_name, role_name in names:
        if role_name:
          role_name = role_name.split('(')[0].replace('/ ...', '')
        credits['cast'].append((stripTags(real_name), stripTags(role_name)))
    self.credits = credits
    return self.credits

  def getPlot(self, forcereload = False):
    """Return the (cached) source of the plot summary page."""
    return self._getSource('plot', forcereload)

  def parsePlot(self):
    """Return the first plot summary as text, u'' if there is none."""
    soup = BeautifulSoup(self.getPlot())
    plot = soup('p', {'class': 'plotpar'})
    if plot:
      # NOTE(review): separator restored as '<i>' - confirm.
      plot = unicode(plot[0]).split('<i>')[0]
    else:
      plot = u''
    plot = stripTags(plot).strip()
    self.plot = plot
    return plot

  def getEpisodes(self, forcereload = False):
    """Return the (cached) source of the episodes page."""
    return self._getSource('episodes', forcereload)

  def parseEpisodes(self):
    """Return the episodes of a series keyed by "SxxEyy".

    Each value is a dict with the keys imdb, title, description and date
    (YYYY-MM-DD, '' if the air date can not be parsed). The result is also
    stored in self.episodes.
    """
    episodes = {}
    cdata = self.getEpisodes().replace('\r\n', ' ')
    # NOTE(review): markup in this pattern restored; six groups are needed
    # because match[5] is read below - confirm.
    regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
    reg = re.compile(regexp, re.IGNORECASE)
    for match in reg.findall(cdata):
      try:
        episode = "S%02dE%02d" % (int(match[0]), int(match[1]))
        entry = episodes[episode] = {}
        entry['imdb'] = match[2]
        entry['title'] = match[3].strip()
        if entry['title'].startswith('Episode #%d' % int(match[0])):
          # generic "Episode #s.e" placeholder, not a real title
          entry['title'] = u''
        description = htmldecode(match[5])
        description = stripTags(description.split('Next US airings:')[0])
        entry['description'] = description
        entry['date'] = ''
        try:
          d = stripTags(match[4])
          d = d.replace('Original Air Date: ', '')
          d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
          entry['date'] = d
        except ValueError:
          # unknown or partial air date: keep ''
          pass
      except Exception:
        # one malformed episode must not abort the whole list
        import traceback
        traceback.print_exc()
    self.episodes = episodes
    return self.episodes

  def getLocations(self, forcereload = False):
    """Return the (cached) source of the locations page."""
    # the page is cached in locationSource; it used to be written to
    # keywordSource, which broke caching here and in getKeywords()
    return self._getSource('location', forcereload)

  def parseLocations(self):
    """Return the list of filming locations; also stored in self.locations."""
    soup = BeautifulSoup(self.getLocations())
    locations = []
    for key in soup('a', {'href': re.compile('^/List')}):
      locations.append(htmldecode(key.string))
    self.locations = locations
    return self.locations

  def getKeywords(self, forcereload = False):
    """Return the (cached) source of the keywords page."""
    return self._getSource('keyword', forcereload)

  def parseKeywords(self):
    """Return the list of keywords; also stored in self.keywords."""
    soup = BeautifulSoup(self.getKeywords())
    keywords = []
    for key in soup('a', {'href': re.compile('^/keyword/')}):
      k = htmldecode(key.string)
      k = k.replace(u'\xa0', ' ')
      keywords.append(k)
    self.keywords = keywords
    return self.keywords

  def getTrivia(self, forcereload = False):
    """Return the (cached) source of the trivia page."""
    return self._getSource('trivia', forcereload)

  def parseTrivia(self):
    """Return the trivia items as HTML strings; also stored in self.trivia."""
    trivia = []
    soup = BeautifulSoup(self.getTrivia())
    for i in soup('ul', {'class': "trivia"}):
      for t in i('li'):
        # NOTE(review): '<br />', '<li>' and '</li>' restored; the [4:-5]
        # slice matches the length of '<li>' and '</li>' - confirm.
        t = str(t).replace('<br />', '').strip()
        if t.startswith('<li>') and t.endswith('</li>'):
          t = t[4:-5].strip()
        trivia.append(t)
    self.trivia = trivia
    return self.trivia

  def getConnections(self, forcereload = False):
    """Return the (cached) source of the movie connections page."""
    return self._getSource('connections', forcereload)

  def parseConnections(self):
    """Return a dict mapping each relation to a list of imdb ids."""
    connections = {}
    soup = BeautifulSoup(self.getConnections())
    content = soup('div', {'id': 'tn15content'})
    if not content:
      return connections
    # NOTE(review): '<h5>' / '</h5>' separators restored - confirm.
    blocks = str(content[0]).split('<h5>')[1:]
    for c in blocks:
      connection = c.split('</h5>')[0]
      cs = BeautifulSoup(c)
      if connection:
        #relation -> list of imdb ids
        connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})]
    return connections

  def getReleaseinfo(self, forcereload = False):
    """Return the (cached) source of the release info page."""
    return self._getSource('releaseinfo', forcereload)

  def parseReleaseinfo(self):
    """Return the first parsable release date as YYYY-MM-DD, else None."""
    soup = BeautifulSoup(self.getReleaseinfo())
    info = soup('table', {'border': '0', 'cellpadding': '2'})
    if info:
      for row in info[0]('tr'):
        d = row('td', {'align': 'right'})
        if d:
          possible_date = stripTags(str(d[0])).strip()
          try:
            rdate = time.strptime(possible_date, "%d %B %Y")
          except ValueError:
            # not a full "day month year" date, try the next row
            continue
          return time.strftime('%Y-%m-%d', rdate)
    return None

  def getBusiness(self, forcereload = False):
    """Return the (cached) source of the business page."""
    return self._getSource('business', forcereload)

  def parseBusiness(self):
    """Return budget, gross and profit as a dict of ints.

    budget and gross are the largest dollar amounts listed in the
    respective section; profit is only set if both are known.
    """
    business = {'budget': 0, 'gross': 0, 'profit': 0}
    soup = BeautifulSoup(self.getBusiness())
    content = soup('div', {'id': 'tn15content'})
    if not content:
      return business
    amount = re.compile(r'\$(.*?) ')
    # NOTE(review): '<h5>' / '</h5>' separators restored - confirm.
    blocks = str(content[0]).split('<h5>')[1:]
    for c in blocks:
      line = c.split('</h5>')
      if len(line) < 2:
        # no section body after the heading
        continue
      title = line[0]
      line = line[1]
      if title in ['Budget', 'Gross']:
        values = []
        for value in amount.findall(line):
          try:
            values.append(int(value.replace(',', '')))
          except ValueError:
            # not a plain integer amount
            continue
        if values:
          business[title.lower()] = max(values)
    if business['budget'] and business['gross']:
      business['profit'] = business['gross'] - business['budget']
    return business

  def getExternalreviews(self, forcereload = False):
    """Return the (cached) source of the external reviews page."""
    return self._getSource('externalreviews', forcereload)

  def parseExternalreviews(self):
    """Return a dict mapping review urls to link texts, {} if none."""
    soup = BeautifulSoup(self.getExternalreviews())
    ol = soup('ol')
    if not ol:
      return {}
    ret = {}
    for li in ol[0]('li'):
      try:
        a = li('a')[0]
        href = a.get('href')
        txt = a.contents[0]
      except IndexError:
        # list item without a link or with an empty link
        continue
      ret[href] = txt
    return ret

  def getTrailer(self, forcereload = False):
    """Return the (cached) source of the trailers page."""
    return self._getSource('trailer', forcereload)

  def parseTrailer(self):
    """Return a dict mapping external trailer urls to their titles."""
    ret = {}
    soup = BeautifulSoup(self.getTrailer())
    for p in soup('p'):
      if p('a') and p.firstText():
        a = p('a')[0]
        href = a['href']
        if href and href.startswith('http'):
          # a.string is None for links with nested markup
          title = a.string or ''
          title = title.replace('www.', '')
          ret[href] = title
    return ret


def guess(title, director=''):
  """Return the imdb id that best matches title, or None.

  Google is asked first (restricted to imdb.com, including director if
  given), then the IMDb search page. Everything after the first '-', '('
  or '.' in title is ignored.
  """
  #FIXME: proper file -> title
  title = title.split('-')[0]
  title = title.split('(')[0]
  title = title.split('.')[0]
  title = title.strip()
  imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))
  return_url = ''

  #let's first try google
  #i.e. site:imdb.com Michael Stevens Sin
  if director:
    search = 'site:imdb.com %s "%s"' % (director, title)
  else:
    search = 'site:imdb.com "%s"' % title
  for (name, url, desc) in google.find(search, 2):
    if url.startswith('http://www.imdb.com/title/tt'):
      return url[28:35]

  try:
    req = urllib2.Request(imdb_url, None, oxutils.net.DEFAULT_HEADERS)
    u = urllib2.urlopen(req)
    try:
      data = u.read()
      return_url = u.url
    finally:
      u.close()
  except Exception:
    # best effort lookup: any network problem means "no guess"
    return None
  # a unique hit redirects straight to the title page
  if return_url.startswith('http://www.imdb.com/title/tt'):
    return return_url[28:35]
  if data:
    # NOTE(review): the end of this pattern and the rest of this function
    # were lost in the damaged copy and are reconstructed - confirm.
    imdb_id = _getTerm(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
    if imdb_id:
      return imdb_id
  return None


def getMovieStills(id):
  """Return the urls of the landscape format stills of a title.

  NOTE(review): the head of this function (signature, gallery url, both
  patterns and the width/height comparison) was lost in the damaged copy
  and is reconstructed; only the two stills.append() calls, "if not stills"
  and the return are original. Other definitions may be missing before it.
  """
  data = getUrl("http://imdb.com/gallery/ss/%s" % id)
  s_ = re.compile(r'''<img width="(\d*?)" height="(\d*?)" src="http://i.imdb.com/Photos/Ss/%s/th-(.*?).jpg"''' % id).findall(data)
  stills = []
  for s in s_:
    # width > height
    if int(s[0]) > int(s[1]):
      stills.append("http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (id, s[2]))
  if not stills:
    s_ = re.compile(r'''<img width="(\d*?)" height="(\d*?)" src="http://(.*?)p.jpg"''').findall(data)
    for s in s_:
      if int(s[0]) > int(s[1]):
        stills.append("http://%sf.jpg" % s[2])
  return stills


if __name__ == '__main__':
  import sys
  #print parse(sys.argv[1])
  print("imdb: %s" % guess(sys.argv[1]))