From 69adaeee00a7a6b4961a6eaea336a2377d9285fa Mon Sep 17 00:00:00 2001
From: j
Date: Tue, 29 Apr 2008 18:12:27 +0200
Subject: [PATCH] changes to imdb.py

 * use more oxutils functions
 * start migrating to a raw dict; first part: parse full cast, with names, from imdb
 * add getMovieId

---
 ox/__init__.py |   4 +-
 ox/google.py   |   1 -
 ox/imdb.py     | 309 ++++++++++++++++++++++++++-----------------------
 3 files changed, 165 insertions(+), 149 deletions(-)

diff --git a/ox/__init__.py b/ox/__init__.py
index e3c36ff..4e1b167 100644
--- a/ox/__init__.py
+++ b/ox/__init__.py
@@ -4,5 +4,7 @@
 
 __version__ = '0.1.0'
 
-from net import *
+import imdb
+import wikipedia
+import google

diff --git a/ox/google.py b/ox/google.py
index cf08686..d8400a2 100644
--- a/ox/google.py
+++ b/ox/google.py
@@ -17,7 +17,6 @@ from oxutils import stripTags
 usage:
 import google
 google.find(query)
-
 for result in google.find(query): result

diff --git a/ox/imdb.py b/ox/imdb.py
index 8be0e6f..bcb77e1 100644
--- a/ox/imdb.py
+++ b/ox/imdb.py
@@ -12,19 +12,102 @@ import time
 from BeautifulSoup import BeautifulSoup
 import chardet
 import oxutils
-from oxutils import stripTags, htmldecode
+from oxutils import stripTags, htmldecode, findRegexp
 from oxutils.cache import getUrl, getUrlUnicode
-from oxutils.normalize import normalizeTitle
+from oxutils.normalize import normalizeTitle, normalizeImdbId
 
 import google
 
-def _get_data(url):
-    data = None
-    try:
-        data = getUrl(url)
-    except:
-        print "error reading data from", url
-    return data
+def getMovieId(title, director='', year=''):
+    if year:
+        title = "%s (%s)" % (title, year)
+    if director:
+        query = 'site:imdb.com %s "%s"' % (director, title)
+    else:
+        query = 'site:imdb.com "%s"' % title
+    for (name, url, desc) in google.find(query, 3):
+        if url.startswith('http://www.imdb.com/title/tt'):
+            return url[28:35]
+
+def getMovieData(imdbId):
+    return IMDb(imdbId).parse()
+
+# internal functions below
+def getUrlBase(imdbId):
+    return "http://www.imdb.com/title/tt%s" % imdbId
+
+def getRawMovieData(imdbId):
+    imdbId = normalizeImdbId(imdbId)
+    data = dict()
+    data['credits'] = parseCredits(imdbId)
+    page = getUrlUnicode(getUrlBase(imdbId))
+    data['poster'] = findRegexp(page, 'name="poster".*?<img .*?src="(.*?)"')
+    return data
+
+def getTitle(imdbId):
+    title = ''
+    data = getUrlUnicode(getUrlBase(imdbId))
+    html_title = findRegexp(data, '<title>(.*?)</title>')
+    html_title = html_title.replace('<br />', ' ').replace('&#160;', ' ')
+    title = htmldecode(html_title)
+    title = stripTags(title)
+    title = re.sub('\(\d\d\d\d\)', '', title)
+    title = re.sub('\(\d\d\d\d/I*\)', '', title)
+    for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
+        title = title.replace(t, '')
+    title = title.strip()
+    if title.find(u'\xa0') > -1:
+        title = title[:title.find(u'\xa0')]
+    if title.startswith('"') and title.endswith('"'):
+        title = title[1:-1]
+    return title
+
+def creditList(data, section=None):
+    if section == 'cast':
+        credits_ = re.compile('''<td class="nm">(.*?)</td>.*?<td class="char">(.*?)</td>''').findall(data)
+    else:
+        credits_ = re.compile('''<tr>.*?<td class="nm">(.*?)</td><td class="credit">(.*?)</td>''').findall(data)
+    credits = []
+    for c_ in credits_:
+        c = [c_[0].strip(), c_[1].strip()]
+        if section == 'writers':
+            c[1] = c[1].replace('<br>', '').strip().replace(')', '').replace('(', '')
+        if c[1].endswith(' and'): c[1] = c[1][:-4]
+        credits.append(c)
+    return credits
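For reference, creditList() maps one fullcredits section to a list of [name, credit] pairs, with writers' parenthetical qualifiers and a trailing ' and' stripped. A minimal sketch of that contract on a hypothetical row -- the markup below is illustrative, not real IMDb output, and names stay raw HTML at this layer (stripTags() is only applied later, in IMDb.parseCredits()):

    # hypothetical fullcredits 'writers' table row
    row = '<tr><td class="nm"><a href="/name/nm0000001/">Some Writer</a></td>' \
          '<td class="credit">(novel) and</td></tr>'
    print creditList(row, 'writers')
    # -> [['<a href="/name/nm0000001/">Some Writer</a>', 'novel']]
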
+def parseCredits(imdbId):
+    credits = dict()
+    url = "%s/fullcredits" % getUrlBase(imdbId)
+    data = getUrlUnicode(url)
+    groups = data.split('<h5>')
+    for g in groups:
+        section = re.compile('''name="(.*?)".*? href="/Glossary''').findall(g)
+        if section:
+            credits[section[0]] = creditList(g, section[0])
+    return credits
+
+'''the old code below'''
 
 def get_image(url):
     return getUrl(url)
@@ -42,62 +125,36 @@ def _castList(data, regexp):
         return names
     return []
 
-def _getTerm(data, regexp):
-    term = ''
-    try:
-        reg = re.compile(regexp, re.IGNORECASE)
-        m = reg.search(data)
-        if m:
-            term = stripTags(m.group(1)).strip()
-    except:
-        print "waring, parsing failed for", regexp
-    return term.encode('utf8')
-
-
 class IMDb:
-    def __init__(self, imdb):
-        self.imdb = imdb
-        self.pageSource = None
+    def __init__(self, imdbId):
+        self.imdb = imdbId
         self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
-        self.businessSource = None
         self.businessUrl = "%sbusiness" % self.pageUrl
-        self.connectionsSource = None
         self.connectionsUrl = "%smovieconnections" % self.pageUrl
-        self.creditsSource = None
         self.creditsUrl = "%sfullcredits" % self.pageUrl
-        self.episodesSource = None
         self.episodesUrl = "%sepisodes" % self.pageUrl
-        self.keywordSource = None
         self.keywordUrl = "%skeywords" % self.pageUrl
-        self.plotSource = None
         self.plotUrl = "%splotsummary" % self.pageUrl
-        self.releaseinfoSource = None
         self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
-        self.triviaSource = None
         self.triviaUrl = "%strivia" % self.pageUrl
-        self.locationSource = None
         self.locationUrl = "%slocations" % self.pageUrl
-        self.externalreviewsSource = None
         self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
-        self.trailerSource = None
         self.trailerUrl = "%strailers" % self.pageUrl
-
-    def getPage(self, forcereload = False):
-        if forcereload or not self.pageSource:
-            self.pageSource = getUrlUnicode(self.pageUrl)
-        return self.pageSource
+
+    def getPage(self):
+        return getUrlUnicode(self.pageUrl)
 
     def parse_raw_value(self, key, value):
         if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
             value = unicode(value, 'utf-8')
-        value = stripTags(value).strip()
+        value = stripTags(value).strip()
         if key == 'runtime':
-            parsed_value = _getTerm(value, '(.*?) min')
-            parsed_value = _getTerm(parsed_value, '([0-9]+)')
+            parsed_value = findRegexp(value, '(.*?) min')
+            parsed_value = findRegexp(parsed_value, '([0-9]+)')
             if not parsed_value:
-                parsed_value = _getTerm(value, '(.*?) sec')
-                parsed_value = _getTerm(parsed_value, '([0-9]+)')
+                parsed_value = findRegexp(value, '(.*?) sec')
+                parsed_value = findRegexp(parsed_value, '([0-9]+)')
                 if not parsed_value:
                     parsed_value = 0
                 else:
@@ -141,7 +198,7 @@ class IMDb:
             print value
             parsed_value = value
         return parsed_value
-
+
     def parseTitle(self):
         title = ''
         data = self.getPage()
@@ -153,8 +210,8 @@ class IMDb:
             html_title = str(html_title[0])
             html_title = html_title.replace('<br />', ' ').replace('&#160;', ' ')
             title = stripTags(html_title)
-            title = re.sub('\(\d\d\d\d\)', '', title)
-            title = re.sub('\(\d\d\d\d/I*\)', '', title)
+            title = re.sub('\(\d{4}\)', '', title)
+            title = re.sub('\(\d{4}/I*\)', '', title)
             for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
                 title = title.replace(t, '')
             if title.find(u'\xa0') > -1:
                 title = title[:title.find(u'\xa0')]
@@ -172,7 +229,7 @@ class IMDb:
             else:
                 title = normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:]
         return normalizeTitle(title)
-
+
     def parseYear(self):
         year = ''
         data = self.getPage()
@@ -183,25 +240,25 @@ class IMDb:
         if html_title:
             html_title = str(html_title[0])
             html_title = stripTags(html_title)
-            year = re.compile('\((\d\d\d\d)\)').findall(html_title)
+            year = re.compile('\((\d{4})\)').findall(html_title)
             if not year:
-                year = re.compile('\((\d\d\d\d)/').findall(html_title)
-            if year:
+                year = re.compile('\((\d{4})/').findall(html_title)
+            if year:
                 year = year[0]
             else:
                 year = ''
         return year
-
+
     def parse(self):
         data = self.getPage()
         IMDbDict ={}
         #Poster
-        IMDbDict['poster'] = _getTerm(data, 'name="poster".*?<img .*?src="(.*?)"')
+        IMDbDict['poster'] = findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')
         #Rating
         m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
         if m:
@@ -251,49 +308,32 @@ class IMDb:
             IMDbDict['stills'] = getMovieStills(self.imdb)
         #IMDbDict['trailer'] = self.parseTrailer()
         self.IMDbDict = IMDbDict
-
+
         if IMDbDict['episode_of']:
             episode_of =IMDb(IMDbDict['episode_of']).parse()
             for key in ('country', 'language'):
                 if not IMDbDict[key]:
                     IMDbDict[key] = episode_of[key]
         return self.IMDbDict
-
-    def getCredits(self, forcereload = False):
-        if forcereload or not self.creditsSource:
-            self.creditsSource = getUrlUnicode(self.creditsUrl)
-        return self.creditsSource
-
+
     def parseCredits(self):
-        data = self.getCredits()
+        raw_credits = parseCredits(self.imdb)
         credits = {}
-        credits['director'] = _castList(data, 'Directed by.*?<table.*?>(.*?)</table>')
-        credits['writer'] = _castList(data, 'Writing credits.*?<table.*?>(.*?)</table>')
-        credits['producer'] = _castList(data, 'Produced by.*?<table.*?>(.*?)</table>')
-        #credits['cast'] = _castList(data, 'Cast.*?(<table.*?</table>)')
-        credits['cast'] = []
-        soup = re.compile('Cast.*?(<table.*?</table>)').findall(data)
-        soup = BeautifulSoup(data)
-        cast = soup('table', {'class': 'cast'})
-        if cast:
-            cast = str(cast[0]).replace(u'\xa0', ' ')
-            names = re.compile('<td class="nm">(.*?)</td>.*?<td class="char">(.*?)</td>').findall(cast)
-            for name in names:
-                real_name = name[0]
-                role_name = name[1]
-                if role_name:
-                    role_name = role_name.split('(')[0].replace('/ ...','')
-                credits['cast'].append((stripTags(real_name), stripTags(role_name)))
+
+        def getNames(creditList):
+            return [stripTags(c[0]) for c in creditList]
+
+        credits['director'] = getNames(raw_credits['directors'])
+        credits['writer'] = getNames(raw_credits['writers'])
+        credits['producer'] = getNames(raw_credits['producers'])
+        credits['cast'] = [(stripTags(c[0]), stripTags(c[1])) for c in raw_credits['cast']]
+
         self.credits = credits
         return self.credits
-
-    def getPlot(self, forcereload = False):
-        if forcereload or not self.plotSource:
-            self.plotSource = getUrlUnicode(self.plotUrl)
-        return self.plotSource
 
     def parsePlot(self):
-        soup = BeautifulSoup(self.getPlot())
+        data = getUrlUnicode(self.plotUrl)
+        soup = BeautifulSoup(data)
         plot = soup('p', {'class':'plotpar'})
         if plot:
             plot = unicode(plot[0]).split('<i>')[0]
@@ -302,15 +342,11 @@ class IMDb:
             plot = stripTags(plot).strip()
         self.plot = plot
         return plot
-
-    def getEpisodes(self, forcereload = False):
-        if forcereload or not self.episodesSource:
-            self.episodesSource = getUrlUnicode(self.episodesUrl)
-        return self.episodesSource
-
+
     def parseEpisodes(self):
         episodes = {}
-        cdata = self.getEpisodes().replace('\r\n', ' ')
+        data = getUrlUnicode(self.episodesUrl)
+        cdata = data.replace('\r\n', ' ')
         regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4><b>(.*?)</b><br/>(.*?)<br/>'''
         reg = re.compile(regexp, re.IGNORECASE)
         m = reg.findall(cdata)
@@ -340,26 +376,18 @@ class IMDb:
         self.episodes = episodes
         return self.episodes
 
-    def getLocations(self, forcereload = False):
-        if forcereload or not self.locationSource:
-            self.keywordSource = getUrlUnicode(self.locationUrl)
-        return self.keywordSource
-
     def parseLocations(self):
-        soup = BeautifulSoup(self.getLocations())
+        data = getUrlUnicode(self.locationUrl)
+        soup = BeautifulSoup(data)
         locations = []
         for key in soup('a', {'href': re.compile('^/List')}):
             locations.append(htmldecode(key.string))
         self.locations = locations
         return self.locations
-
-    def getKeywords(self, forcereload = False):
-        if forcereload or not self.keywordSource:
-            self.keywordSource = getUrlUnicode(self.keywordUrl)
-        return self.keywordSource
 
     def parseKeywords(self):
-        soup = BeautifulSoup(self.getKeywords())
+        data = getUrlUnicode(self.keywordUrl)
+        soup = BeautifulSoup(data)
         keywords = []
         for key in soup('a', {'href': re.compile('^/keyword/')}):
             k = htmldecode(key.string)
@@ -368,28 +396,23 @@ class IMDb:
         self.keywords = keywords
         return self.keywords
 
-    def getTrivia(self, forcereload = False):
-        if forcereload or not self.triviaSource:
-            self.triviaSource = getUrlUnicode(self.triviaUrl)
-        return self.triviaSource
-
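The removed get*(forcereload=False) accessors memoized each page's source on the instance; dropping them assumes that oxutils.cache.getUrl/getUrlUnicode already cache by URL, so repeated parse calls stay cheap. A small sketch of that assumption (the URL is hypothetical):

    from oxutils.cache import getUrlUnicode

    url = 'http://www.imdb.com/title/tt0000001/keywords'
    a = getUrlUnicode(url)  # first call fetches and fills the oxutils cache
    b = getUrlUnicode(url)  # second call should be served from the cache
    assert a == b
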
     def parseTrivia(self):
+        data = getUrlUnicode(self.triviaUrl)
+        soup = BeautifulSoup(data)
+
         trivia = []
-        soup = BeautifulSoup(self.getTrivia())
         triviaList = []
         for i in soup('ul', {'class': "trivia"}):
             for t in i('li'):
                 t = str(t).replace('<br />', '').strip()
                 if t.startswith('<li>&#160;&#8226; ') and t.endswith('&#160;&#8226; </li>'):
-                    t = t[4:-5].strip()
+                    t = t[4:-5].strip()
                     trivia.append(t)
         self.trivia = trivia
         return self.trivia
-
-    def getConnections(self, forcereload = False):
-        if forcereload or not self.connectionsSource:
-            self.connectionsSource = getUrlUnicode(self.connectionsUrl)
-        return self.connectionsSource
+
+    def getConnections(self):
+        return getUrlUnicode(self.connectionsUrl)
 
     def parseConnections(self):
         connections = {}
@@ -404,10 +427,8 @@ class IMDb:
             connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})]
         return connections
 
-    def getReleaseinfo(self, forcereload = False):
-        if forcereload or not self.releaseinfoSource:
-            self.releaseinfoSource = getUrlUnicode(self.releaseinfoUrl)
-        return self.releaseinfoSource
+    def getReleaseinfo(self):
+        return getUrlUnicode(self.releaseinfoUrl)
 
     def parseReleaseinfo(self):
         soup = BeautifulSoup(self.getReleaseinfo())
@@ -424,12 +445,10 @@ class IMDb:
             except:
                 pass
         return None
-
-    def getBusiness(self, forcereload = False):
-        if forcereload or not self.businessSource:
-            self.businessSource = getUrlUnicode(self.businessUrl)
-        return self.businessSource
-
+
+    def getBusiness(self):
+        return getUrlUnicode(self.businessUrl)
+
     def parseBusiness(self):
         soup = BeautifulSoup(self.getBusiness())
         business = {'budget': 0, 'gross': 0, 'profit': 0}
@@ -449,12 +468,10 @@ class IMDb:
         if business['budget'] and business['gross']:
             business['profit'] = business['gross'] - business['budget']
         return business
-
-    def getExternalreviews(self, forcereload = False):
-        if forcereload or not self.externalreviewsSource:
-            self.externalreviewsSource = getUrlUnicode(self.externalreviewsUrl)
-        return self.externalreviewsSource
-
+
+    def getExternalreviews(self):
+        return getUrlUnicode(self.externalreviewsUrl)
+
     def parseExternalreviews(self):
         soup = BeautifulSoup(self.getExternalreviews())
         ol = soup('ol')
@@ -471,12 +488,10 @@ class IMDb:
                 pass
             return ret
         return {}
-
-    def getTrailer(self, forcereload = False):
-        if forcereload or not self.trailerSource:
-            self.trailerSource = getUrlUnicode(self.trailerUrl)
-        return self.trailerSource
-
+
+    def getTrailer(self):
+        return getUrlUnicode(self.trailerUrl)
+
     def parseTrailer(self):
         ret = {}
         soup = BeautifulSoup(self.getTrailer())
@@ -519,8 +534,8 @@ def guess(title, director=''):
         return None
     if return_url.startswith('http://www.imdb.com/title/tt'):
         return return_url[28:35]
-    if data:
-        imdb_id = _getTerm(data.replace('\n', ' '), 'Popular Results.*?<ol><li>1. .*?<a href="/title/tt(.......)')
+    if data:
+        imdb_id = findRegexp(data.replace('\n', ' '), 'Popular Results.*?<ol><li>1. .*?<a href="/title/tt(.......)')

-def getMovieStills(id):
-    data = getUrl("http://imdb.com/gallery/ss/%s" % id)
+def getMovieStills(imdbId):
+    data = getUrl("http://imdb.com/gallery/ss/%s" % imdbId)
     s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://i.imdb.com/Photos/Ss/.*?/th-(.*?)\.jpg"''').findall(data)
     stills = []
     for s in s_:
         if int(s[0]) > int(s[1]):
-            stills.append("http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (id, s[2]))
+            stills.append("http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (imdbId, s[2]))
     if not stills:
         s_ = re.compile('''