From 8b58b4824bc4fe9002bcd586a14e3333b9bd4af3 Mon Sep 17 00:00:00 2001 From: j Date: Wed, 30 Apr 2008 00:15:28 +0200 Subject: [PATCH] more raw values from imdb --- ox/imdb.py | 156 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 105 insertions(+), 51 deletions(-) diff --git a/ox/imdb.py b/ox/imdb.py index 8fc9ade..d957e4d 100644 --- a/ox/imdb.py +++ b/ox/imdb.py @@ -43,32 +43,39 @@ def getRawMovieData(imdbId): data['credits'] = getCredits(imdbId) data['poster'] = getPoster(imdbId) data['trailers'] = getMovieTrailers(imdbId) + data['companyCredits'] = getMovieCompanyCredits(imdbId) def parseBase(imdbId): data = getUrl(getUrlBase(imdbId)) soup = BeautifulSoup(data) info = dict() - for i in soup('div', {'class':'info'}): + info['poster'] = findRegexp(data, 'name="poster".*?(.*?):(.*?)
-1: - title = title[:title.find(u'\xa0')] + title = title[:title.find(u'\xa0')].strip() if title.startswith('"') and title.endswith('"'): title = title[1:-1] - return title + info['title'] = title + return info + +def getPoster(imdbId): + info = parseBase(imdbId) + return info['poster'] + +def getTitle(imdbId): + info = parseBase(imdbId) + return info['title'] def creditList(data, section=None): if section == 'cast': @@ -130,21 +146,75 @@ def getMovieTrailers(imdbId): trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl}) return trailers -def getMovieStills(imdbId): - url = "http://www.imdb.com/gallery/ss/%s" % imdbId +def getMovieQuotes(imdbId): + url = "%s/quotes" % getUrlBase(imdbId) data = getUrlUnicode(url) - s_ = re.compile(''' int(s[1]): - stills.append("http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (imdbId, s[2])) - if not stills: - s_ = re.compile(''' int(s[1]): - stills.append("http://%sf.jpg" % s[2]) - return stills + quotes = re.compile('(.*?):(.*?)
', re.DOTALL).findall(data) + quotes = [(q[0].strip(),q[1].strip()) for q in quotes] + return quotes + +def getMovieTechnical(imdbId): + url = "%s/technical" % getUrlBase(imdbId) + data = getUrlUnicode(url) + results = {} + for t in re.compile('
(.*?)
(.*?)
', re.DOTALL).findall(data): + results[t[0].strip()] = t[1].strip() + return results + +def getMovieCompanyCredits(imdbId): + url = "%s/companycredits" % getUrlBase(imdbId) + data = getUrlUnicode(url) + results = {} + for field, c in re.compile('

(.*?)

').findall(data): + results[field.strip()] = [] + for company in re.compile('
  • (.*?)
  • ').findall(c): + results[field.strip()].append(company) + return results + +def getMovieLocations(imdbId): + url = "%s/locations" % getUrlBase(imdbId) + data = getUrlUnicode(url) + soup = BeautifulSoup(data) + locations = [] + for key in soup('a', {'href': re.compile('^/List')}): + locations.append(htmldecode(key.string)) + return locations + +def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')): + photos = {} + for key in keys: + url = "%s/mediaindex?refine=%s" % (getUrlBase(imdbId), key) + data = getUrlUnicode(url) + photos[key] = {} + for s in re.compile('''(.*?)', '').strip() + if t.startswith('
  • ') and t.endswith('
  • '): + t = t[4:-5].strip() + trivia.append(t) + return trivia '''the old code below''' @@ -160,7 +230,6 @@ class IMDb: self.keywordUrl = "%skeywords" % self.pageUrl self.plotUrl = "%splotsummary" % self.pageUrl self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl - self.triviaUrl = "%strivia" % self.pageUrl self.locationUrl = "%slocations" % self.pageUrl self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl @@ -306,7 +375,7 @@ class IMDb: IMDbDict['plot'] = self.parsePlot() IMDbDict['keywords'] = self.parseKeywords() - IMDbDict['trivia'] = self.parseTrivia() + IMDbDict['trivia'] = getMovieTrivia(self.imdb) IMDbDict['connections'] = self.parseConnections() IMDbDict['locations'] = self.parseLocations() IMDbDict['release_date'] = self.parseReleaseinfo() @@ -403,21 +472,6 @@ class IMDb: self.keywords = keywords return self.keywords - def parseTrivia(self): - data = getUrlUnicode(self.triviaUrl) - soup = BeautifulSoup(data) - - trivia = [] - triviaList = [] - for i in soup('ul', {'class': "trivia"}): - for t in i('li'): - t = str(t).replace('
    ', '').strip() - if t.startswith('
  • ') and t.endswith('
  • '): - t = t[4:-5].strip() - trivia.append(t) - self.trivia = trivia - return self.trivia - def getConnections(self): return getUrlUnicode(self.connectionsUrl)