move plot parsing, remove use of BeautifulSoup from getMovieInfo

This commit is contained in:
j 2008-05-23 13:08:40 +02:00
parent 9dcbee25c7
commit 24d7378432

View file

@ -56,11 +56,11 @@ def getRawMovieData(imdbId):
data['media'] = {} data['media'] = {}
data['media']['images'] = getMovieImages(imdbId) data['media']['images'] = getMovieImages(imdbId)
data['media']['trailers'] = getMovieTrailers(imdbId) data['media']['trailers'] = getMovieTrailers(imdbId)
data['plotsummary'] = getMoviePlot(imdbId)
return data return data
def getMovieInfo(imdbId): def getMovieInfo(imdbId):
data = getUrlUnicode(getUrlBase(imdbId)) data = getUrlUnicode(getUrlBase(imdbId))
soup = BeautifulSoup(data)
info = dict() info = dict()
info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"') info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"')
if info['poster'] and '_V' in info['poster']: if info['poster'] and '_V' in info['poster']:
@ -92,11 +92,11 @@ def getMovieInfo(imdbId):
#get Title #get Title
title = '' title = ''
year = '' year = ''
html_title = soup('div', {'id': 'tn15title'}) html_title = findRe(data, '<div id="tn15title">(.*?)</div>')
print html_title
if not html_title: if not html_title:
html_title = soup('title') html_title = findRe(data, '<title>(.*?)</title>')
if html_title: if html_title:
html_title = unicode(html_title[0])
html_title = html_title.replace('<br />', ' ').replace(' ', ' ') html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
title = decodeHtml(html_title) title = decodeHtml(html_title)
title = stripTags(title) title = stripTags(title)
@ -192,6 +192,12 @@ def getMovieQuotes(imdbId):
quotes = [(q[0].strip(),q[1].strip()) for q in quotes] quotes = [(q[0].strip(),q[1].strip()) for q in quotes]
return quotes return quotes
def getMoviePlot(imdbId):
    """Return the first plot summary from a title's plotsummary page as text.

    imdbId is handed to getUrlBase() to build the page URL.  The text of
    the first <p class="plotpar"> paragraph, up to its first <i> tag, is
    returned with markup removed by stripTags() and surrounding whitespace
    trimmed, so callers get plain text rather than an HTML fragment.

    Returns an empty unicode string when no plot paragraph is found.
    """
    url = "%s/plotsummary" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
    plot = findRe(data, '<p class="plotpar">(.*?)<i>')
    if not plot:
        return u''
    # Stay inside the first paragraph: if it has no <i> of its own, the
    # non-greedy match can still run on into a later paragraph
    # (assuming findRe lets '.' match newlines -- TODO confirm).
    plot = plot.split('</p>')[0]
    return stripTags(plot).strip()
def getMovieTechnical(imdbId): def getMovieTechnical(imdbId):
url = "%s/technical" % getUrlBase(imdbId) url = "%s/technical" % getUrlBase(imdbId)
data = getUrlUnicode(url) data = getUrlUnicode(url)
@ -316,7 +322,6 @@ class IMDb:
self.businessUrl = "%sbusiness" % self.pageUrl self.businessUrl = "%sbusiness" % self.pageUrl
self.creditsUrl = "%sfullcredits" % self.pageUrl self.creditsUrl = "%sfullcredits" % self.pageUrl
self.episodesUrl = "%sepisodes" % self.pageUrl self.episodesUrl = "%sepisodes" % self.pageUrl
self.plotUrl = "%splotsummary" % self.pageUrl
self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
def getPage(self): def getPage(self):
@ -461,7 +466,7 @@ class IMDb:
else: else:
IMDbDict['tvshow'] = False IMDbDict['tvshow'] = False
IMDbDict['credits'] = self.getCredits() IMDbDict['credits'] = self.getCredits()
IMDbDict['plot'] = self.parsePlot() IMDbDict['plot'] = getMoviePlot(self.imdb)
IMDbDict['keywords'] = getMovieKeywords(self.imdb) IMDbDict['keywords'] = getMovieKeywords(self.imdb)
IMDbDict['trivia'] = getMovieTrivia(self.imdb) IMDbDict['trivia'] = getMovieTrivia(self.imdb)
@ -496,18 +501,6 @@ class IMDb:
self.credits = credits self.credits = credits
return self.credits return self.credits
def parsePlot(self):
    """Fetch self.plotUrl and return the first plot paragraph as text.

    Looks up the <p class="plotpar"> elements with BeautifulSoup, keeps
    the part of the first one that precedes its first '<i>', strips the
    markup and surrounding whitespace, and stores the result on self.plot
    before returning it.  Yields an empty unicode string when the page has
    no such paragraph.
    """
    page = getUrlUnicode(self.plotUrl)
    paragraphs = BeautifulSoup(page)('p', {'class':'plotpar'})
    # Everything from the first <i> onwards is dropped
    # (presumably the submitter credit -- verify against the page markup).
    raw = unicode(paragraphs[0]).split('<i>')[0] if paragraphs else u''
    self.plot = stripTags(raw).strip()
    return self.plot
def parseEpisodes(self): def parseEpisodes(self):
episodes = {} episodes = {}
data = getUrlUnicode(self.episodesUrl) data = getUrlUnicode(self.episodesUrl)