move plot parsing, remove use of BeautifulSoup from getMovieInfo
This commit is contained in:
parent
9dcbee25c7
commit
24d7378432
1 changed file with 11 additions and 18 deletions
29
ox/imdb.py
29
ox/imdb.py
|
@ -56,11 +56,11 @@ def getRawMovieData(imdbId):
|
|||
data['media'] = {}
|
||||
data['media']['images'] = getMovieImages(imdbId)
|
||||
data['media']['trailers'] = getMovieTrailers(imdbId)
|
||||
data['plotsummary'] = getMoviePlot(imdbId)
|
||||
return data
|
||||
|
||||
def getMovieInfo(imdbId):
|
||||
data = getUrlUnicode(getUrlBase(imdbId))
|
||||
soup = BeautifulSoup(data)
|
||||
info = dict()
|
||||
info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"')
|
||||
if info['poster'] and '_V' in info['poster']:
|
||||
|
@ -92,11 +92,11 @@ def getMovieInfo(imdbId):
|
|||
#get Title
|
||||
title = ''
|
||||
year = ''
|
||||
html_title = soup('div', {'id': 'tn15title'})
|
||||
html_title = findRe(data, '<div id="tn15title">(.*?)</div>')
|
||||
print html_title
|
||||
if not html_title:
|
||||
html_title = soup('title')
|
||||
html_title = findRe(data, '<title>(.*?)</title>')
|
||||
if html_title:
|
||||
html_title = unicode(html_title[0])
|
||||
html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
|
||||
title = decodeHtml(html_title)
|
||||
title = stripTags(title)
|
||||
|
@ -192,6 +192,12 @@ def getMovieQuotes(imdbId):
|
|||
quotes = [(q[0].strip(),q[1].strip()) for q in quotes]
|
||||
return quotes
|
||||
|
||||
def getMoviePlot(imdbId):
|
||||
url = "%s/plotsummary" % getUrlBase(imdbId)
|
||||
data = getUrlUnicode(url)
|
||||
plot = findRe(data, '<p class="plotpar">(.*?)<i>')
|
||||
return plot
|
||||
|
||||
def getMovieTechnical(imdbId):
|
||||
url = "%s/technical" % getUrlBase(imdbId)
|
||||
data = getUrlUnicode(url)
|
||||
|
@ -316,7 +322,6 @@ class IMDb:
|
|||
self.businessUrl = "%sbusiness" % self.pageUrl
|
||||
self.creditsUrl = "%sfullcredits" % self.pageUrl
|
||||
self.episodesUrl = "%sepisodes" % self.pageUrl
|
||||
self.plotUrl = "%splotsummary" % self.pageUrl
|
||||
self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
|
||||
|
||||
def getPage(self):
|
||||
|
@ -461,7 +466,7 @@ class IMDb:
|
|||
else:
|
||||
IMDbDict['tvshow'] = False
|
||||
IMDbDict['credits'] = self.getCredits()
|
||||
IMDbDict['plot'] = self.parsePlot()
|
||||
IMDbDict['plot'] = getMoviePlot(self.imdb)
|
||||
IMDbDict['keywords'] = getMovieKeywords(self.imdb)
|
||||
|
||||
IMDbDict['trivia'] = getMovieTrivia(self.imdb)
|
||||
|
@ -496,18 +501,6 @@ class IMDb:
|
|||
self.credits = credits
|
||||
return self.credits
|
||||
|
||||
def parsePlot(self):
|
||||
data = getUrlUnicode(self.plotUrl)
|
||||
soup = BeautifulSoup(data)
|
||||
plot = soup('p', {'class':'plotpar'})
|
||||
if plot:
|
||||
plot = unicode(plot[0]).split('<i>')[0]
|
||||
else:
|
||||
plot = u''
|
||||
plot = stripTags(plot).strip()
|
||||
self.plot = plot
|
||||
return plot
|
||||
|
||||
def parseEpisodes(self):
|
||||
episodes = {}
|
||||
data = getUrlUnicode(self.episodesUrl)
|
||||
|
|
Loading…
Reference in a new issue