move plot parsing, remove use of BeautifulSoup from getMovieInfo
This commit is contained in:
parent
9dcbee25c7
commit
24d7378432
1 changed files with 11 additions and 18 deletions
29
ox/imdb.py
29
ox/imdb.py
|
@ -56,11 +56,11 @@ def getRawMovieData(imdbId):
|
||||||
data['media'] = {}
|
data['media'] = {}
|
||||||
data['media']['images'] = getMovieImages(imdbId)
|
data['media']['images'] = getMovieImages(imdbId)
|
||||||
data['media']['trailers'] = getMovieTrailers(imdbId)
|
data['media']['trailers'] = getMovieTrailers(imdbId)
|
||||||
|
data['plotsummary'] = getMoviePlot(imdbId)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def getMovieInfo(imdbId):
|
def getMovieInfo(imdbId):
|
||||||
data = getUrlUnicode(getUrlBase(imdbId))
|
data = getUrlUnicode(getUrlBase(imdbId))
|
||||||
soup = BeautifulSoup(data)
|
|
||||||
info = dict()
|
info = dict()
|
||||||
info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"')
|
info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"')
|
||||||
if info['poster'] and '_V' in info['poster']:
|
if info['poster'] and '_V' in info['poster']:
|
||||||
|
@ -92,11 +92,11 @@ def getMovieInfo(imdbId):
|
||||||
#get Title
|
#get Title
|
||||||
title = ''
|
title = ''
|
||||||
year = ''
|
year = ''
|
||||||
html_title = soup('div', {'id': 'tn15title'})
|
html_title = findRe(data, '<div id="tn15title">(.*?)</div>')
|
||||||
|
print html_title
|
||||||
if not html_title:
|
if not html_title:
|
||||||
html_title = soup('title')
|
html_title = findRe(data, '<title>(.*?)</title>')
|
||||||
if html_title:
|
if html_title:
|
||||||
html_title = unicode(html_title[0])
|
|
||||||
html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
|
html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
|
||||||
title = decodeHtml(html_title)
|
title = decodeHtml(html_title)
|
||||||
title = stripTags(title)
|
title = stripTags(title)
|
||||||
|
@ -192,6 +192,12 @@ def getMovieQuotes(imdbId):
|
||||||
quotes = [(q[0].strip(),q[1].strip()) for q in quotes]
|
quotes = [(q[0].strip(),q[1].strip()) for q in quotes]
|
||||||
return quotes
|
return quotes
|
||||||
|
|
||||||
|
def getMoviePlot(imdbId):
    """Return the plot summary text for the given IMDb id.

    Fetches the ``plotsummary`` page below the title's base URL and extracts
    the content of a ``<p class="plotpar">`` paragraph up to the following
    ``<i>`` tag (the regex is non-greedy, so matching stops at the nearest
    ``<i>``).

    The extracted fragment is raw HTML, so any inline markup is removed with
    ``stripTags`` and surrounding whitespace is trimmed before returning.

    When nothing matches, the falsy value produced by ``findRe`` is returned
    unchanged so callers see the same "no plot" result as before.
    """
    url = "%s/plotsummary" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
    plot = findRe(data, '<p class="plotpar">(.*?)<i>')
    if not plot:
        # Nothing to clean up; hand back findRe's own empty result.
        return plot
    # The paragraph may contain links or other tags and trailing newlines.
    return stripTags(plot).strip()
|
||||||
|
|
||||||
def getMovieTechnical(imdbId):
|
def getMovieTechnical(imdbId):
|
||||||
url = "%s/technical" % getUrlBase(imdbId)
|
url = "%s/technical" % getUrlBase(imdbId)
|
||||||
data = getUrlUnicode(url)
|
data = getUrlUnicode(url)
|
||||||
|
@ -316,7 +322,6 @@ class IMDb:
|
||||||
self.businessUrl = "%sbusiness" % self.pageUrl
|
self.businessUrl = "%sbusiness" % self.pageUrl
|
||||||
self.creditsUrl = "%sfullcredits" % self.pageUrl
|
self.creditsUrl = "%sfullcredits" % self.pageUrl
|
||||||
self.episodesUrl = "%sepisodes" % self.pageUrl
|
self.episodesUrl = "%sepisodes" % self.pageUrl
|
||||||
self.plotUrl = "%splotsummary" % self.pageUrl
|
|
||||||
self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
|
self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
|
||||||
|
|
||||||
def getPage(self):
|
def getPage(self):
|
||||||
|
@ -461,7 +466,7 @@ class IMDb:
|
||||||
else:
|
else:
|
||||||
IMDbDict['tvshow'] = False
|
IMDbDict['tvshow'] = False
|
||||||
IMDbDict['credits'] = self.getCredits()
|
IMDbDict['credits'] = self.getCredits()
|
||||||
IMDbDict['plot'] = self.parsePlot()
|
IMDbDict['plot'] = getMoviePlot(self.imdb)
|
||||||
IMDbDict['keywords'] = getMovieKeywords(self.imdb)
|
IMDbDict['keywords'] = getMovieKeywords(self.imdb)
|
||||||
|
|
||||||
IMDbDict['trivia'] = getMovieTrivia(self.imdb)
|
IMDbDict['trivia'] = getMovieTrivia(self.imdb)
|
||||||
|
@ -496,18 +501,6 @@ class IMDb:
|
||||||
self.credits = credits
|
self.credits = credits
|
||||||
return self.credits
|
return self.credits
|
||||||
|
|
||||||
def parsePlot(self):
|
|
||||||
data = getUrlUnicode(self.plotUrl)
|
|
||||||
soup = BeautifulSoup(data)
|
|
||||||
plot = soup('p', {'class':'plotpar'})
|
|
||||||
if plot:
|
|
||||||
plot = unicode(plot[0]).split('<i>')[0]
|
|
||||||
else:
|
|
||||||
plot = u''
|
|
||||||
plot = stripTags(plot).strip()
|
|
||||||
self.plot = plot
|
|
||||||
return plot
|
|
||||||
|
|
||||||
def parseEpisodes(self):
|
def parseEpisodes(self):
|
||||||
episodes = {}
|
episodes = {}
|
||||||
data = getUrlUnicode(self.episodesUrl)
|
data = getUrlUnicode(self.episodesUrl)
|
||||||
|
|
Loading…
Reference in a new issue