move plot parsing, remove use of BeautifulSoup from getMovieInfo

2008-05-23 13:08:40 +02:00 · 2008-05-23 13:08:40 +02:00 · 24d7378432
commit 24d7378432
parent 9dcbee25c7
1 changed file with 11 additions and 18 deletions
--- a/ox/imdb.py
+++ b/ox/imdb.py
@@ -56,11 +56,11 @@ def getRawMovieData(imdbId):
  data['media'] = {}
  data['media']['images'] = getMovieImages(imdbId)
  data['media']['trailers'] = getMovieTrailers(imdbId)
+  data['plotsummary'] = getMoviePlot(imdbId)
  return data

 def getMovieInfo(imdbId):
  data = getUrlUnicode(getUrlBase(imdbId))
-  soup = BeautifulSoup(data)
  info = dict()
  info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"')
  if info['poster'] and '_V' in info['poster']:
@@ -92,11 +92,11 @@ def getMovieInfo(imdbId):
  #get Title
  title = ''
  year = ''
-  html_title = soup('div', {'id': 'tn15title'})
+  html_title = findRe(data, '<div id="tn15title">(.*?)</div>')
+  print html_title
  if not html_title:
-    html_title = soup('title')
+    html_title = findRe(data, '<title>(.*?)</title>')
  if html_title:
-    html_title = unicode(html_title[0])
    html_title = html_title.replace('<br />', ' ').replace('  ', ' ')
    title = decodeHtml(html_title)
    title = stripTags(title)
@@ -192,6 +192,12 @@ def getMovieQuotes(imdbId):
  quotes = [(q[0].strip(),q[1].strip())  for q in quotes]
  return quotes

+def getMoviePlot(imdbId):
+  url = "%s/plotsummary" % getUrlBase(imdbId)
+  data = getUrlUnicode(url)
+  plot = findRe(data, '<p class="plotpar">(.*?)<i>')
+  return plot
+
 def getMovieTechnical(imdbId):
  url = "%s/technical" % getUrlBase(imdbId)
  data = getUrlUnicode(url)
@@ -316,7 +322,6 @@ class IMDb:
    self.businessUrl = "%sbusiness" % self.pageUrl
    self.creditsUrl = "%sfullcredits" % self.pageUrl
    self.episodesUrl = "%sepisodes" % self.pageUrl
-    self.plotUrl = "%splotsummary" % self.pageUrl
    self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl

  def getPage(self):
@@ -461,7 +466,7 @@ class IMDb:
    else:
      IMDbDict['tvshow'] = False
    IMDbDict['credits'] = self.getCredits()
-    IMDbDict['plot'] = self.parsePlot()
+    IMDbDict['plot'] = getMoviePlot(self.imdb)
    IMDbDict['keywords'] = getMovieKeywords(self.imdb)

    IMDbDict['trivia'] = getMovieTrivia(self.imdb)
@@ -496,18 +501,6 @@ class IMDb:
    self.credits = credits
    return self.credits

-  def parsePlot(self):
-    data = getUrlUnicode(self.plotUrl)
-    soup = BeautifulSoup(data)
-    plot = soup('p', {'class':'plotpar'})
-    if plot:
-      plot = unicode(plot[0]).split('<i>')[0]
-    else:
-      plot = u''
-    plot = stripTags(plot).strip()
-    self.plot = plot
-    return plot
-
  def parseEpisodes(self):
    episodes = {}
    data = getUrlUnicode(self.episodesUrl)