get more things out of IMDb class

2008-05-25 19:29:14 +02:00 · 2008-05-25 19:29:14 +02:00 · 454eefb7cb
commit 454eefb7cb
parent 55c5cdfa99
1 changed file with 133 additions and 86 deletions
--- a/ox/imdb.py
+++ b/ox/imdb.py
@@ -57,6 +57,8 @@ def getRawMovieData(imdbId):
  data['media']['images'] = getMovieImages(imdbId)
  data['media']['trailers'] = getMovieTrailers(imdbId)
  data['plotsummary'] = getMoviePlot(imdbId)
+  data['release dates'] = getMovieReleaseDates(imdbId)
+  data['release date'] = getMovieReleaseDate(imdbId)
  return data

 def getMovieInfo(imdbId):
@@ -294,7 +296,6 @@ def getMovieKeywords(imdbId):
    keywords.append(k)
  return keywords

-
 def getMovieExternalReviews(imdbId):
  url = "%s/externalreviews" % getUrlBase(imdbId)
  data = getUrlUnicode(url)
@@ -314,6 +315,126 @@ def getMovieExternalReviews(imdbId):
    return ret
  return {}

+def getMovieReleaseDate(imdbId):
+  releasedates = getMovieReleaseDates(imdbId)
+  first_release = ''
+  for r in releasedates:
+    if not first_release or r[1] < first_release:
+      first_release = r[1]
+  return first_release
+
+def getMovieReleaseDates(imdbId):
+  url = "%s/releaseinfo" % getUrlBase(imdbId)
+  data = getUrlUnicode(url)
+  releasedates = []
+  regexp = '''<tr><td>(.*?)</td>.*?<td align="right">(.*?)</td>.*?<td>(.*?)</td></tr>'''
+
+  def _parse_date(d):
+    try:
+      parsed_date = time.strptime(d, "%d %B %Y")
+      parsed_date = time.strftime('%Y-%m-%d', parsed_date)
+      return parsed_date
+    except:
+      return d
+
+  for r in re.compile(regexp, re.DOTALL).findall(data):
+    r_ = (stripTags(r[0]).strip(),
+          _parse_date(stripTags(r[1]).strip()),
+          decodeHtml(stripTags(r[2]).strip()))
+    releasedates.append(r_)
+  return releasedates
+  soup = BeautifulSoup(data)
+  info = soup('table',{'border': '0', 'cellpadding':'2'})
+  if info:
+    for row in info[0]('tr'):
+      d = row('td', {'align':'right'})
+      if d:
+        try:
+          possible_date = stripTags(unicode(d[0])).strip()
+          rdate = time.strptime(possible_date, "%d %B %Y")
+          rdate = time.strftime('%Y-%m-%d', rdate)
+          return rdate
+        except:
+          pass
+  return None
+
+def getMovieBusinessSum(imdbId):
+  business = getMovieBusiness(imdbId)
+  b_ = {'budget': 0, 'gross': 0, 'profit': 0}
+  if 'budget' in business:
+    b_['budget'] = sum([int(intValue(i.replace(',', ''))) for i in business['budget']])
+  if 'gross' in business:
+    b_['gross'] = sum([int(intValue(i.replace(',', ''))) for i in business['gross']])
+    if 'weekend gross' in business:
+      b_['gross'] += sum([int(intValue(i.replace(',', ''))) for i in business['weekend gross']])
+  if b_['budget'] and b_['gross']:
+    b_['profit'] = b_['gross'] - b_['budget']
+  return b_
+
+def getMovieFlimingDates(imdbId):
+  business = getMovieBusiness(imdbId)
+  if 'filming dates' in business and business['filming dates']:
+    return business['filming dates'][0]
+  return ''
+
+def getMovieBusiness(imdbId):
+  url = "%s/business" % getUrlBase(imdbId)
+  data = getUrlUnicode(url)
+  business = {}
+  for r in re.compile('''<h5>(.*?)</h5>(.*?)<br/>.<br/>''', re.DOTALL).findall(data):
+    key = stripTags(r[0]).strip().lower()
+    value = [decodeHtml(stripTags(b).strip()) for b in r[1].split('<br/>')]
+    business[key] = value
+  return business
+  soup = BeautifulSoup(data)
+  business = {'budget': 0, 'gross': 0, 'profit': 0}
+  content = soup('div', {'id': 'tn15content'})[0]
+  blocks = unicode(content).split('<h5>')[1:]
+  for c in blocks:
+    cs = BeautifulSoup(c)
+    line = c.split('</h5>')
+    if line:
+      title = line[0]
+      line = line[1]
+      if title in ['Budget', 'Gross']:
+        values = re.compile('\$(.*?) ').findall(line)
+        values = [int(value.replace(',','')) for value in values]
+        if values:
+          business[title.lower()] = max(values)
+  if business['budget'] and business['gross']:
+    business['profit'] = business['gross'] - business['budget']
+  return business
+
+def getMovieEpisodes(imdbId):
+  url = "%s/episodes" % getUrlBase(imdbId)
+  data = getUrlUnicode(url)
+  episodes = {}
+  regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
+  for r in re.compile(regexp, re.DOTALL).findall(data):
+    try:
+      episode = "S%02dE%02d" % (int(r[0]), int(r[1]))
+      episodes[episode] = {}
+      episodes[episode]['imdb'] = r[2]
+      episodes[episode]['title'] = r[3].strip()
+      if episodes[episode]['title'].startswith('Episode #%d'%int(r[0])):
+        episodes[episode]['title'] = u''
+      description = decodeHtml(r[5])
+      description = stripTags(description.split('Next US airings:')[0])
+      episodes[episode]['description'] = description.strip()
+      episodes[episode]['date'] = ''
+      try:
+        d = stripTags(r[4])
+        d = d.replace('Original Air Date: ', '')
+        d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
+        episodes[episode]['date'] = d
+      except:
+        pass
+    except:
+      import traceback
+      print traceback.print_exc()
+      pass
+  return episodes
+
 '''the old code below'''

 class IMDb:
@@ -321,11 +442,6 @@ class IMDb:
    self.imdb = imdbId
    self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb

-    self.businessUrl = "%sbusiness" % self.pageUrl
-    self.creditsUrl = "%sfullcredits" % self.pageUrl
-    self.episodesUrl = "%sepisodes" % self.pageUrl
-    self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
-
  def getPage(self):
    return getUrlUnicode(self.pageUrl)

@@ -393,13 +509,18 @@ class IMDb:
    title = normalizeTitle(title)
    if title.startswith('"') and title.find('"',1) > 0 and \
      title.find('"',1) == title.rfind('"'):
+      data = self.getPage()
      se = re.compile("Season (\d*), Episode (\d*)\)").findall(data)
      if se:
        se = se[0]
-        se = ' (S%02dE%02d)' % (int(se[0]), int(se[1]))
-        title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:]
+        se = ' (S%02dE%02d) ' % (int(se[0]), int(se[1]))
+        title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:].strip()
      else:
-        title = normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:]
+        part2 = title[title.rfind('"')+1:]
+        part2 = re.sub("[\d\?-]", "", part2).strip()
+        title = normalizeTitle(title[1:title.rfind('"')])
+        if part2:
+          title += ':' + part2
    return normalizeTitle(title)

  def parseYear(self):
@@ -462,7 +583,7 @@ class IMDb:
    #is episode
    IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')

-    IMDbDict['episodes'] = self.parseEpisodes()
+    IMDbDict['episodes'] = getMovieEpisodes(self.imdb)
    if IMDbDict['episodes']:
      IMDbDict['tvshow'] = True
    else:
@@ -474,8 +595,8 @@ class IMDb:
    IMDbDict['trivia'] = getMovieTrivia(self.imdb)
    IMDbDict['connections'] = getMovieConnections(self.imdb)
    IMDbDict['locations'] = getMovieLocations(self.imdb)
-    IMDbDict['release_date'] = self.parseReleaseinfo()
-    IMDbDict['business'] = self.parseBusiness()
+    IMDbDict['release_date'] = getMovieReleaseDate(self.imdb)
+    IMDbDict['business'] = getMovieBusinessSum(self.imdb)
    IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
    IMDbDict['stills'] = getMovieStills(self.imdb)
    #IMDbDict['trailer'] = getMovieTrailer(self.imdb)
@@ -503,80 +624,6 @@ class IMDb:
    self.credits = credits
    return self.credits

-  def parseEpisodes(self):
-    episodes = {}
-    data = getUrlUnicode(self.episodesUrl)
-    cdata = data.replace('\r\n', ' ')
-    regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
-    reg = re.compile(regexp, re.IGNORECASE)
-    m = reg.findall(cdata)
-    for match in m:
-      try:
-        episode = "S%02dE%02d" % (int(match[0]), int(match[1]))
-        episodes[episode] = {}
-        episodes[episode]['imdb'] = match[2]
-        episodes[episode]['title'] = match[3].strip()
-        if episodes[episode]['title'].startswith('Episode #%d'%int(match[0])):
-          episodes[episode]['title'] = u''
-        description = decodeHtml(match[5])
-        description = stripTags(description.split('Next US airings:')[0])
-        episodes[episode]['description'] = description
-        episodes[episode]['date'] = ''
-        try:
-          d = stripTags(match[4])
-          d = d.replace('Original Air Date: ', '')
-          d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
-          episodes[episode]['date'] = d
-        except:
-          pass
-      except:
-        import traceback
-        print traceback.print_exc()
-        pass
-    self.episodes = episodes
-    return self.episodes
-
-  def getReleaseinfo(self):
-    return getUrlUnicode(self.releaseinfoUrl)
-
-  def parseReleaseinfo(self):
-    soup = BeautifulSoup(self.getReleaseinfo())
-    info = soup('table',{'border': '0', 'cellpadding':'2'})
-    if info:
-      for row in info[0]('tr'):
-        d = row('td', {'align':'right'})
-        if d:
-          try:
-            possible_date = stripTags(unicode(d[0])).strip()
-            rdate = time.strptime(possible_date, "%d %B %Y")
-            rdate = time.strftime('%Y-%m-%d', rdate)
-            return rdate
-          except:
-            pass
-    return None
-
-  def getBusiness(self):
-    return getUrlUnicode(self.businessUrl)
-
-  def parseBusiness(self):
-    soup = BeautifulSoup(self.getBusiness())
-    business = {'budget': 0, 'gross': 0, 'profit': 0}
-    content = soup('div', {'id': 'tn15content'})[0]
-    blocks = unicode(content).split('<h5>')[1:]
-    for c in blocks:
-      cs = BeautifulSoup(c)
-      line = c.split('</h5>')
-      if line:
-        title = line[0]
-        line = line[1]
-        if title in ['Budget', 'Gross']:
-          values = re.compile('\$(.*?) ').findall(line)
-          values = [int(value.replace(',','')) for value in values]
-          if values:
-            business[title.lower()] = max(values)
-    if business['budget'] and business['gross']:
-      business['profit'] = business['gross'] - business['budget']
-    return business

 def guess(title, director=''):
  #FIXME: proper file -> title