more raw values from imdb

2008-04-30 00:15:28 +02:00 · 2008-04-30 00:15:28 +02:00 · 8b58b4824b
commit 8b58b4824b
parent abf263af13
1 changed files with 105 additions and 51 deletions
--- a/ox/imdb.py
+++ b/ox/imdb.py
@ -43,32 +43,39 @@ def getRawMovieData(imdbId):
  data['credits'] = getCredits(imdbId)
  data['poster'] = getPoster(imdbId)
  data['trailers'] = getMovieTrailers(imdbId)
  data['companyCredits'] = getMovieCompanyCredits(imdbId)
 def parseBase(imdbId):
  data = getUrl(getUrlBase(imdbId))
  soup = BeautifulSoup(data)
  info = dict()
-  for i in soup('div', {'class':'info'}):
+  info['poster'] = findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')
-    title = i('h5')
+  for i in re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL).findall(data):
-    if title:
+    title = stripTags(i[0]).strip().lower()
-      title=title[0]
+    txt= stripTags(i[1]).strip()
-      txt = title.findNext()
+    def cleanUp(k):
-      title = stripTags(unicode(title))
+      k = htmldecode(k).replace(u'\xa0', ' ').strip()
-      if title.endswith(':'):
+      if k.endswith('more'): k=k[:-len('more')].strip()
-        title = title[:-1]
+      return k
-      info[title] = htmldecode(stripTags(unicode(txt)))
+    txt = cleanUp(txt)
-  return info
+    if title not in ('plot', 'trivia', 'filming locations', 'mpaa'):
-  return soup
+      if '|' in txt:
        txt = [cleanUp(k) for k in txt.split('|')]
      elif ', ' in txt:
        txt = [cleanUp(k) for k in txt.split(', ')]
    if not title.startswith('moviemeter'):
      info[title] = txt
  for key in ('user comments', 'writers (wga)'):
   if key in info:
    del info[key]
  if 'release date' in info:
    info['release date'] = info['release date'].split('\n')[0]
  if 'plot' in info:
    info['plot'] = info['plot'].split('| add synopsis')[0].strip()
-def getPoster(imdbId):
+  #get Title
  data = getUrl(getUrlBase(imdbId))
  return findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')
 def getTitle(imdbId):
  title = ''
  data = getUrl(getUrlBase(imdbId))
  soup = BeautifulSoup(data)
  html_title = soup('div', {'id': 'tn15title'})
  if not html_title:
    html_title = soup('title')
@ -83,10 +90,19 @@ def getTitle(imdbId):
      title = title.replace(t, '')
  title = title.strip()
  if title.find(u'\xa0') > -1:
-    title = title[:title.find(u'\xa0')]
+    title = title[:title.find(u'\xa0')].strip()
  if title.startswith('"') and title.endswith('"'):
    title = title[1:-1]
-  return title
+  info['title'] = title
  return info
 def getPoster(imdbId):
  """Return the 'poster' entry of the dict produced by parseBase(imdbId)."""
  return parseBase(imdbId)['poster']
 def getTitle(imdbId):
  """Return the 'title' entry of the dict produced by parseBase(imdbId)."""
  return parseBase(imdbId)['title']
 def creditList(data, section=None):
  if section == 'cast':
@ -130,21 +146,75 @@ def getMovieTrailers(imdbId):
      trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
  return trailers
-def getMovieStills(imdbId):
+def getMovieQuotes(imdbId):
-  url = "http://www.imdb.com/gallery/ss/%s" % imdbId
+  url = "%s/quotes" % getUrlBase(imdbId)
  data = getUrlUnicode(url)
-  s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://i.imdb.com/Photos/Ss/%s/th-(.*?).jpg"''' % imdbId).findall(data)
+  quotes = re.compile('<b>(.*?)</b>:(.*?)<br>', re.DOTALL).findall(data)
-  stills = []
+  quotes = [(q[0].strip(),q[1].strip())  for q in quotes]
-  for s in s_:
+  return quotes
-    if int(s[0]) > int(s[1]):
+
-      stills.append("http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (imdbId, s[2]))
+def getMovieTechnical(imdbId):
-  if not stills:
+  url = "%s/technical" % getUrlBase(imdbId)
-    s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://(.*?)p.jpg"''').findall(data)
+  data = getUrlUnicode(url)
-    stills = []
+  results = {}
-    for s in s_:
+  for t in re.compile('<h5>(.*?)</h5>(.*?)<br/>', re.DOTALL).findall(data):
-      if int(s[0]) > int(s[1]):
+    results[t[0].strip()] = t[1].strip()
-        stills.append("http://%sf.jpg" % s[2])
+  return results
-  return stills
+
 def getMovieCompanyCredits(imdbId):
  """Collect the company credits listed on the title's /companycredits page.

  Returns a dict that maps each stripped <h2> heading to the list of raw
  <li> contents (markup included) of the <ul> directly following it.
  """
  page = getUrlUnicode("%s/companycredits" % getUrlBase(imdbId))
  sectionPattern = re.compile('<h2>(.*?)</h2><ul>(.*?)</ul>')
  itemPattern = re.compile('<li>(.*?)</li>')
  byHeading = {}
  for heading, listHtml in sectionPattern.findall(page):
    # a heading that occurs twice keeps only the items of its last section
    byHeading[heading.strip()] = itemPattern.findall(listHtml)
  return byHeading
 def getMovieLocations(imdbId):
  """Return the link texts found on the title's /locations page.

  Every <a> whose href starts with '/List' contributes its htmldecode()d
  string, in document order.
  """
  page = getUrlUnicode("%s/locations" % getUrlBase(imdbId))
  listLinks = BeautifulSoup(page)('a', {'href': re.compile('^/List')})
  return [htmldecode(link.string) for link in listLinks]
 def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
  """Collect image URLs from the title's mediaindex pages.

  For each entry of keys the page "<base>/mediaindex?refine=<key>" is fetched
  and scanned for <img> tags hosted on ia.media-imdb.com.

  Returns a dict {key: {imageUrl: altText}}. imageUrl is the matched src with
  everything from the first '._V' onward replaced by '.jpg'; a src without
  a '._V' part is kept unchanged instead of getting a second '.jpg' appended.
  For 'still_frame', images whose src contains '_CR0' are skipped
  (presumably cropped variants -- TODO confirm).
  """
  # Literal dots are escaped so '.' cannot match arbitrary characters in the
  # host name or the extension; compiled once, outside the loop.
  imgPattern = re.compile(r'''<img alt="(.*?)".*?src="(http://ia\.media-imdb\.com/.*?\.jpg)''')
  photos = {}
  for key in keys:
    url = "%s/mediaindex?refine=%s" % (getUrlBase(imdbId), key)
    data = getUrlUnicode(url)
    photos[key] = {}
    for title, src in imgPattern.findall(data):
      if key == 'still_frame' and "_CR0" in src:
        continue
      base = src.split('._V')[0]
      if base.endswith('.jpg'):
        # no '._V' suffix was present: src already is the plain image url
        img = base
      else:
        img = "%s.jpg" % base
      photos[key][img] = title
  return photos
 def getMovieStills(imdbId):
  """Return the 'still_frame' mapping that getMovieImages builds for imdbId."""
  images = getMovieImages(imdbId, ['still_frame'])
  return images['still_frame']
 def getMoviePosters(imdbId):
  """Return the 'poster' mapping that getMovieImages builds for imdbId."""
  images = getMovieImages(imdbId, ['poster'])
  return images['poster']
 def getMovieTrivia(imdbId):
  """Return the trivia entries from the title's /trivia page as a list.

  Each <li> inside a <ul class="trivia"> is turned into a string with
  '<br />' removed; an enclosing '<li>'...'</li>' pair is stripped when both
  ends are present. Other inner markup is left in place.
  """
  page = getUrlUnicode("%s/trivia" % getUrlBase(imdbId))
  items = []
  for triviaBlock in BeautifulSoup(page)('ul', {'class': "trivia"}):
    for entry in triviaBlock('li'):
      text = str(entry).replace('<br />', '').strip()
      if text.startswith('<li>') and text.endswith('</li>'):
        # 4 == len('<li>'), 5 == len('</li>')
        text = text[4:-5].strip()
      items.append(text)
  return items
 '''the old code below'''
@ -160,7 +230,6 @@ class IMDb:
    self.keywordUrl = "%skeywords" % self.pageUrl
    self.plotUrl = "%splotsummary" % self.pageUrl
    self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
    self.triviaUrl = "%strivia" % self.pageUrl
    self.locationUrl = "%slocations" % self.pageUrl
    self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
@ -306,7 +375,7 @@ class IMDb:
    IMDbDict['plot'] = self.parsePlot()
    IMDbDict['keywords'] = self.parseKeywords()
-    IMDbDict['trivia'] = self.parseTrivia()
+    IMDbDict['trivia'] = getMovieTrivia(self.imdb)
    IMDbDict['connections'] = self.parseConnections()
    IMDbDict['locations'] = self.parseLocations()
    IMDbDict['release_date'] = self.parseReleaseinfo()
@ -403,21 +472,6 @@ class IMDb:
    self.keywords = keywords
    return self.keywords
  def parseTrivia(self):
    """Parse the page at self.triviaUrl into a list of trivia strings.

    Each <li> inside a <ul class="trivia"> is turned into a string with
    '<br />' removed; an enclosing '<li>'...'</li>' pair is stripped when
    both ends are present. The list is stored on self.trivia and returned.
    """
    page = getUrlUnicode(self.triviaUrl)
    collected = []
    for block in BeautifulSoup(page)('ul', {'class': "trivia"}):
      for entry in block('li'):
        text = str(entry).replace('<br />', '').strip()
        if text.startswith('<li>') and text.endswith('</li>'):
          # 4 == len('<li>'), 5 == len('</li>')
          text = text[4:-5].strip()
        collected.append(text)
    # assigned only after the whole page parsed, as before
    self.trivia = collected
    return self.trivia
  def getConnections(self):
    """Fetch self.connectionsUrl via getUrlUnicode and return the result."""
    connectionsPage = getUrlUnicode(self.connectionsUrl)
    return connectionsPage