more raw values from imdb

2008-04-30 00:15:28 +02:00 · 2008-04-30 00:15:28 +02:00 · 8b58b4824b
commit 8b58b4824b
parent abf263af13
1 changed files with 105 additions and 51 deletions
--- a/ox/imdb.py
+++ b/ox/imdb.py
@ -43,32 +43,39 @@ def getRawMovieData(imdbId):
  data['credits'] = getCredits(imdbId)
  data['poster'] = getPoster(imdbId)
  data['trailers'] = getMovieTrailers(imdbId)
+  data['companyCredits'] = getMovieCompanyCredits(imdbId)

 def parseBase(imdbId):
  data = getUrl(getUrlBase(imdbId))
  soup = BeautifulSoup(data)
  info = dict()
-  for i in soup('div', {'class':'info'}):
+  info['poster'] = findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')

-    title = i('h5')
-    if title:
-      title=title[0]
-      txt = title.findNext()
-      title = stripTags(unicode(title))
-      if title.endswith(':'):
-        title = title[:-1]
-      info[title] = htmldecode(stripTags(unicode(txt)))
-  return info
-  return soup
+  for i in re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL).findall(data):
+    title = stripTags(i[0]).strip().lower()
+    txt= stripTags(i[1]).strip()
+    def cleanUp(k):
+      k = htmldecode(k).replace(u'\xa0', ' ').strip()
+      if k.endswith('more'): k=k[:-len('more')].strip()
+      return k
+    txt = cleanUp(txt)
+    if title not in ('plot', 'trivia', 'filming locations', 'mpaa'):
+      if '|' in txt:
+        txt = [cleanUp(k) for k in txt.split('|')]
+      elif ', ' in txt:
+        txt = [cleanUp(k) for k in txt.split(', ')]
+    if not title.startswith('moviemeter'):
+      info[title] = txt
+  for key in ('user comments', 'writers (wga)'):
+   if key in info:
+    del info[key]
+  if 'release date' in info:
+    info['release date'] = info['release date'].split('\n')[0]
+  if 'plot' in info:
+    info['plot'] = info['plot'].split('| add synopsis')[0].strip()

-def getPoster(imdbId):
-  data = getUrl(getUrlBase(imdbId))
-  return findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')
-
-def getTitle(imdbId):
+  #get Title
  title = ''
-  data = getUrl(getUrlBase(imdbId))
-  soup = BeautifulSoup(data)
  html_title = soup('div', {'id': 'tn15title'})
  if not html_title:
    html_title = soup('title')
@ -83,10 +90,19 @@ def getTitle(imdbId):
      title = title.replace(t, '')
  title = title.strip()
  if title.find(u'\xa0') > -1:
-    title = title[:title.find(u'\xa0')]
+    title = title[:title.find(u'\xa0')].strip()
  if title.startswith('"') and title.endswith('"'):
    title = title[1:-1]
-  return title
+  info['title'] = title
+  return info
+
+def getPoster(imdbId):
+  '''Return the 'poster' entry of the dict built by parseBase(imdbId).'''
+  return parseBase(imdbId)['poster']
+
+def getTitle(imdbId):
+  '''Return the 'title' entry of the dict built by parseBase(imdbId).'''
+  return parseBase(imdbId)['title']

 def creditList(data, section=None):
  if section == 'cast':
@ -130,21 +146,75 @@ def getMovieTrailers(imdbId):
      trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
  return trailers

-def getMovieStills(imdbId):
-  url = "http://www.imdb.com/gallery/ss/%s" % imdbId
+def getMovieQuotes(imdbId):
+  url = "%s/quotes" % getUrlBase(imdbId)
  data = getUrlUnicode(url)
-  s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://i.imdb.com/Photos/Ss/%s/th-(.*?).jpg"''' % imdbId).findall(data)
-  stills = []
-  for s in s_:
-    if int(s[0]) > int(s[1]):
-      stills.append("http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (imdbId, s[2]))
-  if not stills:
-    s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://(.*?)p.jpg"''').findall(data)
-    stills = []
-    for s in s_:
-      if int(s[0]) > int(s[1]):
-        stills.append("http://%sf.jpg" % s[2])
-  return stills
+  quotes = re.compile('<b>(.*?)</b>:(.*?)<br>', re.DOTALL).findall(data)
+  quotes = [(q[0].strip(),q[1].strip())  for q in quotes]
+  return quotes
+
+def getMovieTechnical(imdbId):
+  '''Scrape the "/technical" page of a title into a dict.
+
+  Each <h5>heading</h5> found on the page becomes a key, mapped to the
+  text between that heading and the next "<br/>"; both are stripped of
+  surrounding whitespace. A repeated heading keeps its last value.
+  '''
+  page = getUrlUnicode("%s/technical" % getUrlBase(imdbId))
+  entry = re.compile('<h5>(.*?)</h5>(.*?)<br/>', re.DOTALL)
+  return dict((heading.strip(), value.strip())
+              for heading, value in entry.findall(page))
+
+def getMovieCompanyCredits(imdbId):
+  '''Scrape the "/companycredits" page of a title.
+
+  Returns a dict mapping each stripped <h2> heading to the list of raw
+  <li> contents from the <ul> that directly follows it. Neither pattern
+  uses re.DOTALL, so a heading/list pair or a list item is only picked
+  up when it sits on a single line of the page.
+  '''
+  page = getUrlUnicode("%s/companycredits" % getUrlBase(imdbId))
+  sectionPattern = re.compile('<h2>(.*?)</h2><ul>(.*?)</ul>')
+  itemPattern = re.compile('<li>(.*?)</li>')
+  companies = {}
+  for heading, listHtml in sectionPattern.findall(page):
+    companies[heading.strip()] = itemPattern.findall(listHtml)
+  return companies
+
+def getMovieLocations(imdbId):
+  '''Return the html-decoded text of every link on the "/locations"
+  page whose href starts with "/List".'''
+  page = getUrlUnicode("%s/locations" % getUrlBase(imdbId))
+  links = BeautifulSoup(page)('a', {'href': re.compile('^/List')})
+  return [htmldecode(link.string) for link in links]
+
+def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
+  '''Collect image urls from the "mediaindex" pages of a title.
+
+  keys names the "refine" categories to fetch, one request per key.
+  Returns {key: {imageUrl: altText}}. imageUrl is the thumbnail url with
+  everything from "._V" onwards replaced by ".jpg". For 'still_frame',
+  thumbnails whose url contains "_CR0" are skipped.
+  '''
+  # compiled once for all keys; dots are escaped so that only the literal
+  # host name and a real ".jpg" extension match
+  thumbnail = re.compile(r'<img alt="(.*?)".*?src="(http://ia\.media-imdb\.com/.*?\.jpg)')
+  photos = {}
+  for key in keys:
+    url = "%s/mediaindex?refine=%s" % (getUrlBase(imdbId), key)
+    data = getUrlUnicode(url)
+    photos[key] = {}
+    for title, src in thumbnail.findall(data):
+      if key == 'still_frame' and '_CR0' in src:
+        continue
+      if '._V' in src:
+        img = "%s.jpg" % src.split('._V')[0]
+      else:
+        # no "._V" suffix to cut off; appending ".jpg" again would
+        # produce a broken "....jpg.jpg" url
+        img = src
+      photos[key][img] = title
+  return photos
+
+def getMovieStills(imdbId):
+  '''Return the 'still_frame' mapping from getMovieImages for this title.'''
+  category = 'still_frame'
+  return getMovieImages(imdbId, [category])[category]
+
+def getMoviePosters(imdbId):
+  '''Return the 'poster' mapping from getMovieImages for this title.'''
+  category = 'poster'
+  return getMovieImages(imdbId, [category])[category]
+
+def getMovieTrivia(imdbId):
+  '''Return the entries of the "/trivia" page as a list of html strings.
+
+  Every <li> inside a <ul class="trivia"> is serialised with str(), has
+  '<br />' removed and, when it is wrapped in a plain <li>...</li> pair,
+  has those enclosing tags cut off.
+  '''
+  url = "%s/trivia" % getUrlBase(imdbId)
+  data = getUrlUnicode(url)
+  soup = BeautifulSoup(data)
+  trivia = []
+  for i in soup('ul', {'class': "trivia"}):
+    for t in i('li'):
+      t = str(t).replace('<br />', '').strip()
+      if t.startswith('<li>') and t.endswith('</li>'):
+        # drop the 4-character opening and 5-character closing tag
+        t = t[4:-5].strip()
+      trivia.append(t)
+  return trivia

 '''the old code below'''

@ -160,7 +230,6 @@ class IMDb:
    self.keywordUrl = "%skeywords" % self.pageUrl
    self.plotUrl = "%splotsummary" % self.pageUrl
    self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
-    self.triviaUrl = "%strivia" % self.pageUrl
    self.locationUrl = "%slocations" % self.pageUrl
    self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl

@ -306,7 +375,7 @@ class IMDb:
    IMDbDict['plot'] = self.parsePlot()
    IMDbDict['keywords'] = self.parseKeywords()

-    IMDbDict['trivia'] = self.parseTrivia()
+    IMDbDict['trivia'] = getMovieTrivia(self.imdb)
    IMDbDict['connections'] = self.parseConnections()
    IMDbDict['locations'] = self.parseLocations()
    IMDbDict['release_date'] = self.parseReleaseinfo()
@ -403,21 +472,6 @@ class IMDb:
    self.keywords = keywords
    return self.keywords

-  def parseTrivia(self):
-    data = getUrlUnicode(self.triviaUrl)
-    soup = BeautifulSoup(data)
-
-    trivia = []
-    triviaList = []
-    for i in  soup('ul', {'class': "trivia"}):
-      for t in i('li'):
-        t = str(t).replace('<br />', '').strip()
-        if t.startswith('<li>') and t.endswith('</li>'):
-          t = t[4:-5].strip()
-        trivia.append(t)
-    self.trivia = trivia
-    return self.trivia
-
  def getConnections(self):
    return getUrlUnicode(self.connectionsUrl)