# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

import re
import time
import urllib2
from urllib import quote, unquote

from BeautifulSoup import BeautifulSoup

import oxutils.net
from oxutils import stripTags, htmldecode, findRegexp, findString
from oxutils.cache import getUrl, getUrlUnicode
from oxutils.normalize import normalizeTitle, normalizeImdbId

import google

def getMovieId(title, director='', year=''):
  if year:
    title = "%s (%s)" % (title, year)
  if director:
    query = 'site:imdb.com %s "%s"' % (director, title)
  else:
    query = 'site:imdb.com "%s"' % title
  for (name, url, desc) in google.find(query, 3):
    if url.startswith('http://www.imdb.com/title/tt'):
      return url[28:35]

def getMovieData(imdbId):
  return IMDb(imdbId).parse()
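
# Example (hypothetical session; requires network access and a working
# google.find(); id and values shown are illustrative):
#   >>> getMovieId('The Matrix', year='1999')
#   '0133093'
#   >>> getMovieData('0133093')['title']
#   u'The Matrix'
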
# internal functions below

def getUrlBase(imdbId):
  return "http://www.imdb.com/title/tt%s" % imdbId

def getRawMovieData(imdbId):
  imdbId = normalizeImdbId(imdbId)
  data = getMovieInfo(imdbId)
  data['credits'] = getMovieCredits(imdbId)
  data['poster'] = getMoviePoster(imdbId)
  data['connections'] = getMovieConnections(imdbId)
  data['company credits'] = getMovieCompanyCredits(imdbId)
  data['filming locations'] = getMovieLocations(imdbId)
  #same data under a second key; reuse it instead of fetching twice
  data['movie connections'] = data['connections']
  data['external reviews'] = getMovieExternalReviews(imdbId)
  data['trivia'] = getMovieTrivia(imdbId)
  data['keywords'] = getMovieKeywords(imdbId)
  data['media'] = {}
  data['media']['images'] = getMovieImages(imdbId)
  data['media']['trailers'] = getMovieTrailers(imdbId)
  return data
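
# Sketch of the returned structure (keys from the scrapers above, values
# illustrative):
#   {'title': u'...', 'year': '...', 'credits': {...}, 'poster': '...',
#    'media': {'images': {...}, 'trailers': [...]}, ...}
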

def getMovieInfo(imdbId):
  data = getUrl(getUrlBase(imdbId))
  soup = BeautifulSoup(data)
  info = dict()
  info['poster'] = findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')

  def cleanUp(k):
    k = htmldecode(k).replace(u'\xa0', ' ').strip()
    if k.endswith('more'):
      k = k[:-len('more')].strip()
    return k

  for i in re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL).findall(data):
    title = stripTags(i[0]).strip().lower()
    txt = stripTags(i[1]).strip()
    txt = cleanUp(txt)
    if title not in ('plot', 'trivia', 'filming locations', 'mpaa'):
      if '|' in txt:
        txt = [cleanUp(k) for k in txt.split('|')]
      elif ', ' in txt:
        txt = [cleanUp(k) for k in txt.split(', ')]
    if not title.startswith('moviemeter'):
      info[title] = txt
  for key in ('user comments', 'writers (wga)'):
    if key in info:
      del info[key]
  if 'release date' in info:
    info['release date'] = info['release date'].split('\n')[0]
  if 'plot' in info:
    info['plot'] = info['plot'].split('| add synopsis')[0].strip()

  #get Title
  title = ''
  year = ''
  html_title = soup('div', {'id': 'tn15title'})
  if not html_title:
    html_title = soup('title')
  if html_title:
    html_title = str(html_title[0])
    html_title = html_title.replace('<br />', ' ').replace('  ', ' ')
    title = htmldecode(html_title)
    title = stripTags(title)
    year = findRegexp(title, '\((\d{4})\)')
    if not year:
      year = findRegexp(title, '\((\d{4})')
    title = re.sub('\(\d{4}\)', '', title)
    title = re.sub('\(\d{4}/I*\)', '', title)
    for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
      title = title.replace(t, '')
    title = title.strip()
    if title.find(u'\xa0') > -1:
      title = title[:title.find(u'\xa0')].strip()
    if title.startswith('"') and title.endswith('"'):
      title = title[1:-1]
  info['title'] = title
  info['year'] = year

  '''
  #Rating
  rating = findRegexp(data, '<b>(.*?)/10</b>')
  if rating:
    info['rating'] = int(float(rating) * 1000)
  else:
    info['rating'] = -1

  #Votes
  votes = findRegexp(data, '<small>\(<a href="ratings">(.*?) votes</a>\)</small>')
  if votes:
    info['votes'] = int(votes.replace(',', ''))
  else:
    info['votes'] = -1
  '''
  return info
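
# Example of the main-page fields (illustrative; keys follow the <h5>
# labels on the page, lowercased):
#   getMovieInfo('0133093') ->
#     {'title': u'The Matrix', 'year': '1999',
#      'director': [u'Andy Wachowski', u'Larry Wachowski'],
#      'genre': [u'Action', u'Sci-Fi'], 'poster': 'http://...', ...}
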

def getMoviePoster(imdbId):
  info = getMovieInfo(imdbId)
  return info['poster']

def getMovieYear(imdbId):
  info = getMovieInfo(imdbId)
  return info['year']

def getMovieTitle(imdbId):
  info = getMovieInfo(imdbId)
  return info['title']

def creditList(data, section=None):
  if section == 'cast':
    credits_ = re.compile('''<tr .*?<td class="nm">(.*?)</td><td class="ddd">.*?</td><td class="char">(.*?)</td></tr>''').findall(data)
  else:
    credits_ = re.compile('''<tr>.*?<td valign="top">(.*?)</td><td.*?</td><td valign="top">(.*?)</td></tr>''').findall(data)
  credits = []
  for c_ in credits_:
    c = [c_[0].strip(), c_[1].strip()]
    if section == 'writers':
      c[1] = c[1].replace('<br>', '').strip().replace(')', '').replace('(', '')
      if c[1].endswith(' and'):
        c[1] = c[1][:-4]
    credits.append(c)
  return credits

def getMovieCredits(imdbId):
  credits = dict()
  url = "%s/fullcredits" % getUrlBase(imdbId)
  data = getUrlUnicode(url)
  groups = data.split('<h5>')
  for g in groups:
    section = re.compile('''name="(.*?)".*? href="/Glossary''').findall(g)
    if section:
      credits[section[0]] = creditList(g, section[0])
  return credits
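
# Sketch of the credits structure: keys are the section anchors on the
# fullcredits page, values are [name, detail] pairs of raw page HTML
# (illustrative):
#   {'directors': [['<a href="/name/nm...">Name</a>', '']],
#    'cast': [['<a href="/name/nm...">Actor</a>', 'Character']], ...}
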

def getMovieTrailers(imdbId):
  url = "%s/trailers" % getUrlBase(imdbId)
  data = getUrlUnicode(url)
  soup = BeautifulSoup(data)
  videos = soup('div', {'class': "video-gallery"})
  trailers = []
  if videos:
    for a in videos[0]('a'):
      title = stripTags(unicode(a)).strip()
      url = 'http://www.imdb.com' + a['href']
      videoId = findRegexp(url, '/(vi\d*?)/')
      iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
      iframe = getUrlUnicode(iframeUrl)
      videoUrl = unquote(findRegexp(iframe, 'addVariable\("file", "(.*?)"'))
      trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv': videoUrl})
  return trailers
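
# Each trailer dict pairs the gallery entry with the flash stream pulled
# from the player iframe (urls illustrative):
#   {'title': u'Trailer', 'url': 'http://www.imdb.com/video/...',
#    'iframe': 'http://www.imdb.com/video/trailer/vi1234567/player',
#    'flv': 'http://...flv'}
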

def getMovieQuotes(imdbId):
  url = "%s/quotes" % getUrlBase(imdbId)
  data = getUrlUnicode(url)
  quotes = re.compile('<b>(.*?)</b>:(.*?)<br>', re.DOTALL).findall(findString(data, '<a name="q'))
  quotes = [(q[0].strip(), q[1].strip()) for q in quotes]
  return quotes

def getMovieTechnical(imdbId):
  url = "%s/technical" % getUrlBase(imdbId)
  data = getUrlUnicode(url)
  results = {}
  for t in re.compile('<h5>(.*?)</h5>(.*?)<br/>', re.DOTALL).findall(data):
    results[t[0].strip()] = t[1].strip()
  return results

def getMovieCompanyCredits(imdbId):
  url = "%s/companycredits" % getUrlBase(imdbId)
  data = getUrlUnicode(url)
  results = {}
  for field, c in re.compile('<h2>(.*?)</h2><ul>(.*?)</ul>').findall(data):
    results[field.strip()] = []
    for company in re.compile('<li>(.*?)</li>').findall(c):
      results[field.strip()].append(company)
  return results

def getMovieLocations(imdbId):
  url = "%s/locations" % getUrlBase(imdbId)
  data = getUrlUnicode(url)
  soup = BeautifulSoup(data)
  locations = []
  for key in soup('a', {'href': re.compile('^/List')}):
    locations.append(htmldecode(key.string))
  return locations

def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
  photos = {}
  for key in keys:
    url = "%s/mediaindex?refine=%s" % (getUrlBase(imdbId), key)
    data = getUrlUnicode(url)
    photos[key] = {}
    for s in re.compile('''<img alt="(.*?)".*?src="(http://ia.media-imdb.com/.*?\.jpg)''').findall(data):
      img = "%s.jpg" % s[1].split('._V')[0]
      title = s[0]
      if key == 'still_frame':
        #skip stills whose url carries crop markers
        if '_CR0' not in s[1]:
          photos[key][img] = title
      else:
        photos[key][img] = title
  return photos
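
# The thumbnails on the media index encode size/crop parameters after
# '._V' in the url; splitting there and re-appending '.jpg' should give
# the full-size image (url illustrative):
#   '.../MV5BMTkx..._V1._CR0,0,100,100_.jpg' -> '.../MV5BMTkx....jpg'
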

def getMovieStills(imdbId):
  return getMovieImages(imdbId, ['still_frame'])['still_frame']

def getMoviePosters(imdbId):
  return getMovieImages(imdbId, ['poster'])['poster']

def getMovieTrivia(imdbId):
  url = "%s/trivia" % getUrlBase(imdbId)
  data = getUrlUnicode(url)
  soup = BeautifulSoup(data)
  trivia = []
  for i in soup('ul', {'class': "trivia"}):
    for t in i('li'):
      t = str(t).replace('<br />', '').strip()
      if t.startswith('<li>') and t.endswith('</li>'):
        t = t[4:-5].strip()
      trivia.append(t)
  return trivia

def getMovieConnections(imdbId):
  url = "%s/movieconnections" % getUrlBase(imdbId)
  data = getUrlUnicode(url)
  soup = BeautifulSoup(data)
  connections = {}
  content = soup('div', {'id': 'tn15content'})[0]
  blocks = str(content).split('<h5>')[1:]
  for c in blocks:
    connection = c.split('</h5>')[0]
    cs = BeautifulSoup(c)
    if connection:
      #relation -> list of imdb ids
      connections[connection] = [findRegexp(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})]
  return connections
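
# Result maps the relation heading to seven-digit imdb ids (illustrative):
#   {'Follows': ['0133093'], 'Referenced in': ['0234215'], ...}
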

def getMovieKeywords(imdbId):
  url = "%s/keywords" % getUrlBase(imdbId)
  data = getUrlUnicode(url)
  soup = BeautifulSoup(data)
  keywords = []
  for key in soup('a', {'href': re.compile('^/keyword/')}):
    k = htmldecode(key.string)
    k = k.replace(u'\xa0', ' ')
    keywords.append(k)
  return keywords

def getMovieExternalReviews(imdbId):
  url = "%s/externalreviews" % getUrlBase(imdbId)
  data = getUrlUnicode(url)
  soup = BeautifulSoup(data)
  ol = soup('ol')
  if ol:
    ol = ol[0]
    ret = {}
    for li in ol('li'):
      try:
        a = li('a')[0]
        href = a.get('href')
        txt = a.contents[0]
        ret[href] = txt
      except:
        pass
    return ret
  return {}
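
# Result maps review url -> link text, e.g. (illustrative):
#   {'http://www.suntimes.com/...': u'Chicago Sun-Times [Roger Ebert]'}
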

'''the old code below'''

class IMDb:
  def __init__(self, imdbId):
    self.imdb = imdbId
    self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb

    self.businessUrl = "%sbusiness" % self.pageUrl
    self.creditsUrl = "%sfullcredits" % self.pageUrl
    self.episodesUrl = "%sepisodes" % self.pageUrl
    self.plotUrl = "%splotsummary" % self.pageUrl
    self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl

  def getPage(self):
    return getUrlUnicode(self.pageUrl)

  def parse_raw_value(self, key, value):
    if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
      value = unicode(value, 'utf-8')
      value = stripTags(value).strip()
    if key == 'runtime':
      parsed_value = findRegexp(value, '(.*?) min')
      parsed_value = findRegexp(parsed_value, '([0-9]+)')
      if not parsed_value:
        parsed_value = findRegexp(value, '(.*?) sec')
        parsed_value = findRegexp(parsed_value, '([0-9]+)')
        if not parsed_value:
          parsed_value = 0
        else:
          parsed_value = int(parsed_value)
      else:
        parsed_value = int(parsed_value) * 60
    elif key in ('country', 'language'):
      parsed_value = value.split(' / ')
      parsed_value = [v.strip() for v in parsed_value]
    elif key == 'genre':
      parsed_value = value.replace('more', '').strip().split(' / ')
      parsed_value = [v.strip() for v in parsed_value]
    elif key == 'tagline':
      parsed_value = value.replace('more', '').strip()
    elif key == 'plot_outline':
      parsed_value = value.replace('(view trailer)', '').strip()
      if parsed_value.endswith('more'):
        parsed_value = parsed_value[:-4].strip()
    elif key == 'tv_series':
      m = re.compile('<a href="/title/tt(.*?)/">(.*?)</a>').findall(value)
      if m:
        parsed_value = m[0][0]
      else:
        parsed_value = ''
    elif key == 'also_known_as':
      parsed_value = ''
      m = re.compile('(.*) \(International: English title').findall(value)
      if m:
        parsed_value = m[0]
      else:
        m = re.compile('(.*) \(USA').findall(value)
        if m:
          parsed_value = m[0]
      parsed_value = parsed_value.split('<br />')[-1].split('(')[0]
      director = self.getCredits().get('director', None)
      if director:
        director = director[0]
        parsed_value = parsed_value.replace(director, '')
        if parsed_value.startswith("'s"):
          parsed_value = parsed_value[2:].strip()
      parsed_value = parsed_value.strip()
    else:
      print value
      parsed_value = value
    return parsed_value

  def parseTitle(self):
    title = getMovieTitle(self.imdb)
    title = normalizeTitle(title)
    if title.startswith('"') and title.find('"', 1) > 0 and \
       title.find('"', 1) == title.rfind('"'):
      #series episode: look up season/episode on the main page
      data = self.getPage()
      se = re.compile("Season (\d*), Episode (\d*)\)").findall(data)
      if se:
        se = se[0]
        se = ' (S%02dE%02d)' % (int(se[0]), int(se[1]))
        title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:]
      else:
        title = normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:]
    return normalizeTitle(title)

  def parseYear(self):
    year = ''
    data = self.getPage()
    soup = BeautifulSoup(data)
    html_title = soup('div', {'id': 'tn15title'})
    if not html_title:
      html_title = soup('title')
    if html_title:
      html_title = str(html_title[0])
      html_title = stripTags(html_title)
      year = re.compile('\((\d{4})\)').findall(html_title)
      if not year:
        year = re.compile('\((\d{4})/').findall(html_title)
      if year:
        year = year[0]
      else:
        year = ''
    return year

  def parse(self):
    data = self.getPage()
    IMDbDict = {}
    #Poster
    IMDbDict['poster'] = getMoviePoster(self.imdb)
    if not IMDbDict['poster']:
      IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
    #Title, Year
    IMDbDict['year'] = self.parseYear()
    IMDbDict['title'] = self.parseTitle()

    #Rating
    m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
    if m:
      IMDbDict['rating'] = int(float(m.group(1)) * 1000)
    else:
      IMDbDict['rating'] = -1
    #Votes
    m = re.compile('<small>\(<a href="ratings">(.*?) votes</a>\)</small>', re.IGNORECASE).findall(data)
    if m:
      IMDbDict['votes'] = int(m[0].replace(',', ''))
    else:
      IMDbDict['votes'] = -1

    data = data.replace('\n', ' ')
    #some values
    keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
    for key in keys:
      IMDbDict[key] = ''
    IMDbDict['runtime'] = 0
    soup = BeautifulSoup(data)
    for info in soup('div', {'class': 'info'}):
      key = str(info).split('</h5>')[0].split('<h5>')
      if len(key) > 1:
        raw_value = str(info).split('</h5>')[1]
        key = key[1][:-1].lower().replace(' ', '_')
        if key in keys:
          IMDbDict[key] = self.parse_raw_value(key, raw_value)
    IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title'])
    #is episode
    IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')

    IMDbDict['episodes'] = self.parseEpisodes()
    if IMDbDict['episodes']:
      IMDbDict['tvshow'] = True
    else:
      IMDbDict['tvshow'] = False
    IMDbDict['credits'] = self.getCredits()
    IMDbDict['plot'] = self.parsePlot()
    IMDbDict['keywords'] = getMovieKeywords(self.imdb)

    IMDbDict['trivia'] = getMovieTrivia(self.imdb)
    IMDbDict['connections'] = getMovieConnections(self.imdb)
    IMDbDict['locations'] = getMovieLocations(self.imdb)
    IMDbDict['release_date'] = self.parseReleaseinfo()
    IMDbDict['business'] = self.parseBusiness()
    IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
    IMDbDict['stills'] = getMovieStills(self.imdb)
    #IMDbDict['trailer'] = getMovieTrailer(self.imdb)
    self.IMDbDict = IMDbDict

    if IMDbDict['episode_of']:
      episode_of = IMDb(IMDbDict['episode_of']).parse()
      for key in ('country', 'language'):
        if not IMDbDict[key]:
          IMDbDict[key] = episode_of[key]
    return self.IMDbDict
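
  # Note on scale: the page shows a rating like '8.4/10'; parse() stores
  # int(float('8.4') * 1000) == 8400, and -1 when no rating was found.
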

  def getCredits(self):
    #raw credits come from the module-level fullcredits scraper
    raw_credits = getMovieCredits(self.imdb)
    credits = {}

    def getNames(creditList):
      return [stripTags(c[0]) for c in creditList]

    credits['director'] = getNames(raw_credits.get('directors', []))
    credits['writer'] = getNames(raw_credits.get('writers', []))
    credits['producer'] = getNames(raw_credits.get('producers', []))
    credits['cast'] = [(stripTags(c[0]), stripTags(c[1])) for c in raw_credits.get('cast', [])]

    self.credits = credits
    return self.credits

  def parsePlot(self):
    data = getUrlUnicode(self.plotUrl)
    soup = BeautifulSoup(data)
    plot = soup('p', {'class': 'plotpar'})
    if plot:
      plot = unicode(plot[0]).split('<i>')[0]
    else:
      plot = u''
    plot = stripTags(plot).strip()
    self.plot = plot
    return plot

  def parseEpisodes(self):
    episodes = {}
    data = getUrlUnicode(self.episodesUrl)
    cdata = data.replace('\r\n', ' ')
    regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
    reg = re.compile(regexp, re.IGNORECASE)
    m = reg.findall(cdata)
    for match in m:
      try:
        episode = "S%02dE%02d" % (int(match[0]), int(match[1]))
        episodes[episode] = {}
        episodes[episode]['imdb'] = match[2]
        episodes[episode]['title'] = match[3].strip()
        if episodes[episode]['title'].startswith('Episode #%d' % int(match[0])):
          episodes[episode]['title'] = u''
        description = htmldecode(match[5])
        description = stripTags(description.split('Next US airings:')[0])
        episodes[episode]['description'] = description
        episodes[episode]['date'] = ''
        try:
          d = stripTags(match[4])
          d = d.replace('Original Air Date: ', '')
          d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
          episodes[episode]['date'] = d
        except:
          pass
      except:
        import traceback
        traceback.print_exc()
    self.episodes = episodes
    return self.episodes
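
  # Episodes are keyed 'S%02dE%02d' (values illustrative):
  #   {'S01E01': {'imdb': '0000000', 'title': u'Pilot',
  #               'description': u'...', 'date': '2008-01-01'}}
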

  def getReleaseinfo(self):
    return getUrlUnicode(self.releaseinfoUrl)

  def parseReleaseinfo(self):
    soup = BeautifulSoup(self.getReleaseinfo())
    info = soup('table', {'border': '0', 'cellpadding': '2'})
    if info:
      for row in info[0]('tr'):
        d = row('td', {'align': 'right'})
        if d:
          try:
            possible_date = stripTags(str(d[0])).strip()
            rdate = time.strptime(possible_date, "%d %B %Y")
            rdate = time.strftime('%Y-%m-%d', rdate)
            return rdate
          except:
            pass
    return None

  def getBusiness(self):
    return getUrlUnicode(self.businessUrl)

  def parseBusiness(self):
    soup = BeautifulSoup(self.getBusiness())
    business = {'budget': 0, 'gross': 0, 'profit': 0}
    content = soup('div', {'id': 'tn15content'})[0]
    blocks = str(content).split('<h5>')[1:]
    for c in blocks:
      line = c.split('</h5>')
      if line:
        title = line[0]
        line = line[1]
        if title in ['Budget', 'Gross']:
          values = re.compile('\$(.*?) ').findall(line)
          values = [int(value.replace(',', '')) for value in values]
          if values:
            business[title.lower()] = max(values)
    if business['budget'] and business['gross']:
      business['profit'] = business['gross'] - business['budget']
    return business
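
# Worked example (figures illustrative): a business page listing
# 'Budget $63,000,000 (estimated)' and 'Gross $171,479,930 (USA)'
# parses to {'budget': 63000000, 'gross': 171479930, 'profit': 108479930};
# max() is taken because Gross is listed once per territory and date.
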

def guess(title, director=''):
  #FIXME: proper file -> title
  title = title.split('-')[0]
  title = title.split('(')[0]
  title = title.split('.')[0]
  title = title.strip()
  imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))
  return_url = ''

  #let's first try google
  #i.e. site:imdb.com Michael Stevens Sin
  if director:
    search = 'site:imdb.com %s "%s"' % (director, title)
  else:
    search = 'site:imdb.com "%s"' % title
  for (name, url, desc) in google.find(search, 2):
    if url.startswith('http://www.imdb.com/title/tt'):
      return url[28:35]

  try:
    req = urllib2.Request(imdb_url, None, oxutils.net.DEFAULT_HEADERS)
    u = urllib2.urlopen(req)
    data = u.read()
    return_url = u.url
    u.close()
  except:
    return None
  if return_url.startswith('http://www.imdb.com/title/tt'):
    return return_url[28:35]
  if data:
    imdb_id = findRegexp(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
    if imdb_id:
      return imdb_id

  imdb_url = 'http://www.imdb.com/find?q=%s;s=tt;site=aka' % quote(title.encode('utf-8'))
  req = urllib2.Request(imdb_url, None, oxutils.net.DEFAULT_HEADERS)
  u = urllib2.urlopen(req)
  data = u.read()
  return_url = u.url
  u.close()
  if return_url.startswith('http://www.imdb.com/title/tt'):
    return return_url[28:35]

  return None

def getEpisodeData(title, episode, show_url=None):
  '''
  Collect information about an episode.

  Returns dict with title, show, description and episode
  '''
  episodeData = {
    'title': u'',
    'show': title,
    'description': u'',
    'episode': episode,
  }
  if not show_url:
    imdbid = guess(title)
  else:
    imdbid = "%07d" % int(re.compile('title/tt(\d*)').findall(show_url)[0])
  if imdbid:
    i = IMDb(imdbid).parse()
    episodeData['title'] = i['episodes'][episode]['title']
    episodeData['description'] = i['episodes'][episode]['description']
    episodeData['imdb'] = i['episodes'][episode]['imdb']
  return episodeData
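
# Example call (hypothetical; the show must resolve via guess() and have
# an episodes page; values illustrative):
#   getEpisodeData(u'Some Show', 'S01E02')
#   -> {'title': u'...', 'show': u'Some Show', 'description': u'...',
#       'episode': 'S01E02', 'imdb': '0000000'}
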

if __name__ == '__main__':
  import sys
  #print parse(sys.argv[1])
  print "imdb:", guess(sys.argv[1])