scrapeit/scrapeit/imdb.py

# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

import urllib2
from urllib import quote
import re, time
import os

from elementtree.ElementTree import parse, tostring
from BeautifulSoup import BeautifulSoup

from google import google
from utils import stripTags, read_url_utf8, htmldecode

import utils

def read_url(url):
  '''
    Return the content of url, served from the on-disk cache when possible.

    The cache file lives below /var/cache/scrapeit/cache/ at a path derived
    from url with every 'http://' removed. On a cache miss the data is
    fetched with utils.read_url and written to the cache before it is
    returned.
  '''
  base = "/var/cache/scrapeit/cache/"
  path = os.path.join(base, url.replace('http://',''))
  # a url that names a directory is stored as index.html inside it
  if path.endswith('/'):
    path = "%sindex.html" % path
  if os.path.isdir(path):
    path = "%s/index.html" % path
  if os.path.exists(path):
    # binary mode: get_image sends image data through this cache as well
    f = open(path, 'rb')
    try:
      return f.read()
    finally:
      f.close()
  data = utils.read_url(url)
  folder = os.path.dirname(path)
  if not os.path.exists(folder):
    os.makedirs(folder)
  f = open(path, 'wb')
  try:
    f.write(data)
  finally:
    # close the handle even if the write fails
    f.close()
  return data

def _get_data(url):
  '''
    Return the content of url as read by read_url, or None on failure.

    Errors are reported on stdout instead of being raised.
  '''
  try:
    return read_url(url)
  except Exception:
    # Exception instead of a bare except, so that KeyboardInterrupt and
    # SystemExit are not swallowed on current Python versions
    print("error reading data from %s" % (url,))
    return None

def get_image(url):
  '''
    Return the data found at url, read through read_url.
  '''
  image_data = read_url(url)
  return image_data

def _castList(data, regexp):
  soup = re.compile(regexp).findall(data)
  if soup:
    soup = BeautifulSoup(soup[0])
    names = []
    for i in soup('a', {'href': re.compile('/name/nm')}):
      if i.string:
        cast = stripTags(i.string)
        if cast not in names:
          names.append(cast)
    return names
  return []

def _getTerm(data, regexp):
  term = ''
  try:
    reg = re.compile(regexp, re.IGNORECASE)
    m = reg.search(data)
    if m:
      term = stripTags(m.group(1)).strip()
  except:
    print "waring, parsing failed for", regexp
  return term.encode('utf8')


class IMDb:
  '''
    Scraper for the imdb.com pages that belong to one title.

    imdb is the id that follows 'tt' in the title url. Every get* method
    fetches its page with read_url on first use and keeps the source on the
    instance; pass forcereload=True to fetch it again. The parse* methods
    turn those sources into plain Python values and parse() combines them
    into a single dict.
  '''
  def __init__(self, imdb):
    # only urls are prepared here, nothing is fetched yet
    self.imdb = imdb
    self.pageSource = None
    self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb

    self.businessSource = None
    self.businessUrl = "%sbusiness" % self.pageUrl
    self.connectionsSource = None
    self.connectionsUrl = "%smovieconnections" % self.pageUrl
    self.creditsSource = None
    self.creditsUrl = "%sfullcredits" % self.pageUrl
    self.episodesSource = None
    self.episodesUrl = "%sepisodes" % self.pageUrl
    self.keywordSource = None
    self.keywordUrl = "%skeywords" % self.pageUrl
    self.plotSource = None
    self.plotUrl = "%splotsummary" % self.pageUrl
    self.releaseinfoSource = None
    self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
    self.triviaSource = None
    self.triviaUrl = "%strivia" % self.pageUrl

  def getPage(self, forcereload = False):
    '''Return the source of the main title page.'''
    if forcereload or not self.pageSource:
      self.pageSource = read_url(self.pageUrl)
    return self.pageSource

  def parse_raw_value(self, key, value):
    '''
      Convert the raw html that follows an info heading into a value.

      runtime                 -> seconds as int (0 if no number is found)
      country, language       -> list, split on ' / '
      genre                   -> list, split on ' / ', 'more' removed
      tagline, plot_outline   -> text without the trailing 'more' link text
      tv_series               -> imdb id of the linked series, '' if none
      any other key           -> value unchanged
    '''
    if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
      value = stripTags(value).strip()
    if key == 'runtime':
      # minutes win over seconds; the second _getTerm drops prefixes
      # such as a country name in front of the number
      minutes = _getTerm(_getTerm(value, '(.*?) min'), '([0-9]+)')
      if minutes:
        parsed_value = int(minutes) * 60
      else:
        seconds = _getTerm(_getTerm(value, '(.*?) sec'), '([0-9]+)')
        if seconds:
          parsed_value = int(seconds)
        else:
          parsed_value = 0
    elif key in ('country', 'language'):
      parsed_value = value.split(' / ')
    elif key == 'genre':
      parsed_value = value.replace('more', '').strip().split(' / ')
    elif key == 'tagline':
      # only cut a trailing 'more', the same way plot_outline does;
      # replacing every 'more' damaged taglines that contain the word
      parsed_value = value.strip()
      if parsed_value.endswith('more'):
        parsed_value = parsed_value[:-4].strip()
    elif key == 'plot_outline':
      parsed_value = value.replace('(view trailer)', '').strip()
      if parsed_value.endswith('more'):
        parsed_value = parsed_value[:-4].strip()
    elif key == 'tv_series':
      m = re.compile('<a href="/title/tt(.*?)/">(.*?)</a>').findall(value)
      if m:
        parsed_value = m[0][0]
      else:
        parsed_value = ''
    else:
      parsed_value = value
    return parsed_value

  def parse(self):
    '''
      Parse the title page and all sub pages into one dict.

      Keys: poster, title, year, rating, votes, runtime, language, genre,
      country, tagline, plot_outline, episode_of, episodes, credits, plot,
      keywords, trivia, connections, release_date and business. rating and
      votes are -1 when the page shows no rating. The dict is also stored
      as self.IMDbDict.
    '''
    data = self.getPage()
    IMDbDict = {}
    #Poster
    IMDbDict['poster'] = _getTerm(data, 'name="poster".*?<img .*?src="(.*?)"')
    if not IMDbDict['poster']:
      IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
    #Title, Year
    flat_data = data.replace('\n', '').replace('\r', '')
    html_title = re.compile(r'<strong class="title">(.*?) <small>\(<a href="/Sections/Years/(.*?)">').findall(flat_data)
    if html_title:
      IMDbDict['year'] = html_title[0][1]
      IMDbDict['title'] = stripTags(html_title[0][0]).strip()
    else:
      # fall back to '<title>Name (Year)</title>'
      parts = _getTerm(data, '<title>(.*?)</title>').split('(')
      year = ''
      if len(parts) > 1:
        # without a '(' there is no year; do not reuse the title text
        year = parts[-1].split(')')[0].strip()
      IMDbDict['title'] = parts[0].strip().decode('utf-8')
      IMDbDict['year'] = year
    IMDbDict['title'] = htmldecode(IMDbDict['title'])
    # drop surrounding double quotes; startswith/endswith also cope
    # with an empty title
    title = IMDbDict['title']
    if title.startswith('"') and title.endswith('"'):
      IMDbDict['title'] = title[1:-1]

    #Votes
    m = re.compile(r'<b>(.*?)/10</b> \(<a href="ratings">(.*?) votes</a>\)', re.IGNORECASE).search(data)
    if m:
      # rating is stored as an int, scaled by 1000
      IMDbDict['rating'] = int(float(m.group(1)) * 1000)
      IMDbDict['votes'] = int(m.group(2).replace(',', ''))
    else:
      IMDbDict['rating'] = -1
      IMDbDict['votes'] = -1

    data = data.replace('\n',' ')
    #some values
    keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series')
    for key in keys:
      IMDbDict[key] = ''
    IMDbDict['runtime'] = 0
    soup = BeautifulSoup(data)
    for info in soup('div', {'class': 'info'}):
      info_html = str(info)
      key = info_html.split('</h5>')[0].split('<h5>')
      if len(key) > 1:
        raw_value = info_html.split('</h5>')[1]
        # heading text without its last character, e.g. 'Plot Outline:'
        # becomes 'plot_outline'
        key = key[1][:-1].lower().replace(' ', '_')
        if key in keys:
          IMDbDict[key] = self.parse_raw_value(key, raw_value)

    #is episode
    IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')

    IMDbDict['episodes'] = self.parseEpisodes()
    IMDbDict['credits'] = self.parseCredits()
    IMDbDict['plot'] = self.parsePlot()
    IMDbDict['keywords'] = self.parseKeywords()

    IMDbDict['trivia'] = self.parseTrivia()
    IMDbDict['connections'] = self.parseConnections()
    IMDbDict['release_date'] = self.parseReleaseinfo()
    IMDbDict['business'] = self.parseBusiness()
    self.IMDbDict = IMDbDict
    return self.IMDbDict

  def getCredits(self, forcereload = False):
    '''Return the source of the fullcredits page.'''
    if forcereload or not self.creditsSource:
      self.creditsSource = read_url(self.creditsUrl)
    return self.creditsSource

  def parseCredits(self):
    '''
      Return a dict with director, writer and producer (lists of names)
      and cast (list of (name, role) tuples); also stored as self.credits.
    '''
    data = self.getCredits()
    credits = {}
    credits['director'] = _castList(data, 'Directed by.*?(<tr>.*?)</table>')
    credits['writer'] = _castList(data, 'Writing credits.*?(<tr>.*?)</table>')
    credits['producer'] = _castList(data, 'Produced by.*?(<tr>.*?)</table>')
    credits['cast'] = []
    soup = BeautifulSoup(data)
    cast = soup('table', {'class': 'cast'})
    if cast:
      cast = str(cast[0])
      names = re.compile('<a href="/name/nm.*?/">(.*?)</a>.*?</td><td class="char">(.*?)</td></tr>').findall(cast)
      for name in names:
        real_name = name[0]
        role_name = name[1]
        if role_name:
          # keep the role only, without '(...)' notes and the '/ ...' marker
          role_name = role_name.split('(')[0].replace('/ ...','').strip()
        credits['cast'].append((stripTags(real_name), stripTags(role_name)))
    self.credits = credits
    return self.credits

  def getPlot(self, forcereload = False):
    '''Return the source of the plotsummary page.'''
    if forcereload or not self.plotSource:
      self.plotSource = read_url(self.plotUrl)
    return self.plotSource

  def parsePlot(self):
    '''
      Return the text of the first <p class="plotpar">, cut at its first
      <i> tag, or an empty string; also stored as self.plot.
    '''
    soup = BeautifulSoup(self.getPlot())
    plot = soup('p', {'class':'plotpar'})
    if plot:
      plot = str(plot[0]).split('<i>')[0]
    else:
      plot = u''
    plot = stripTags(plot).strip()
    self.plot = plot
    return plot

  def getEpisodes(self, forcereload = False):
    '''Return the source of the episodes page.'''
    if forcereload or not self.episodesSource:
      self.episodesSource = read_url(self.episodesUrl)
    return self.episodesSource

  def parseEpisodes(self):
    '''
      Return a dict that maps 'SxxEyy' to a dict with imdb, title and
      description; also stored as self.episodes.

      An episode that can not be parsed is skipped after its traceback
      has been printed, so every entry in the result is complete.
    '''
    episodes = {}
    cdata = self.getEpisodes().replace('\r\n',' ')
    regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>.*?</b><br>(.*?)<br/>'''
    reg = re.compile(regexp, re.IGNORECASE)
    for match in reg.findall(cdata):
      try:
        episode = "S%02dE%02d" % (int(match[0]), int(match[1]))
        description = htmldecode(match[4])
        description = stripTags(description.split('Next US airings:')[0])
        entry = {
          'imdb': match[2],
          'title': match[3].strip(),
          'description': description,
        }
      except Exception:
        import traceback
        traceback.print_exc()
        continue
      # store the entry only once it is complete
      episodes[episode] = entry
    self.episodes = episodes
    return self.episodes

  def getKeywords(self, forcereload = False):
    '''Return the source of the keywords page.'''
    if forcereload or not self.keywordSource:
      self.keywordSource = read_url(self.keywordUrl)
    return self.keywordSource

  def parseKeywords(self):
    '''
      Return the decoded text of all links whose href contains '/keyword';
      also stored as self.keywords.
    '''
    soup = BeautifulSoup(self.getKeywords())
    keywords = []
    for key in soup('a', {'href': re.compile('/keyword')}):
      # links without a string are skipped, as _castList does
      if key.string:
        keywords.append(htmldecode(key.string))
    self.keywords = keywords
    return self.keywords

  def getTrivia(self, forcereload = False):
    '''Return the source of the trivia page.'''
    if forcereload or not self.triviaSource:
      self.triviaSource = read_url(self.triviaUrl)
    return self.triviaSource

  def parseTrivia(self):
    '''
      Return the html of every <li> inside <ul class="trivia">, without
      '<br />' and without the enclosing <li> tags; also stored as
      self.trivia.
    '''
    trivia = []
    soup = BeautifulSoup(self.getTrivia())
    for i in soup('ul', {'class': "trivia"}):
      for t in i('li'):
        t = str(t).replace('<br />', '').strip()
        if t.startswith('<li>') and t.endswith('</li>'):
          t = t[4:-5].strip()
        trivia.append(t)
    self.trivia = trivia
    return self.trivia

  def getConnections(self, forcereload = False):
    '''Return the source of the movieconnections page.'''
    if forcereload or not self.connectionsSource:
      self.connectionsSource = read_url(self.connectionsUrl)
    return self.connectionsSource

  def parseConnections(self):
    '''
      Return a dict that maps each <h5> heading inside the tn15content div
      to the list of imdb ids linked below it. The dict is empty when the
      page has no such div.
    '''
    connections = {}
    soup = BeautifulSoup(self.getConnections())
    content = soup('div', {'id': 'tn15content'})
    if not content:
      return connections
    title_id = re.compile(r'/title/tt(\d{7})')
    for c in str(content[0]).split('<h5>')[1:]:
      connection = c.split('</h5>')[0]
      if connection:
        #relation -> list of imdb ids
        ids = []
        for a in BeautifulSoup(c)('a', {'href': re.compile('/title/tt')}):
          m = title_id.search(a.get('href'))
          if m:
            ids.append(m.group(1))
        connections[connection] = ids
    return connections

  def getReleaseinfo(self, forcereload = False):
    '''Return the source of the releaseinfo page.'''
    if forcereload or not self.releaseinfoSource:
      self.releaseinfoSource = read_url(self.releaseinfoUrl)
    return self.releaseinfoSource

  def parseReleaseinfo(self):
    '''
      Return the first date in the release table that parses as
      'day month year', formatted as 'YYYY-MM-DD', or None.
    '''
    soup = BeautifulSoup(self.getReleaseinfo())
    tables = soup('table', {'border': '0', 'cellpadding': '2'})
    if not tables:
      return None
    for row in tables[0]('tr'):
      d = row('td', {'align': 'right'})
      if d:
        try:
          possible_date = stripTags(str(d[0])).strip()
          rdate = time.strptime(possible_date, "%d %B %Y")
          return time.strftime('%Y-%m-%d', rdate)
        except Exception:
          # cells that do not hold a complete date are skipped
          pass
    return None

  def getBusiness(self, forcereload = False):
    '''Return the source of the business page.'''
    if forcereload or not self.businessSource:
      self.businessSource = read_url(self.businessUrl)
    return self.businessSource

  def parseBusiness(self):
    '''
      Return a dict with budget, gross and profit.

      budget and gross are the largest '$' amount listed below the
      'Budget' and 'Gross' headings, 0 if none is found. profit is
      gross - budget when both are known, otherwise 0.
    '''
    soup = BeautifulSoup(self.getBusiness())
    business = {'budget': 0, 'gross': 0, 'profit': 0}
    content = soup('div', {'id': 'tn15content'})
    if not content:
      return business
    # digits and thousands separators only, so that markup or text that
    # follows the amount can not break int()
    amount = re.compile(r'\$([0-9][0-9,]*)')
    for c in str(content[0]).split('<h5>')[1:]:
      line = c.split('</h5>', 1)
      if len(line) < 2:
        continue
      title = line[0]
      if title in ['Budget', 'Gross']:
        values = [int(value.replace(',','')) for value in amount.findall(line[1])]
        if values:
          business[title.lower()] = max(values)
    if business['budget'] and business['gross']:
      business['profit'] = business['gross'] - business['budget']
    return business

def _imdb_id_from_url(url):
  '''Return the 7 characters after 'tt' in an imdb.com title url, else None.'''
  prefix = 'http://www.imdb.com/title/tt'
  if url.startswith(prefix):
    return url[len(prefix):len(prefix) + 7]
  return None

def _fetch_imdb_search(url):
  '''Request url with utils.DEFAULT_HEADERS; return (data, url of response).'''
  req = urllib2.Request(url, None, utils.DEFAULT_HEADERS)
  u = urllib2.urlopen(req)
  try:
    return u.read(), u.url
  finally:
    # close the response even if reading fails
    u.close()

def guess(title, director=''):
  '''
    Guess the imdb id for title, optionally helped by the director name.

    title is cut at the first '-', '(' and '.' before searching. The
    sources are tried in this order: the first google hit on imdb.com, the
    imdb.com search (response url, then the first 'Popular Results' entry)
    and the imdb.com aka search. Returns the id as a string or None.
  '''
  #FIXME: proper file -> title
  title = title.split('-')[0]
  title = title.split('(')[0]
  title = title.split('.')[0]
  title = title.strip()
  # Only unicode needs encoding; byte strings (e.g. from sys.argv) are
  # quoted as they are, since encode() on them fails for non-ASCII bytes.
  if isinstance(title, type(u'')):
    query = quote(title.encode('utf-8'))
  else:
    query = quote(title)

  #let's first try google
  #i.e. site:imdb.com Michael Stevens Sin
  if director:
    search = 'site:imdb.com %s "%s"' % (director, title)
  else:
    search = 'site:imdb.com "%s"' % title
  for (name, url, desc) in google(search, 1):
    imdb_id = _imdb_id_from_url(url)
    if imdb_id:
      return imdb_id

  data, return_url = _fetch_imdb_search('http://www.imdb.com/find?q=%s' % query)
  # the response url is a title page when the search led straight to one
  imdb_id = _imdb_id_from_url(return_url)
  if imdb_id:
    return imdb_id
  if data:
    imdb_id = _getTerm(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
    if imdb_id:
      return imdb_id

  data, return_url = _fetch_imdb_search('http://www.imdb.com/find?q=%s;s=tt;site=aka' % query)
  imdb_id = _imdb_id_from_url(return_url)
  if imdb_id:
    return imdb_id

  return None

def getEpisodeData(title, episode, show_url = None):
  '''
    Collect information about an episode.

    title is the name of the show, episode the key used by
    IMDb.parseEpisodes ('SxxEyy'). If show_url contains 'title/tt<digits>'
    that id is used, otherwise the show is looked up with guess(title).

    Returns dict with title, show, description and episode, plus imdb when
    the episode was found. title and description stay empty when the show
    or the episode can not be found.
  '''
  episodeData = {
    'title': u'',
    'show': title,
    'description': u'',
    'episode': episode,
  }
  imdbid = None
  if show_url:
    # the id was read from an undefined name 'link' before, which raised
    # NameError whenever show_url was given
    m = re.search(r'title/tt(\d+)', show_url)
    if m:
      imdbid = "%07d" % int(m.group(1))
  if not imdbid:
    imdbid = guess(title)
  if imdbid:
    info = IMDb(imdbid).parse()['episodes'].get(episode)
    if info:
      episodeData['title'] = info['title']
      episodeData['description'] = info['description']
      episodeData['imdb'] = info['imdb']
  return episodeData


# Command line use: print the imdb id guessed for the title given as the
# first argument.
if __name__ == '__main__':
  import sys
  if len(sys.argv) < 2:
    # exit with a usage message instead of an IndexError traceback
    sys.exit("usage: %s <title>" % sys.argv[0])
  print("imdb: %s" % (guess(sys.argv[1]),))