# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
#
# NOTE(review): this file reached review with its line breaks and indentation
# collapsed, and several string literals appear to have lost text (empty
# patterns, bare '(.*?)' fragments, raw line breaks inside single-quoted
# strings). The layout below is reconstructed from the syntax only; every spot
# that looks damaged is flagged. Diff against the upstream copy before relying
# on any pattern or on the nesting chosen here.
import urllib2
import urllib
import re
import os
import time

import ox
from ox import findRe, stripTags
from ox.normalize import normalizeTitle, normalizeImdbId
import ox.cache
from ox.cache import readUrl

from siteparser import SiteParser
import google


# Module-level fetch helper. It rebinds the name `readUrl` imported from
# ox.cache just above, and delegates to ox.cache.readUrl with a *copy* of
# `headers` so the shared default mapping is not handed out directly.
# `valid` is accepted but is not forwarded to ox.cache.readUrl.
def readUrl(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None):
    headers = headers.copy()
    return ox.cache.readUrl(url, data, headers, timeout)

# Delegates to ox.cache.readUrlUnicode, passing this module's readUrl as the
# `_readUrl` argument.
def readUrlUnicode(url, timeout=ox.cache.cache_timeout):
    return ox.cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)

# Dict-like record for one IMDb title, addressed by id. The class supplies a
# `regex` table and post-processes the resulting keys in __init__; how the
# table is evaluated is up to SiteParser, which is not visible here.
class Imdb(SiteParser):
    '''
    >>> Imdb('0068646')['title']
    u'The Godfather'

    >>> Imdb('0133093')['title']
    u'The Matrix'
    '''
    # One entry per result key. Each entry carries:
    #   'page' - name of the sub-page the entry refers to
    #   're'   - a pattern string, or a list mixing pattern strings and callables
    #   'type' - a result type name ('list', 'string', 'int', 'float', 'date')
    # presumably consumed by SiteParser -- TODO confirm the exact semantics there.
    #
    # NOTE(review): many patterns below contain raw line breaks inside
    # single-quoted literals or consist only of generic '(.*?)' groups; this
    # looks like text was stripped from the original patterns. They are
    # reproduced exactly as received and need to be restored from upstream.
    regex = {
        'alternative_titles': {
            'page': 'releaseinfo',
            're': [
                'name="akas".*?(.*?)',
                "td>(.*?).*?(.*?)"
            ],
            'type': 'list'
        },
        'cast': {
            'page': 'combined',
            're': [
                '.*?>(.*?).*?(.*?)',
                # final step: strip tags from every captured element
                lambda ll: [stripTags(l) for l in ll]
            ],
            'type': 'list'
        },
        'cinematographers': {
            'page': 'combined',
            're': [
                # first step: keep only the text before 'Series Crew'
                lambda data: data.split('Series Crew')[0],
                'Cinematography by(.*?)',
                '(.*?)'
            ],
            'type': 'list'
        },
        'connections': {
            'page': 'movieconnections',
            're': '
(.*?)
(.*?)\n\n',
            'type': 'list'
        },
        'countries': {
            'page': 'combined',
            're': [
                '
Country:
.*?
',
                #'(.*?)', #links changed to work with existing caches, just take all links
                '(.*?)',
            ],
            'type': 'list'
        },
        'creators': {
            'page': 'combined',
            're': [
                '
Creators:
.*?
(.*?)
',
                '(.*?)'
            ],
            'type': 'list'
        },
        # NOTE(review): the 're' list of this entry is never closed and is
        # followed directly by a 'type' key. Text between the two appears to be
        # missing -- presumably the end of 'editors' plus one or more further
        # entries (__init__ reads 'original_title' and 'episode_title', which
        # are not defined anywhere in the visible table). Restore from upstream.
        'editors': {
            'page': 'combined',
            're': [
                lambda data: data.split('Series Crew')[0],
                'Film Editing by(.*?)',
                '.*?(.*?)',
            'type': 'string'
        },
        'filming_locations': {
            'page': 'locations',
            're': '(.*?)',
            'type': 'list'
        },
        'genres': {
            'page': 'combined',
            're': '(.*?)',
            'type': 'list'
        },
        'keywords': {
            'page': 'keywords',
            're': '(.*?)',
            'type': 'list'
        },
        'languages': {
            'page': 'combined',
            're': [
                '
Language:
.*?
',
                #'(.*?)', #links changed to work with existing caches, just take all links
                '(.*?)',
            ],
            'type': 'list'
        },
        'plot': {
            'page': 'plotsummary',
            're': '
.*?

(.*?)',
            'type': 'string'
        },
        'poster_id': {
            'page': 'combined',
            're': '/primary-photo/media/rm(.*?)/tt',
            'type': 'string'
        },
        'poster_ids': {
            'page': 'posters',
            're': '/unknown-thumbnail/media/rm(.*?)/tt',
            'type': 'list'
        },
        'producers': {
            'page': 'combined',
            're': [
                lambda data: data.split('Series Crew')[0],
                'Produced by(.*?)',
                '(.*?)'
            ],
            'type': 'list'
        },
        'rating': {
            'page': 'combined',
            're': '

.*?([\d,.]+?)/10',
            'type': 'float'
        },
        # NOTE(review): this pattern has no capture group as received.
        'release date': {
            'page': 'releaseinfo',
            're': '.*? ',
            'type': 'date'
        },
        'reviews': {
            'page': 'externalreviews',
            're': [
                '
    (.*?)
',
                '
  • (.*?)
  • '
            ],
            'type': 'list'
        },
        # Captures '<digits> sec' or '<digits> min'; __init__ converts it to an int.
        'runtime': {
            'page': 'combined',
            're': '
    Runtime:
    .*?([0-9]+ sec|[0-9]+ min).*?
    ',
            'type': 'string'
        },
        # 'season' and 'episode' share the same first step and differ only in
        # which number of '(Season N, Episode M)' the second pattern captures.
        'season': {
            'page': 'combined',
            're': [
                '
    Original Air Date:
    .*?
    (.*?)
    ',
                '\(Season (\d+), Episode \d+\)',
            ],
            'type': 'int'
        },
        'episode': {
            'page': 'combined',
            're': [
                '
    Original Air Date:
    .*?
    (.*?)
    ',
                '\(Season \d+, Episode (\d+)\)',
            ],
            'type': 'int'
        },
        'series': {
            'page': 'combined',
            're': '
    TV Series:
    .*?(.*?)
    ',
            'type': 'list',
        },
        # Digits and commas before ' votes'; __init__ removes the commas.
        'votes': {
            'page': 'combined',
            're': '
    ([\d,]*?) votes',
            'type': 'string'
        },
        'writers': {
            'page': 'combined',
            're': [
                lambda data: data.split('Series Crew')[0],
                'Writing credits(.*?)',
                '(.*?)'
            ],
            'type': 'list'
        },
        # Four-digit year in parentheses inside the og:title content attribute.
        'year': {
            'page': 'combined',
            're': '="og:title" content=".*?\((\d{4})\).*?"',
            'type': 'int'
        }
    }

    # Method form of the module-level readUrlUnicode (same name, resolved at
    # module scope inside the body). Presumably the fetch hook SiteParser
    # calls -- TODO confirm.
    def readUrlUnicode(self, url, timeout):
        return readUrlUnicode(url, timeout)

    # Builds the record for title `id` (inserted after 'tt' in the URL) and
    # then normalises the keys in place. `timeout` is passed through to
    # SiteParser.__init__ unchanged.
    def __init__(self, id, timeout=-1):
        #use akas.imdb.com to always get original title:
        #http://www.imdb.com/help/show_leaf?titlelanguagedisplay
        self.baseUrl = "http://akas.imdb.com/title/tt%s/" % id
        # The code below reads keys from self right after this call, so the
        # parent constructor is presumably what fills them in -- TODO confirm.
        super(Imdb, self).__init__(timeout)

        # Predicate over one alternative_titles item; t[1] is tested as a
        # label: working/complete titles are rejected, a label equal to 'usa'
        # or containing 'international' is accepted, anything else rejected.
        def is_international_title(t):
            if 'working title' in t[1].lower():
                return False
            if 'complete title' in t[1].lower():
                return False
            if t[1].lower() =='usa':
                return True
            if 'international' in t[1].lower():
                return True
            return False
        # Indexing the result below relies on filter() returning a list
        # (Python 2 behaviour).
        ititle = filter(is_international_title, self.get('alternative_titles', []))
        if ititle:
            self['english_title'] = ititle[0][0]

        # Prefer the english title. self['original_title'] is evaluated
        # eagerly, so it must exist even when 'english_title' is set; it is
        # not defined in the visible part of the regex table (see the note on
        # the 'editors' entry).
        self['title'] = self.get('english_title', self['original_title'])
        # Drop one pair of surrounding double quotes from the title.
        if 'title' in self and self['title'].startswith('"') and self['title'].endswith('"'):
            self['title'] = self['title'][1:-1]
        # Turn '<n> min' / '<n> sec' into an integer number of seconds.
        if 'runtime' in self and self['runtime']:
            if 'min' in self['runtime']:
                base=60
            else:
                base=1
            self['runtime'] = int(findRe(self['runtime'], '([0-9]+)')) * base
        # An empty runtime is removed rather than kept as a falsy value.
        if 'runtime' in self and not self['runtime']:
            del self['runtime']
        if 'votes' in self:
            self['votes'] = self['votes'].replace(',', '')

        # Rebuild 'connections' as a dict keyed by unicode(rel).
        if 'connections' in self:
            cc={}
            # Two items whose first is a string are treated as a single
            # (rel, data) pair and wrapped so the loop below sees one pair.
            if len(self['connections']) == 2 and isinstance(self['connections'][0], basestring):
                self['connections'] = [self['connections']]
            for rel, data in self['connections']:
                # NOTE(review): the pattern is empty as received; an empty
                # pattern matches the empty string at every position, so the
                # original pattern text appears to be lost. Restore from upstream.
                cc[unicode(rel)] = re.compile('').findall(data)
            self['connections'] = cc

        # Drop entries equal to 'home' (case-insensitive) from these lists.
        for key in ('countries', 'genres'):
            if key in self:
                self[key] = filter(lambda x: x.lower() != 'home', self[key])

        # 'creators' is exposed under the key 'directors'.
        if 'creators' in self:
            self['directors'] = self['creators']
            del self['creators']

        if 'series' in self:
            if 'episode_title' in self:
                # The title obtained so far is kept as the series title and
                # the combined form becomes the new title.
                self['series_title'] = self['title']
                self['title'] = "%s: %s" % (self['series_title'], self['episode_title'])
            if 'episode_title' in self and 'season' in self and 'episode' in self:
                self['title'] = "%s (S%02dE%02d) %s" % (
                    self['series_title'], self['season'], self['episode'], self['episode_title'])
            # Keep the episode's own values under 'episode_<key>' ...
            for key in ('directors', 'year'):
                if key in self:
                    self['episode_%s'%key] = self[key]
            # ... then build a second Imdb record for the series and let its
            # values replace 'directors' and 'year'.
            series = Imdb(self['series'])
            for key in ['directors', 'year']:
                if key in series:
                    self[key] =series[key]
        else:
            # Without a series, episode-only keys are removed.
            # NOTE(review): pairing this else with "if 'series' in self" is
            # inferred from the collapsed source -- confirm against upstream.
            for key in ('series_title', 'episode_title', 'season', 'episode'):
                if key in self:
                    del self[key]

# Variant of Imdb that keeps only the regex entries whose 'page' is 'combined'.
# The filtered table is stored on the instance before Imdb.__init__ runs, so
# the class-level Imdb.regex is left untouched.
class ImdbCombined(Imdb):
    def __init__(self, id, timeout=-1):
        _regex = {}
        for key in self.regex:
            if self.regex[key]['page'] == 'combined':
                _regex[key] = self.regex[key]
        self.regex = _regex
        super(ImdbCombined, self).__init__(id, timeout)

# Looks up an id for a title, optionally narrowed by director and year.
# A hard-coded (title, director) table is consulted first; otherwise a query
# is sent to akas.imdb.com/find and the response is searched with patterns.
def getMovieId(title, director='', year='', timeout=-1):
    '''
    >>> getMovieId('The Matrix')
    u'0133093'

    >>> getMovieId('2 or 3 Things I Know About Her', 'Jean-Luc Godard')
    u'0060304'

    >>> getMovieId('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967')
    u'0060304'

    >>> getMovieId(u"Histoire(s) du cinema: Le controle de l'univers", 'Jean-Luc Godard')
    u'0179214'

    >>> getMovieId(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard')
    u'0179214'
    '''
    #print (title, director)
    # Fixed answers for specific (title, director) pairs; a hit returns
    # immediately without any request.
    imdbId = {
        (u'Le jour se l\xe8ve', u'Marcel Carn\xe9'): '0031514',
        (u'Wings', u'Larisa Shepitko'): '0061196',
        (u'The Ascent', u'Larisa Shepitko'): '0075404',
        (u'Fanny and Alexander', u'Ingmar Bergman'): '0083922',
        (u'Torment', u'Alf Sj\xf6berg'): '0036914',
        (u'Crisis', u'Ingmar Bergman'): '0038675',
        (u'To Joy', u'Ingmar Bergman'): '0043048',
        (u'Humain, trop humain', u'Louis Malle'): '0071635',
        (u'Place de la R\xe9publique', u'Louis Malle'): '0071999',
        (u'God\u2019s Country', u'Louis Malle'): '0091125',
    }.get((title, director), None)
    if imdbId:
        return imdbId
    params = {'s':'tt','q': title}
    if director:
        params['q'] = u'"%s" %s' % (title, director)
    # NOTE(review): whether this test is nested under "if director" cannot be
    # told from the collapsed source; it is laid out at the same level here.
    if year:
        params['q'] = u'"%s (%s)" %s' % (title, year, director)
    # Built from the query before it is utf-8 encoded. Not used in the part of
    # the function that survived (see the note further down).
    google_query = "site:imdb.com %s" % params['q']
    params['q'] = params['q'].encode('utf-8')
    params = urllib.urlencode(params)
    url = "http://akas.imdb.com/find?" + params
    #print url

    data = readUrlUnicode(url, timeout=timeout)
    #if search results in redirect, get id of current page
    # NOTE(review): empty pattern as received -- the original text appears to
    # be lost. With an empty pattern findall() yields empty-string matches, so
    # `results` is non-empty and '' is returned. Restore from upstream.
    r = ''
    results = re.compile(r).findall(data)
    if results:
        return results[0]
    #otherwise get first result
    # NOTE(review): the next line is damaged. The pattern literal runs straight
    # into what look like the doctests of getMoviePoster, so the remainder of
    # getMovieId and the "def getMoviePoster(imdbId):" header appear to be
    # missing. The statements after it read `imdbId` and call getMoviePoster
    # recursively, i.e. they presumably form the body of that function.
    # Reproduced exactly as received; restore from upstream.
    r = '.*?>> getMoviePoster('0133093') 'http://ia.media-imdb.com/images/M/MV5BMjEzNjg1NTg2NV5BMl5BanBnXkFtZTYwNjY3MzQ5._V1._SX338_SY475_.jpg' >>> getMoviePoster('0994352') 'http://ia.media-imdb.com/images/M/MV5BMjA3NzMyMzU1MV5BMl5BanBnXkFtZTcwNjc1ODUwMg@@._V1._SX594_SY755_.jpg' '''
    info = ImdbCombined(imdbId)
    if 'poster_id' in info:
        # Fetch the primary-photo page and take the src of the element with
        # id "primary-img".
        url = "http://www.imdb.com/rg/action-box-title/primary-photo/media/rm%s/tt%s" % (info['poster_id'], imdbId)
        data = readUrl(url)
        poster = findRe(data, 'img id="primary-img".*?src="(.*?)"')
        return poster
    elif 'series' in info:
        # No poster of its own: retry with the value stored under 'series'.
        return getMoviePoster(info['series'])
    return ''

# Thin wrapper: getMovieId without a year.
def guess(title, director='', timeout=-1):
    return getMovieId(title, director, timeout=timeout)

if __name__ == "__main__":
    import json
    # Python 2 print statement: dumps one record as indented JSON.
    print json.dumps(Imdb('0306414'), indent=2)
    #print json.dumps(Imdb('0133093'), indent=2)