# -*- coding: utf-8 -*- # vi:si:et:sw=4:sts=4:ts=4 import urllib2 import urllib import re import os import time import unicodedata import ox from ox import findRe, stripTags from ox.normalize import normalizeTitle, normalizeImdbId import ox.cache from siteparser import SiteParser import google def readUrl(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None): headers = headers.copy() return ox.cache.readUrl(url, data, headers, timeout) def readUrlUnicode(url, timeout=ox.cache.cache_timeout): return ox.cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout) def getUrl(id): return "http://www.imdb.com/title/tt%s/" % id class Imdb(SiteParser): ''' >>> Imdb('0068646')['title'] u'The Godfather' >>> Imdb('0133093')['title'] u'The Matrix' ''' regex = { 'alternativeTitles': { 'page': 'releaseinfo', 're': [ 'name="akas".*?(.*?)', "td>(.*?).*?(.*?)" ], 'type': 'list' }, 'aspectratio': { 'page': 'combined', 're': 'Aspect Ratio:
([\d\.]+)', 'type': 'float', }, 'budget': { 'page': 'business', 're': [ '
Budget
\s*?\$(.*?).*?>(.*?).*?(.*?)', lambda ll: [stripTags(l) for l in ll] ], 'type': 'list' }, 'cinematographer': { 'page': 'combined', 're': [ lambda data: data.split('Series Crew')[0], 'Cinematography by(.*?)', '(.*?)' ], 'type': 'list' }, 'connections': { 'page': 'movieconnections', 're': '
(.*?)
(.*?)\n\n', 'type': 'list' }, 'country': { 'page': 'combined', 're': [ '
Country:
.*?
', #'(.*?)', #links changed to work with existing caches, just take all links '(.*?)', ], 'type': 'list' }, 'creator': { 'page': 'combined', 're': [ '
Creator.?:
.*?
(.*?)
', '(.*?)' ], 'type': 'list' }, '_director': { 'page': 'combined', 're': [ '
Director:
.*?
(.*?)
', '(.*?)' ], 'type': 'list' }, 'episodeTitle': { 'page': 'combined', 're': '
.*?(.*?)', 'type': 'string' }, 'filmingLocations': { 'page': 'locations', 're': '(.*?)', 'type': 'list' }, 'genre': { 'page': 'combined', 're': [ '
Genre:
(.*?)(.*?)' ], 'type': 'list' }, 'gross': { 'page': 'business', 're': [ '
Gross
\s*?\$(.*?)(.*?)', 'type': 'list' }, 'language': { 'page': 'combined', 're': [ '
Language:
.*?
', #'(.*?)', #links changed to work with existing caches, just take all links '(.*?)', ], 'type': 'list' }, 'summary': { 'page': 'plotsummary', 're': '
.*?

(.*?)', 'type': 'string' }, 'posterId': { 'page': 'combined', 're': '/primary-photo/media/rm(.*?)/tt', 'type': 'string' }, 'posterIds': { 'page': 'posters', 're': '/unknown-thumbnail/media/rm(.*?)/tt', 'type': 'list' }, 'producer': { 'page': 'combined', 're': [ lambda data: data.split('Series Crew')[0], 'Produced by(.*?)', '(.*?)' ], 'type': 'list' }, 'rating': { 'page': 'combined', 're': '

.*?([\d,.]+?)/10', 'type': 'float' }, 'releasedate': { 'page': 'releaseinfo', 're': '.*? ', 'type': 'date' }, 'reviews': { 'page': 'externalreviews', 're': [ '
    (.*?)
', '
  • (.*?)
  • ' ], 'type': 'list' }, 'runtime': { 'page': 'combined', 're': '
    Runtime:
    .*?([0-9]+ sec|[0-9]+ min).*?
    ', 'type': 'string' }, 'season': { 'page': 'combined', 're': [ '
    Original Air Date:
    .*?
    (.*?)
    ', '\(Season (\d+), Episode \d+\)', ], 'type': 'int' }, 'episode': { 'page': 'combined', 're': [ '
    Original Air Date:
    .*?
    (.*?)
    ', '\(Season \d+, Episode (\d+)\)', ], 'type': 'int' }, 'series': { 'page': 'combined', 're': '
    TV Series:
    .*?(TV series)', 'type': 'string' }, 'originalTitle': { 'page': 'combined', 're': '

    (.*?) ', 'type': 'string' }, 'trivia': { 'page': 'trivia', 're': '
    (.*?)([\d,]*?) votes', 'type': 'string' }, 'writer': { 'page': 'combined', 're': [ lambda data: data.split('Series Crew')[0], 'Writing credits(.*?)', '(.*?)' ], 'type': 'list' }, 'year': { 'page': 'combined', 're': '="og:title" content=".*?\((\d{4}).*?"', 'type': 'int' } } def readUrlUnicode(self, url, timeout): return readUrlUnicode(url, timeout) def __init__(self, id, timeout=-1): #use akas.imdb.com to always get original title: #http://www.imdb.com/help/show_leaf?titlelanguagedisplay self.baseUrl = "http://akas.imdb.com/title/tt%s/" % id super(Imdb, self).__init__(timeout) url = self.baseUrl + 'combined' page = self.readUrlUnicode(url, timeout=-1) if 'IMDb: Page not found' in page \ or 'The requested URL was not found on our server.' in page: return if "

    We're sorry, something went wrong.

    " in page: time.sleep(1) super(Imdb, self).__init__(0) #only list one country per alternative title def is_international_title(t): if 'script title' in t[1].lower(): return False if 'recut version' in t[1].lower(): return False if 'working title' in t[1].lower(): return False if 'complete title' in t[1].lower(): return False if 'usa (imdb display title)' in t[1].lower(): return True if t[1].lower() == 'usa': return True if 'international (english title)' in t[1].lower(): return True #fails if orignial is english... Japan (English title) #if 'english title' in t[1].lower(): return True return False ititle = filter(is_international_title, self.get('alternativeTitles', [])) if ititle: self['englishTitle'] = ititle[0][0] self['title'] = self.get('englishTitle', self['originalTitle']) for t in ('title', 'englishTitle', 'originalTitle'): if t in self and self[t].startswith('"') and self[t].endswith('"'): self[t] = self[t][1:-1] if 'alternativeTitles' in self: if len(self['alternativeTitles']) == 2 and \ isinstance(self['alternativeTitles'][0], basestring): self['alternativeTitles'] = [self['alternativeTitles']] self['alternativeTitles'] = [[t[0], t[1].split(' / ')[0].split('(')[0].strip()] for t in self['alternativeTitles']] if 'runtime' in self and self['runtime']: if 'min' in self['runtime']: base=60 else: base=1 self['runtime'] = int(findRe(self['runtime'], '([0-9]+)')) * base if 'runtime' in self and not self['runtime']: del self['runtime'] if 'votes' in self: self['votes'] = self['votes'].replace(',', '') if 'cast' in self: if isinstance(self['cast'][0], basestring): self['cast'] = [self['cast']] self['actor'] = [c[0] for c in self['cast']] self['cast'] = map(lambda x: {'actor': x[0], 'character': x[1]}, self['cast']) if 'connections' in self: cc={} if len(self['connections']) == 2 and isinstance(self['connections'][0], basestring): self['connections'] = [self['connections']] for rel, data in self['connections']: #cc[unicode(rel)] = re.compile('(.*?)').findall(data) def get_conn(c): title = c[1] if title.startswith('"') and title.endswith('"'): title = title[1:-1] return { 'id': c[0], 'title': title } cc[unicode(rel)] = map(get_conn, re.compile('(.*?)').findall(data)) self['connections'] = cc for key in ('country', 'genre'): if key in self: self[key] = filter(lambda x: x.lower() != 'home', self[key]) #0092999 if '_director' in self: if 'series' in self or 'isSeries' in self: self['creator'] = self.pop('_director') else: del self['_director'] if 'isSeries' in self: del self['isSeries'] if 'series' in self: if 'episodeTitle' in self: self['seriesTitle'] = self['title'] self['title'] = "%s (S01) %s" % (self['seriesTitle'], self['episodeTitle']) if 'episodeTitle' in self and 'season' in self and 'episode' in self: self['title'] = "%s (S%02dE%02d) %s" % ( self['seriesTitle'], self['season'], self['episode'], self['episodeTitle']) if 'director' in self: self['episodeDirector'] = self['director'] series = Imdb(self['series']) if not 'creator' in series and 'director' in series: series['creator'] = series['director'] if len(series['creator']) > 10: series['creator'] = series['director'][:1] for key in ['creator', 'country']: if key in series: self[key] = series[key] if 'year' in series: self['seriesYear'] = series['year'] if not 'year' in self: self['year'] = series['year'] if 'year' in self: self['episodeYear'] = self['year'] if 'creator' in self: self['seriesDirector'] = self['creator'] if 'originalTitle' in self: del self['originalTitle'] else: for key in ('seriesTitle', 'episodeTitle', 'season', 'episode'): if key in self: del self[key] if 'creator' in self: if 'director' in self: self['episodeDirector'] = self['director'] self['director'] = self['creator'] for key in ('actor', 'writer', 'producer', 'editor'): if key in self: self[key] = sorted(list(set(self[key])), lambda a, b: self[key].index(a) - self[key].index(b)) if 'budget' in self and 'gross' in self: self['profit'] = self['gross'] - self['budget'] if 'releasedate' in self: if isinstance(self['releasedate'], list): self['releasedate'] = min(self['releasedate']) if 'summary' in self: self['summary'] = self['summary'].split('>> getMovieIdByTitle(u'"Father Knows Best" (1954) {(#5.34)}') u'1602860' >>> getMovieIdByTitle(u'The Matrix (1999)') u'0133093' >>> getMovieIdByTitle(u'Little Egypt (1951)') u'0043748' >>> getMovieIdByTitle(u'Little Egypt (1897/I)') u'0214882' >>> getMovieIdByTitle(u'Little Egypt') None >>> getMovieIdByTitle(u'"Dexter" (2006) {Father Knows Best (#1.9)}') u'0866567' ''' params = {'s':'tt','q': title} if isinstance(title, unicode): try: params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1') except: params['q'] = params['q'].encode('utf-8') params = urllib.urlencode(params) url = "http://akas.imdb.com/find?" + params data = readUrlUnicode(url, timeout=timeout) #if search results in redirect, get id of current page r = '' results = re.compile(r).findall(data) if results: return results[0] return None def getMovieId(title, director='', year='', timeout=-1): ''' >>> getMovieId('The Matrix') u'0133093' >>> getMovieId('2 or 3 Things I Know About Her', 'Jean-Luc Godard') u'0060304' >>> getMovieId('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967') u'0060304' >>> getMovieId(u"Histoire(s) du cinema: Le controle de l'univers", 'Jean-Luc Godard') u'0179214' >>> getMovieId(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard') u'0179214' ''' imdbId = { (u'Le jour se l\xe8ve', u'Marcel Carn\xe9'): '0031514', (u'Wings', u'Larisa Shepitko'): '0061196', (u'The Ascent', u'Larisa Shepitko'): '0075404', (u'Fanny and Alexander', u'Ingmar Bergman'): '0083922', (u'Torment', u'Alf Sj\xf6berg'): '0036914', (u'Crisis', u'Ingmar Bergman'): '0038675', (u'To Joy', u'Ingmar Bergman'): '0043048', (u'Humain, trop humain', u'Louis Malle'): '0071635', (u'Place de la R\xe9publique', u'Louis Malle'): '0071999', (u'God\u2019s Country', u'Louis Malle'): '0091125', (u'Flunky, Work Hard', u'Mikio Naruse'): '0022036', (u'The Courtesans of Bombay', u'Richard Robbins') : '0163591', (u'Je tu il elle', u'Chantal Akerman') : '0071690', (u'Hotel Monterey', u'Chantal Akerman') : '0068725', (u'No Blood Relation', u'Mikio Naruse') : '023261', (u'Apart from You', u'Mikio Naruse') : '0024214', (u'Every-Night Dreams', u'Mikio Naruse') : '0024793', (u'Street Without End', u'Mikio Naruse') : '0025338', (u'Sisters of the Gion', u'Kenji Mizoguchi') : '0027672', (u'Osaka Elegy', u'Kenji Mizoguchi') : '0028021', (u'Blaise Pascal', u'Roberto Rossellini') : '0066839', (u'Japanese Girls at the Harbor', u'Hiroshi Shimizu') : '0160535', (u'The Private Life of Don Juan', u'Alexander Korda') : '0025681', (u'Last Holiday', u'Henry Cass') : '0042665', (u'A Colt Is My Passport', u'Takashi Nomura') : '0330536', (u'Androcles and the Lion', u'Chester Erskine') : '0044355', (u'Major Barbara', u'Gabriel Pascal') : '0033868', (u'Come On Children', u'Allan King') : '0269104', (u'Jimi Plays Monterey & Shake! Otis at Monterey', u'D. A. Pennebaker and Chris Hegedus') : '', (u'Martha Graham: Dance on Film', u'Nathan Kroll') : '', (u'Carmen', u'Carlos Saura'): '0085297', (u'The Story of a Cheat', u'Sacha Guitry'): '0028201', (u'Weekend', 'Andrew Haigh'): '1714210', }.get((title, director), None) if imdbId: return imdbId params = {'s':'tt','q': title} if director: params['q'] = u'"%s" %s' % (title, director) if year: params['q'] = u'"%s (%s)" %s' % (title, year, director) google_query = "site:imdb.com %s" % params['q'] if isinstance(params['q'], unicode): try: params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1') except: params['q'] = params['q'].encode('utf-8') params = urllib.urlencode(params) url = "http://akas.imdb.com/find?" + params #print url data = readUrlUnicode(url, timeout=timeout) #if search results in redirect, get id of current page r = '' results = re.compile(r).findall(data) if results: return results[0] #otherwise get first result r = '.*?>> getMoviePoster('0133093') 'http://ia.media-imdb.com/images/M/MV5BMjEzNjg1NTg2NV5BMl5BanBnXkFtZTYwNjY3MzQ5._V1._SX338_SY475_.jpg' >>> getMoviePoster('0994352') 'http://ia.media-imdb.com/images/M/MV5BMjA3NzMyMzU1MV5BMl5BanBnXkFtZTcwNjc1ODUwMg@@._V1._SX594_SY755_.jpg' ''' info = ImdbCombined(imdbId) if 'posterId' in info: url = "http://www.imdb.com/rg/action-box-title/primary-photo/media/rm%s/tt%s" % (info['posterId'], imdbId) data = readUrl(url) poster = findRe(data, 'img id="primary-img".*?src="(.*?)"') return poster elif 'series' in info: return getMoviePoster(info['series']) return '' def maxVotes(): url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc' data = ox.cache.readUrl(url) votes = max([int(v.replace(',', '')) for v in re.compile('([\d,]+)').findall(data)]) return votes def guess(title, director='', timeout=-1): return getMovieId(title, director, timeout=timeout) if __name__ == "__main__": import json print json.dumps(Imdb('0306414'), indent=2) #print json.dumps(Imdb('0133093'), indent=2)