(.*?)<(br|/div)',
lambda data: data[0]
],
'type': 'list',
},
'votes': {
'page': 'combined',
're': '
([\d,]*?) votes',
'type': 'string'
},
'writer': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Writing credits(.*?)',
'
(.*?)'
],
'type': 'list'
},
'year': {
'page': 'combined',
're': '="og:title" content="[^"]*?\((\d{4}).*?"',
'type': 'int'
},
'credits': {
'page': 'fullcredits',
're': [
lambda data: data.split('
(.*?)
.*?(
)',
lambda data: [d for d in data if d]
],
'type': 'list'
},
}
def read_url(self, url, timeout):
if not url in self._cache:
self._cache[url] = read_url(url, timeout=timeout, unicode=True)
return self._cache[url]
def __init__(self, id, timeout=-1):
# use akas.imdb.com to always get original title:
# http://www.imdb.com/help/show_leaf?titlelanguagedisplay
self.baseUrl = "http://akas.imdb.com/title/tt%s/" % id
super(Imdb, self).__init__(timeout)
url = self.baseUrl + 'combined'
page = self.read_url(url, timeout=-1)
if 'IMDb: Page not found' in page \
or 'The requested URL was not found on our server.' in page:
return
if "We're sorry, something went wrong.
" in page:
time.sleep(1)
super(Imdb, self).__init__(0)
if 'alternativeTitles' in self:
if len(self['alternativeTitles']) == 2 and \
isinstance(self['alternativeTitles'][0], string_types):
self['alternativeTitles'] = [self['alternativeTitles']]
#normalize country names
if 'country' in self:
self['country'] = [normalize_country_name(c) or c for c in self['country']]
if 'sound' in self:
self['sound'] = list(set(self['sound']))
types = {}
stop_words = [
'alternative spelling',
'alternative title',
'alternative transliteration',
'closing credits title',
'complete title',
'IMAX version',
'informal short title',
'International (Spanish title)',
'Japan (imdb display title)',
'longer version',
'new title',
'original subtitled version',
'pre-release title',
'promotional abbreviation',
'recut version',
'reissue title',
'restored version',
'script title',
'short title',
'(subtitle)',
'TV title',
'working title',
'World-wide (Spanish title)',
]
#ignore english japanese titles
#for movies that are not only from japan
if ['Japan'] != self.get('country', []):
stop_words += [
'Japan (English title)'
]
for t in self.get('alternativeTitles', []):
for type in t[0].split('/'):
type = type.strip()
stop_word = False
for key in stop_words:
if key in type:
stop_word = True
break
if not stop_word:
if not type in types:
types[type] = []
types[type].append(t[1])
titles = {}
for type in types:
for title in types[type]:
if not title in titles:
titles[title] = []
titles[title].append(type)
def select_title(type):
title = types[type][0]
count = 0
if len(types[type]) > 1:
for t in types[type]:
if len(titles[t]) > count:
count = len(titles[t])
title = t
return title
#FIXME: does work in python2.6, possible to import from __future__?
#types = {type: select_title(type) for type in types}
_types = {}
for type in types:
_types[type] = select_title(type)
types = _types
regexps = [
"^.+ \(imdb display title\) \(English title\)$",
"^USA \(imdb display title\)$",
"^International \(English title\)$",
"^International \(English title\)$",
"^UK \(imdb display title\)$",
"^International \(.+\) \(English title\)$",
"^World-wide \(English title\)$",
]
if 'Hong Kong' in self.get('country', []):
regexps += [
"Hong Kong \(English title\)"
]
english_countries = (
'USA', 'UK', 'United States', 'United Kingdom',
'Australia', 'New Zealand'
)
if not list(filter(lambda c: c in english_countries, self.get('country', []))):
regexps += [
"^[^(]+ \(English title\)$",
"^.+ \(.+\) \(English title\)$",
"^USA$",
"^UK$",
"^USA \(.+\)$",
"^UK \(.+\)$",
"^Australia \(.+\)$",
"World-wide \(English title\)",
"\(literal English title\)",
"^International \(.+ title\)$",
"^International \(.+\) \(.+ title\)$",
]
for regexp in regexps:
for type in types:
if re.compile(regexp).findall(type):
#print(types[type], type)
self['internationalTitle'] = types[type]
break
if 'internationalTitle' in self:
break
def cleanup_title(title):
if title.startswith('"') and title.endswith('"'):
title = title[1:-1]
if title.startswith("'") and title.endswith("'"):
title = title[1:-1]
title = re.sub('\(\#[.\d]+\)', '', title)
return title.strip()
for t in ('title', 'internationalTitle'):
if t in self:
self[t] = cleanup_title(self[t])
if 'internationalTitle' in self and \
self.get('title', '').lower() == self['internationalTitle'].lower():
del self['internationalTitle']
if 'alternativeTitles' in self:
alt = {}
for t in self['alternativeTitles']:
title = cleanup_title(t[1])
if title not in (self.get('title'), self.get('internationalTitle')):
if title not in alt:
alt[title] = []
for c in t[0].split('/'):
if not '(working title)' in c:
c = c.replace('International', '').replace('World-wide', '').split('(')[0].strip()
if c:
alt[title].append(c)
self['alternativeTitles'] = []
for t in sorted(alt, key=lambda a: sorted(alt[a])):
countries = sorted([normalize_country_name(c) or c for c in alt[t]])
self['alternativeTitles'].append((t, countries))
if not self['alternativeTitles']:
del self['alternativeTitles']
if 'internationalTitle' in self:
self['originalTitle'] = self['title']
self['title'] = self.pop('internationalTitle')
if 'runtime' in self and self['runtime']:
if 'min' in self['runtime']: base=60
else: base=1
self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
if 'runtime' in self and not self['runtime']:
del self['runtime']
if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
if 'cast' in self:
if isinstance(self['cast'][0], string_types):
self['cast'] = [self['cast']]
self['actor'] = [c[0] for c in self['cast']]
def cleanup_character(c):
c = c.replace('(uncredited)', '').strip()
return c
self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
for x in self['cast']]
if 'connections' in self:
cc={}
if len(self['connections']) == 3 and isinstance(self['connections'][0], string_types):
self['connections'] = [self['connections']]
for rel, data, _ in self['connections']:
if isinstance(rel, bytes):
rel = rel.decode('utf-8')
#cc[rel] = re.compile('(.*?)').findall(data)
def get_conn(c):
r = {
'id': c[0],
'title': cleanup_title(c[1]),
}
description = c[2].split('
')
if len(description) == 2 and description[-1].strip() != '-':
r['description'] = description[-1].strip()
return r
cc[rel] = list(map(get_conn, re.compile('(.*?)(.*?)<\/div', re.DOTALL).findall(data)))
self['connections'] = cc
for key in ('country', 'genre'):
if key in self:
self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
#0092999
if '_director' in self:
if 'series' in self or 'isSeries' in self:
self['creator'] = self.pop('_director')
else:
del self['_director']
if 'isSeries' in self:
del self['isSeries']
self['isSeries'] = True
if 'episodeTitle' in self:
self['episodeTitle'] = re.sub('Episode \#\d+\.\d+', '', self['episodeTitle'])
if 'series' in self:
series = Imdb(self['series'], timeout=timeout)
self['seriesTitle'] = series['title']
if 'episodeTitle' in self:
self['seriesTitle'] = series['title']
if 'season' in self and 'episode' in self:
self['title'] = "%s (S%02dE%02d) %s" % (
self['seriesTitle'], self['season'], self['episode'], self['episodeTitle'])
else:
self['title'] = "%s (S01) %s" % (self['seriesTitle'], self['episodeTitle'])
self['season'] = 1
self['title'] = self['title'].strip()
if 'director' in self:
self['episodeDirector'] = self['director']
if 'creator' not in series and 'director' in series:
series['creator'] = series['director']
if len(series['creator']) > 10:
series['creator'] = series['director'][:1]
for key in ['creator', 'country']:
if key in series:
self[key] = series[key]
if 'year' in series:
self['seriesYear'] = series['year']
if not 'year' in self:
self['year'] = series['year']
if 'year' in self:
self['episodeYear'] = self['year']
if 'creator' in self:
self['seriesDirector'] = self['creator']
if 'originalTitle' in self:
del self['originalTitle']
else:
for key in ('seriesTitle', 'episodeTitle', 'season', 'episode'):
if key in self:
del self[key]
if 'creator' in self:
if 'director' in self:
self['episodeDirector'] = self['director']
self['director'] = self['creator']
#make lists unique but keep order
for key in ('director', 'language'):
if key in self:
self[key] = [x for i,x in enumerate(self[key])
if x not in self[key][i+1:]]
for key in ('actor', 'writer', 'producer', 'editor', 'composer'):
if key in self:
if isinstance(self[key][0], list):
self[key] = [i[0] for i in self[key] if i]
self[key] = sorted(list(set(self[key])), key=lambda a: self[key].index(a))
if 'budget' in self and 'gross' in self:
self['profit'] = self['gross'] - self['budget']
if 'releasedate' in self:
def parse_date(d):
try:
d = datetime.strptime(d, '%d %B %Y')
except:
try:
d = datetime.strptime(d, '%B %Y')
except:
return 'x'
return '%d-%02d-%02d' % (d.year, d.month, d.day)
self['releasedate'] = min([
parse_date(d) for d in self['releasedate']
])
if self['releasedate'] == 'x':
del self['releasedate']
if 'summary' in self:
if isinstance(self['summary'], list):
self['summary'] = self['summary'][0]
self['summary'] = self['summary'].split('(.*?).*?(.*?) | .*?(.*?) | ', re.DOTALL).findall(d[1])
]
] for d in self['credits'] if d
]
credits = [c for c in credits if c[1]]
self['credits'] = []
for department, crew in credits:
department = department.replace('(in alphabetical order)', '').strip()
for c in crew:
self['credits'].append({
'name': c[0],
'roles': c[1],
'deparment': department
})
if not self['credits']:
del self['credits']
class ImdbCombined(Imdb):
def __init__(self, id, timeout=-1):
_regex = {}
for key in self.regex:
if self.regex[key]['page'] in ('combined', 'releaseinfo'):
_regex[key] = self.regex[key]
self.regex = _regex
super(ImdbCombined, self).__init__(id, timeout)
def get_movie_by_title(title, timeout=-1):
'''
This only works for exact title matches from the data dump
Usually in the format
Title (Year)
"Series Title" (Year) {(#Season.Episode)}
"Series Title" (Year) {Episode Title (#Season.Episode)}
If there is more than one film with that title for the year
Title (Year/I)
>>> str(get_movie_by_title(u'"Father Knows Best" (1954) {(#5.34)}'))
'1602860'
>>> str(get_movie_by_title(u'The Matrix (1999)'))
'0133093'
>>> str(get_movie_by_title(u'Little Egypt (1951)'))
'0043748'
>>> str(get_movie_by_title(u'Little Egypt (1897/I)'))
'0214882'
>>> get_movie_by_title(u'Little Egypt')
None
>>> str(get_movie_by_title(u'"Dexter" (2006) {Father Knows Best (#1.9)}'))
'0866567'
'''
params = {'s': 'tt', 'q': title}
if not isinstance(title, bytes):
try:
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
except:
params['q'] = params['q'].encode('utf-8')
params = urlencode(params)
url = "http://akas.imdb.com/find?" + params
data = read_url(url, timeout=timeout, unicode=True)
#if search results in redirect, get id of current page
r = ''
results = re.compile(r).findall(data)
if results:
return results[0]
return None
def get_movie_id(title, director='', year='', timeout=-1):
'''
>>> str(get_movie_id('The Matrix'))
'0133093'
>>> str(get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard'))
'0060304'
>>> str(get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967'))
'0060304'
>>> str(get_movie_id(u"Histoire(s) du cinema: Le controle de l'univers", u'Jean-Luc Godard'))
'0179214'
>>> str(get_movie_id(u"Histoire(s) du cinéma: Le contrôle de l'univers", u'Jean-Luc Godard'))
'0179214'
'''
imdbId = {
(u'Le jour se l\xe8ve', u'Marcel Carn\xe9'): '0031514',
(u'Wings', u'Larisa Shepitko'): '0061196',
(u'The Ascent', u'Larisa Shepitko'): '0075404',
(u'Fanny and Alexander', u'Ingmar Bergman'): '0083922',
(u'Torment', u'Alf Sj\xf6berg'): '0036914',
(u'Crisis', u'Ingmar Bergman'): '0038675',
(u'To Joy', u'Ingmar Bergman'): '0043048',
(u'Humain, trop humain', u'Louis Malle'): '0071635',
(u'Place de la R\xe9publique', u'Louis Malle'): '0071999',
(u'God\u2019s Country', u'Louis Malle'): '0091125',
(u'Flunky, Work Hard', u'Mikio Naruse'): '0022036',
(u'The Courtesans of Bombay', u'Richard Robbins') : '0163591',
(u'Je tu il elle', u'Chantal Akerman') : '0071690',
(u'Hotel Monterey', u'Chantal Akerman') : '0068725',
(u'No Blood Relation', u'Mikio Naruse') : '023261',
(u'Apart from You', u'Mikio Naruse') : '0024214',
(u'Every-Night Dreams', u'Mikio Naruse') : '0024793',
(u'Street Without End', u'Mikio Naruse') : '0025338',
(u'Sisters of the Gion', u'Kenji Mizoguchi') : '0027672',
(u'Osaka Elegy', u'Kenji Mizoguchi') : '0028021',
(u'Blaise Pascal', u'Roberto Rossellini') : '0066839',
(u'Japanese Girls at the Harbor', u'Hiroshi Shimizu') : '0160535',
(u'The Private Life of Don Juan', u'Alexander Korda') : '0025681',
(u'Last Holiday', u'Henry Cass') : '0042665',
(u'A Colt Is My Passport', u'Takashi Nomura') : '0330536',
(u'Androcles and the Lion', u'Chester Erskine') : '0044355',
(u'Major Barbara', u'Gabriel Pascal') : '0033868',
(u'Come On Children', u'Allan King') : '0269104',
(u'Jimi Plays Monterey & Shake! Otis at Monterey', u'D. A. Pennebaker and Chris Hegedus') : '',
(u'Martha Graham: Dance on Film', u'Nathan Kroll') : '',
(u'Carmen', u'Carlos Saura'): '0085297',
(u'The Story of a Cheat', u'Sacha Guitry'): '0028201',
(u'Weekend', 'Andrew Haigh'): '1714210',
}.get((title, director), None)
if imdbId:
return imdbId
params = {'s': 'tt', 'q': title}
if director:
params['q'] = u'"%s" %s' % (title, director)
if year:
params['q'] = u'"%s (%s)" %s' % (title, year, director)
google_query = "site:imdb.com %s" % params['q']
if not isinstance(params['q'], bytes):
try:
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
except:
params['q'] = params['q'].encode('utf-8')
params = urlencode(params)
url = "http://akas.imdb.com/find?" + params
#print url
data = read_url(url, timeout=timeout, unicode=True)
#if search results in redirect, get id of current page
r = ''
results = re.compile(r).findall(data)
if results:
return results[0]
#otherwise get first result
r = '.*?>> get_movie_poster('0133093')
'http://ia.media-imdb.com/images/M/MV5BMjEzNjg1NTg2NV5BMl5BanBnXkFtZTYwNjY3MzQ5._V1._SX338_SY475_.jpg'
'''
info = ImdbCombined(imdbId)
if 'posterId' in info:
poster = info['posterId']
if '@._V' in poster:
poster = poster.split('@._V')[0] + '@.jpg'
return poster
elif 'series' in info:
return get_movie_poster(info['series'])
return ''
def get_episodes(imdbId, season=None):
episodes = {}
url = 'http://www.imdb.com/title/tt%s/episodes' % imdbId
if season:
url += '?season=%d' % season
data = cache.read_url(url)
for e in re.compile('.*? S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
episodes['S%02dE%02d' %(int(e[1]), int(e[2]))] = e[0]
else:
data = cache.read_url(url)
match = re.compile('Season (\d+)').findall(data)
if match:
for season in range(1, int(match[0]) + 1):
episodes.update(get_episodes(imdbId, season))
return episodes
def max_votes():
url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
data = cache.read_url(url).decode('utf-8', 'ignore')
votes = max([
int(v.replace(',', ''))
for v in re.compile(' |