cleanup imdb data

This commit is contained in:
j 2011-10-15 16:54:09 +02:00
parent 0bac09ba51
commit e756062749

View file

@ -32,7 +32,7 @@ class Imdb(SiteParser):
u'The Matrix' u'The Matrix'
''' '''
regex = { regex = {
'alternative_titles': { 'alternativeTitles': {
'page': 'releaseinfo', 'page': 'releaseinfo',
're': [ 're': [
'name="akas".*?<table.*?>(.*?)</table>', 'name="akas".*?<table.*?>(.*?)</table>',
@ -41,7 +41,7 @@ class Imdb(SiteParser):
'type': 'list' 'type': 'list'
}, },
'aspectratio': { 'aspectRatio': {
'page': 'combined', 'page': 'combined',
're': 'Aspect Ratio:</h5><div class="info-content">([\d\.]+)', 're': 'Aspect Ratio:</h5><div class="info-content">([\d\.]+)',
'type': 'float', 'type': 'float',
@ -62,7 +62,7 @@ class Imdb(SiteParser):
], ],
'type': 'list' 'type': 'list'
}, },
'cinematographers': { 'cinematographer': {
'page': 'combined', 'page': 'combined',
're': [ 're': [
lambda data: data.split('Series Crew')[0], lambda data: data.split('Series Crew')[0],
@ -76,7 +76,7 @@ class Imdb(SiteParser):
're': '<h5>(.*?)</h5>(.*?)\n\n', 're': '<h5>(.*?)</h5>(.*?)\n\n',
'type': 'list' 'type': 'list'
}, },
'countries': { 'country': {
'page': 'combined', 'page': 'combined',
're': [ 're': [
'<div class="info"><h5>Country:</h5>.*?<div class="info">', '<div class="info"><h5>Country:</h5>.*?<div class="info">',
@ -85,7 +85,7 @@ class Imdb(SiteParser):
], ],
'type': 'list' 'type': 'list'
}, },
'creators': { 'creator': {
'page': 'combined', 'page': 'combined',
're': [ 're': [
'<h5>Creators:</h5>.*?<div class="info-content">(.*?)</div>', '<h5>Creators:</h5>.*?<div class="info-content">(.*?)</div>',
@ -93,7 +93,7 @@ class Imdb(SiteParser):
], ],
'type': 'list' 'type': 'list'
}, },
'directors': { 'director': {
'page': 'combined', 'page': 'combined',
're': [ 're': [
lambda data: data.split('Series Crew')[0], lambda data: data.split('Series Crew')[0],
@ -102,7 +102,7 @@ class Imdb(SiteParser):
], ],
'type': 'list' 'type': 'list'
}, },
'editors': { 'editor': {
'page': 'combined', 'page': 'combined',
're': [ 're': [
lambda data: data.split('Series Crew')[0], lambda data: data.split('Series Crew')[0],
@ -111,17 +111,17 @@ class Imdb(SiteParser):
], ],
'type': 'list' 'type': 'list'
}, },
'episode_title': { 'episodeTitle': {
'page': 'combined', 'page': 'combined',
're': '<div id="tn15title">.*?<em>(.*?)</em>', 're': '<div id="tn15title">.*?<em>(.*?)</em>',
'type': 'string' 'type': 'string'
}, },
'filming_locations': { 'filmingLocations': {
'page': 'locations', 'page': 'locations',
're': '<a href="/search/title\?locations=.*?">(.*?)</a>', 're': '<a href="/search/title\?locations=.*?">(.*?)</a>',
'type': 'list' 'type': 'list'
}, },
'genres': { 'genre': {
'page': 'combined', 'page': 'combined',
're': '<a href="/Sections/Genres/.*?/">(.*?)</a>', 're': '<a href="/Sections/Genres/.*?/">(.*?)</a>',
'type': 'list' 'type': 'list'
@ -139,7 +139,7 @@ class Imdb(SiteParser):
're': '<a href="/keyword/.*?/">(.*?)</a>', 're': '<a href="/keyword/.*?/">(.*?)</a>',
'type': 'list' 'type': 'list'
}, },
'languages': { 'language': {
'page': 'combined', 'page': 'combined',
're': [ 're': [
'<div class="info"><h5>Language:</h5>.*?<div class="info">', '<div class="info"><h5>Language:</h5>.*?<div class="info">',
@ -148,22 +148,22 @@ class Imdb(SiteParser):
], ],
'type': 'list' 'type': 'list'
}, },
'plot': { 'summary': {
'page': 'plotsummary', 'page': 'plotsummary',
're': '</div>.*?<p class="plotpar">(.*?)<i>', 're': '</div>.*?<p class="plotpar">(.*?)<i>',
'type': 'string' 'type': 'string'
}, },
'poster_id': { 'posterId': {
'page': 'combined', 'page': 'combined',
're': '/primary-photo/media/rm(.*?)/tt', 're': '/primary-photo/media/rm(.*?)/tt',
'type': 'string' 'type': 'string'
}, },
'poster_ids': { 'posterIds': {
'page': 'posters', 'page': 'posters',
're': '/unknown-thumbnail/media/rm(.*?)/tt', 're': '/unknown-thumbnail/media/rm(.*?)/tt',
'type': 'list' 'type': 'list'
}, },
'producers': { 'producer': {
'page': 'combined', 'page': 'combined',
're': [ 're': [
lambda data: data.split('Series Crew')[0], lambda data: data.split('Series Crew')[0],
@ -177,7 +177,7 @@ class Imdb(SiteParser):
're': '<div class="starbar-meta">.*?<b>([\d,.]+?)/10</b>', 're': '<div class="starbar-meta">.*?<b>([\d,.]+?)/10</b>',
'type': 'float' 'type': 'float'
}, },
'release date': { 'releaseDate': {
'page': 'releaseinfo', 'page': 'releaseinfo',
're': '<a href="/date/(\d{2})-(\d{2})/">.*?</a> <a href="/year/(\d{4})/">', 're': '<a href="/date/(\d{2})-(\d{2})/">.*?</a> <a href="/year/(\d{4})/">',
'type': 'date' 'type': 'date'
@ -216,7 +216,7 @@ class Imdb(SiteParser):
're': '<h5>TV Series:</h5>.*?<a href="/title/tt(\d{7})', 're': '<h5>TV Series:</h5>.*?<a href="/title/tt(\d{7})',
'type': 'string' 'type': 'string'
}, },
'original_title': { 'originalTitle': {
'page': 'combined', 'page': 'combined',
're': '<h1>(.*?) <span>', 're': '<h1>(.*?) <span>',
'type': 'string' 'type': 'string'
@ -231,7 +231,7 @@ class Imdb(SiteParser):
're': '<a href="ratings" class="tn15more">([\d,]*?) votes</a>', 're': '<a href="ratings" class="tn15more">([\d,]*?) votes</a>',
'type': 'string' 'type': 'string'
}, },
'writers': { 'writer': {
'page': 'combined', 'page': 'combined',
're': [ 're': [
lambda data: data.split('Series Crew')[0], lambda data: data.split('Series Crew')[0],
@ -274,23 +274,23 @@ class Imdb(SiteParser):
#fails if original is english... Japan (English title) #fails if original is english... Japan (English title)
#if 'english title' in t[1].lower(): return True #if 'english title' in t[1].lower(): return True
return False return False
ititle = filter(is_international_title, self.get('alternative_titles', [])) ititle = filter(is_international_title, self.get('alternativeTitles', []))
if ititle: if ititle:
self['english_title'] = ititle[0][0] self['englishTitle'] = ititle[0][0]
self['title'] = self.get('english_title', self['original_title']) self['title'] = self.get('englishTitle', self['originalTitle'])
for t in ('title', 'english_title', 'original_title'): for t in ('title', 'englishTitle', 'originalTitle'):
if t in self and self[t].startswith('"') and self[t].endswith('"'): if t in self and self[t].startswith('"') and self[t].endswith('"'):
self[t] = self[t][1:-1] self[t] = self[t][1:-1]
if 'alternative_titles' in self: if 'alternativeTitles' in self:
if len(self['alternative_titles']) == 2 and \ if len(self['alternativeTitles']) == 2 and \
isinstance(self['alternative_titles'][0], basestring): isinstance(self['alternativeTitles'][0], basestring):
self['alternative_titles'] = [self['alternative_titles']] self['alternativeTitles'] = [self['alternativeTitles']]
self['alternative_titles'] = [[t[0], self['alternativeTitles'] = [[t[0],
t[1].split(' / ')[0].split('(')[0].strip()] t[1].split(' / ')[0].split('(')[0].strip()]
for t in self['alternative_titles']] for t in self['alternativeTitles']]
if 'runtime' in self and self['runtime']: if 'runtime' in self and self['runtime']:
if 'min' in self['runtime']: base=60 if 'min' in self['runtime']: base=60
@ -299,6 +299,13 @@ class Imdb(SiteParser):
if 'runtime' in self and not self['runtime']: if 'runtime' in self and not self['runtime']:
del self['runtime'] del self['runtime']
if 'votes' in self: self['votes'] = self['votes'].replace(',', '') if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
if 'cast' in self:
if isinstance(self['cast'][0], basestring):
self['cast'] = [self['cast']]
self['actor'] = [c[0] for c in self['cast']]
self['cast'] = map(lambda x: {'actor': x[0], 'character': x[1]}, self['cast'])
if 'connections' in self: if 'connections' in self:
cc={} cc={}
if len(self['connections']) == 2 and isinstance(self['connections'][0], basestring): if len(self['connections']) == 2 and isinstance(self['connections'][0], basestring):
@ -318,37 +325,41 @@ class Imdb(SiteParser):
self['connections'] = cc self['connections'] = cc
for key in ('countries', 'genres'): for key in ('country', 'genre'):
if key in self: if key in self:
self[key] = filter(lambda x: x.lower() != 'home', self[key]) self[key] = filter(lambda x: x.lower() != 'home', self[key])
if 'creators' in self: if 'creator' in self:
self['directors'] = self['creators'] self['episodeDirector'] = self['director']
del self['creators'] self['director'] = self['creator']
if 'series' in self: if 'series' in self:
if 'episode_title' in self: if 'episodeTitle' in self:
self['series_title'] = self['title'] self['seriesTitle'] = self['title']
self['title'] = "%s (S01) %s" % (self['series_title'], self['episode_title']) self['title'] = "%s (S01) %s" % (self['seriesTitle'], self['episodeTitle'])
if 'episode_title' in self and 'season' in self and 'episode' in self: if 'episodeTitle' in self and 'season' in self and 'episode' in self:
self['title'] = "%s (S%02dE%02d) %s" % ( self['title'] = "%s (S%02dE%02d) %s" % (
self['series_title'], self['season'], self['episode'], self['episode_title']) self['seriesTitle'], self['season'], self['episode'], self['episodeTitle'])
for key in ('directors', 'year'): for key in ('Director', 'Year'):
if key in self: if key in self:
self['episode_%s'%key] = self[key] self['episode%s'%key] = self[key.lowe()]
series = Imdb(self['series']) series = Imdb(self['series'])
for key in ['directors', 'year']: for key in ['director', 'year']:
if key in series: if key in series:
self[key] =series[key] self[key] = series[key]
if 'original_title' in self: if 'originalTitle' in self:
del self['original_title'] del self['originalTitle']
else: else:
for key in ('series_title', 'episode_title', 'season', 'episode'): for key in ('seriesTitle', 'episodeTitle', 'season', 'episode'):
if key in self: if key in self:
del self[key] del self[key]
if 'budget' in self and 'gross' in self: if 'budget' in self and 'gross' in self:
self['profit'] = self['gross'] - self['budget'] self['profit'] = self['gross'] - self['budget']
if 'releaseDate' in self:
if isinstance(self['releaseDate'], list):
self['releaseDate'] = min(self['releaseDate'])
class ImdbCombined(Imdb): class ImdbCombined(Imdb):
def __init__(self, id, timeout=-1): def __init__(self, id, timeout=-1):
_regex = {} _regex = {}