update imdb parser, ticket #3068

This commit is contained in:
j 2018-01-14 18:24:29 +01:00
parent 2d4bf9212a
commit e480b8dcbf
2 changed files with 119 additions and 184 deletions

View file

@ -27,6 +27,52 @@ def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_
def get_url(id): def get_url(id):
return "http://www.imdb.com/title/tt%s/" % id return "http://www.imdb.com/title/tt%s/" % id
def reference_section(id):
    """Build the parser rule for one named section of the 'reference' page.

    ``id`` is substituted into both the ``name`` and ``id`` attributes of
    the ``<h4>`` heading pattern. The first pattern captures the contents
    of the ``<table>`` that follows that heading; the second captures the
    link text of every ``/name/`` anchor.

    Returns a dict with the keys ``'page'``, ``'re'`` and ``'type'``.
    """
    # Step 1: isolate the table belonging to the requested heading.
    section_table = '<h4 name="{id}" id="{id}".*?<table(.*?)</table>'.format(id=id)
    # Step 2: pull the visible text out of each name link.
    name_link = '<a href="/name/.*?>(.*?)</a>'
    return dict(page='reference', re=[section_table, name_link], type='list')
def zebra_list(label, more=None):
    """Build a list-typed parser rule for a labelled ``<ul>`` on the 'reference' page.

    The first pattern captures the ``<ul>`` that follows ``label`` and a
    closing ``</td>``; the second captures the contents of each ``<li>``.
    Any steps given in ``more`` are appended after these two, in order.
    ``more`` itself is not modified.

    Returns a dict with the keys ``'page'``, ``'re'`` and ``'type'``.
    """
    steps = [
        label + '</td>.*?<ul(.*?)</ul>',
        '<li.*?>(.*?)</li>',
    ]
    if more:
        # Extends the list created above, never the caller's sequence.
        steps.extend(more)
    return {'page': 'reference', 're': steps, 'type': 'list'}
def zebra_table(label, more=None, type='string'):
    """Build a parser rule for a labelled table cell on the 'reference' page.

    The single base pattern looks for ``_label">`` + ``label`` + ``</td>``
    and captures the contents of the next ``<td>``. Any steps given in
    ``more`` are appended after it, in order, without modifying ``more``.
    ``type`` is stored unchanged under the ``'type'`` key.

    Returns a dict with the keys ``'page'``, ``'re'`` and ``'type'``.
    """
    steps = ['_label">' + label + '</td>.*?<td>(.*?)</td>']
    if more:
        # Extends the list created above, never the caller's sequence.
        steps.extend(more)
    return {'page': 'reference', 're': steps, 'type': type}
'''
'posterIds': {
'page': 'posters',
're': '/unknown-thumbnail/media/rm(.*?)/tt',
'type': 'list'
},
'''
class Imdb(SiteParser): class Imdb(SiteParser):
''' '''
>>> Imdb('0068646')['title'] == text_type(u'The Godfather') >>> Imdb('0068646')['title'] == text_type(u'The Godfather')
@ -45,49 +91,29 @@ class Imdb(SiteParser):
'type': 'list' 'type': 'list'
}, },
'aspectratio': { 'aspectratio': {
'page': 'combined', 'page': 'reference',
're': 'Aspect Ratio:</h5><div class="info-content">([\d\.]+)', 're': 'Aspect Ratio</td>.*?ipl-inline-list__item">\s+([\d\.]+)',
'type': 'float', 'type': 'float',
}, },
'budget': { 'budget': zebra_table('Budget', more=[
'page': 'business',
're': [
'<h5>Budget</h5>\s*?\$(.*?)<br',
lambda data: find_re(decode_html(data).replace(',', ''), '\d+') lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
], ], type='int'),
'type': 'int'
},
'cast': { 'cast': {
'page': 'combined', 'page': 'reference',
're': [ 're': [
'<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>', ' <table class="cast_list">(.*?)</table>',
'<td.*?itemprop="actor".*?>.*?>(.*?)</a>.*?<td class="character">(.*?)</td>',
lambda ll: [strip_tags(l) for l in ll] lambda ll: [strip_tags(l) for l in ll]
], ],
'type': 'list' 'type': 'list'
}, },
'cinematographer': { 'cinematographer': reference_section('cinematographers'),
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Cinematography by</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>'
],
'type': 'list'
},
'connections': { 'connections': {
'page': 'movieconnections', 'page': 'movieconnections',
're': '<h4 class="li_group">(.*?)</h4>(.*?)(<\/div>\n <a|<script)', 're': '<h4 class="li_group">(.*?)</h4>(.*?)(<\/div>\n <a|<script)',
'type': 'list' 'type': 'list'
}, },
'country': { 'country': zebra_list('Country', more=['<a.*?>(.*?)</a>']),
'page': 'combined',
're': [
'<div class="info"><h5>Country:</h5>.*?<div class="info">',
#'<a href="/country/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
'<a.*?>(.*?)</a>',
],
'type': 'list'
},
'creator': { 'creator': {
'page': '', 'page': '',
're': [ 're': [
@ -97,44 +123,12 @@ class Imdb(SiteParser):
], ],
'type': 'list' 'type': 'list'
}, },
'director': { 'director': reference_section('directors'),
'page': 'combined', 'editor': reference_section('editors'),
're': [ 'composer': reference_section('composers'),
lambda data: data.split('<b>Series Crew</b>')[0],
'Directed by</a>(.*?)</table>',
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
},
'_director': {
'page': 'combined',
're': [
'<h5>Director:</h5>.*?<div class="info-content">(.*?)</div>',
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
},
'editor': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Film Editing by</a>(.*?)</table>',
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
},
'composer': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Original Music by</a>(.*?)</table>',
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
},
'episodeTitle': { 'episodeTitle': {
'page': 'combined', 'page': 'reference',
're': '<div id="tn15title">.*?<em>(.*?)</em>', 're': '<h3 itemprop="name">(.*?)<',
'type': 'string' 'type': 'string'
}, },
'filmingLocations': { 'filmingLocations': {
@ -145,77 +139,44 @@ class Imdb(SiteParser):
], ],
'type': 'list' 'type': 'list'
}, },
'genre': { 'genre': zebra_list('Genres', more=['<a.*?>(.*?)</a>', lambda x: x[0]]),
'page': 'combined', 'gross': zebra_table('Cumulative Worldwide Gross', more=[
're': [ lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
'<h5>Genre:</h5>(.*?)<hr', ], type='int'),
'<a href="/Sections/Genres/.*?/">(.*?)</a>'
],
'type': 'list'
},
'gross': {
'page': 'business',
're': [
'<h5>Gross</h5>\s*?\$(.*?)<br',
lambda data: find_re(data.replace(',', ''), '\d+')
],
'type': 'int'
},
'keyword': { 'keyword': {
'page': 'keywords', 'page': 'keywords',
're': '<a href="/keyword/.*?>(.*?)</a>', 're': '<a href="/keyword/.*?>(.*?)</a>',
'type': 'list' 'type': 'list'
}, },
'language': { 'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
'page': 'combined',
're': [
#'<h5>Language:</h5>.*?<div class="info">',
'<h5>Language:</h5>.*?</div>',
#'<a href="/language/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
'<a.*?>(.*?)</a>',
],
'type': 'list'
},
'originalTitle': { 'originalTitle': {
'page': 'releaseinfo', 'page': 'releaseinfo',
're': '<td>\(original title\)</td>\s*<td>(.*?)</td>', 're': '<td>\(original title\)</td>\s*<td>(.*?)</td>',
'type': 'string' 'type': 'string'
}, },
'summary': { 'summary': zebra_table('Plot Summary', more=[
'page': 'plotsummary', '<p>(.*?)<em'
're': '<p class="plotSummary">(.*?)<\/p>', ]),
'type': 'string'
},
'posterId': { 'posterId': {
'page': 'combined', 'page': 'reference',
're': '<img.*?id="primary-poster".*?src="(.*?)".*?>', 're': '<img.*?class="titlereference-primary-image".*?src="(.*?)".*?>',
'type': 'string' 'type': 'string'
}, },
'posterIds': { 'producer': reference_section('producers'),
'page': 'posters',
're': '/unknown-thumbnail/media/rm(.*?)/tt',
'type': 'list'
},
'producer': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Produced by</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>'
],
'type': 'list'
},
'productionCompany': { 'productionCompany': {
'page': 'combined', 'page': 'reference',
're': [ 're': [
'Production Companies</b><ul>(.*?)</ul>', 'Production Companies.*?<ul(.*?)</ul>',
'<a href="/company/.*?/">(.*?)</a>' '<a href="/company/.*?/">(.*?)</a>'
], ],
'type': 'list' 'type': 'list'
}, },
'rating': { 'rating': {
'page': 'combined', 'page': 'reference',
're': '<div class="starbar-meta">.*?<b>([\d,.]+?)/10</b>', 're': [
'<div class="ipl-rating-star ">(.*?)</div>',
'ipl-rating-star__rating">([\d,.]+?)</span>',
],
'type': 'float' 'type': 'float'
}, },
'releasedate': { 'releasedate': {
@ -226,59 +187,43 @@ class Imdb(SiteParser):
], ],
'type': 'list' 'type': 'list'
}, },
'reviews': { #FIXME using some /offsite/ redirect now
'page': 'externalreviews', #'reviews': {
're': [ # 'page': 'externalreviews',
'<ol>(.*?)</ol>', # 're': [
'<li><a href="(http.*?)".*?>(.*?)</a></li>' # '<ul class="simpleList">(.*?)</ul>',
], # '<li>.*?<a href="(http.*?)".*?>(.*?)</a>.*?</li>'
'type': 'list' # ],
}, # 'type': 'list'
'runtime': { #},
'page': 'combined', 'runtime': zebra_list('Runtime'),
're': '<h5>Runtime:</h5><div class="info-content">.*?([0-9]+ sec|[0-9]+ min).*?</div>', 'color': zebra_list('Color', more=['<a.*?>(.*?)</a>']),
'type': 'string' 'sound': zebra_list('Sound Mix', more=['<a.*?>(.*?)</a>', lambda x: x[0]]),
},
'color': {
'page': 'combined',
're': [
'<h5>Color:</h5><div class="info-content">(.*?)</div>',
'<a.*?>(.*?)</a>'
],
'type': 'list'
},
'sound': {
'page': 'combined',
're': [
'<h5>Sound Mix:</h5><div class="info-content">(.*?)</div>',
'<a.*?>(.*?)</a>'
],
'type': 'list'
},
'season': { 'season': {
'page': 'combined', 'page': 'reference',
're': [ 're': [
'<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>', '<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
'\(Season (\d+), Episode \d+\)', 'Season (\d+)',
], ],
'type': 'int' 'type': 'int'
}, },
'episode': { 'episode': {
'page': 'combined', 'page': 'reference',
're': [ 're': [
'<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>', '<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
'\(Season \d+, Episode (\d+)\)', 'Episode (\d+)',
], ],
'type': 'int' 'type': 'int'
}, },
'series': { 'series': {
'page': 'combined', 'page': 'reference',
're': '<h5>TV Series:</h5>.*?<a href="/title/tt(\d{7})', 're': '<h4 itemprop="name">.*?<a href="/title/tt(\d{7})',
'type': 'string' 'type': 'string'
}, },
'isSeries': { 'isSeries': {
'page': 'combined', 'page': 'reference',
're': '<span class="tv-extra">(TV series|TV mini-series) ', 're': 'property=\'og:title\'.*?content=".*?(TV series|TV mini-series).*?"',
'type': 'string' 'type': 'string'
}, },
'title': { 'title': {
@ -295,22 +240,17 @@ class Imdb(SiteParser):
'type': 'list', 'type': 'list',
}, },
'votes': { 'votes': {
'page': 'combined', 'page': 'reference',
're': '<a href="ratings" class="tn15more">([\d,]*?) votes</a>', 're': [
'class="ipl-rating-star__total-votes">\((.*?)\)',
lambda r: r.replace(',', '')
],
'type': 'string' 'type': 'string'
}, },
'writer': { 'writer': reference_section('writers'),
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Writing credits</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>'
],
'type': 'list'
},
'year': { 'year': {
'page': 'combined', 'page': 'reference',
're': '="og:title" content="[^"]*?\((\d{4}).*?"', 're': '=["\']og:title["\'] content="[^"]*?\((\d{4}).*?"',
'type': 'int' 'type': 'int'
}, },
'credits': { 'credits': {
@ -335,7 +275,7 @@ class Imdb(SiteParser):
self.baseUrl = "http://www.imdb.com/title/tt%s/" % id self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
super(Imdb, self).__init__(timeout) super(Imdb, self).__init__(timeout)
url = self.baseUrl + 'combined' url = self.baseUrl + 'reference'
page = self.read_url(url, timeout=-1) page = self.read_url(url, timeout=-1)
if '<title>IMDb: Page not found</title>' in page \ if '<title>IMDb: Page not found</title>' in page \
or 'The requested URL was not found on our server.' in page: or 'The requested URL was not found on our server.' in page:
@ -353,8 +293,6 @@ class Imdb(SiteParser):
if 'country' in self: if 'country' in self:
self['country'] = [normalize_country_name(c) or c for c in self['country']] self['country'] = [normalize_country_name(c) or c for c in self['country']]
if 'sound' in self:
self['sound'] = list(set(self['sound']))
def cleanup_title(title): def cleanup_title(title):
if title.startswith('"') and title.endswith('"'): if title.startswith('"') and title.endswith('"'):
@ -389,6 +327,8 @@ class Imdb(SiteParser):
del self['alternativeTitles'] del self['alternativeTitles']
if 'runtime' in self and self['runtime']: if 'runtime' in self and self['runtime']:
if isinstance(self['runtime'], list):
self['runtime'] = self['runtime'][0]
if 'min' in self['runtime']: if 'min' in self['runtime']:
base = 60 base = 60
else: else:
@ -396,8 +336,9 @@ class Imdb(SiteParser):
self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
if 'runtime' in self and not self['runtime']: if 'runtime' in self and not self['runtime']:
del self['runtime'] del self['runtime']
if 'votes' in self:
self['votes'] = self['votes'].replace(',', '') if 'sound' in self:
self['sound'] = list(sorted(set(self['sound'])))
if 'cast' in self: if 'cast' in self:
if isinstance(self['cast'][0], string_types): if isinstance(self['cast'][0], string_types):
@ -405,6 +346,7 @@ class Imdb(SiteParser):
self['actor'] = [c[0] for c in self['cast']] self['actor'] = [c[0] for c in self['cast']]
def cleanup_character(c): def cleanup_character(c):
c = c.replace('(uncredited)', '').strip() c = c.replace('(uncredited)', '').strip()
c = re.sub('\s+', ' ', c)
return c return c
self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])} self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
for x in self['cast']] for x in self['cast']]
@ -428,18 +370,11 @@ class Imdb(SiteParser):
return r return r
cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d{7})/?">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data))) cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d{7})/?">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
self['connections'] = cc self['connections'] = cc
for key in ('country', 'genre'): for key in ('country', 'genre'):
if key in self: if key in self:
self[key] = list(filter(lambda x: x.lower() != 'home', self[key])) self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
#0092999
if '_director' in self:
if 'series' in self or 'isSeries' in self:
self['creator'] = self.pop('_director')
else:
del self['_director']
if 'isSeries' in self: if 'isSeries' in self:
del self['isSeries'] del self['isSeries']
self['isSeries'] = True self['isSeries'] = True
@ -558,7 +493,7 @@ class ImdbCombined(Imdb):
def __init__(self, id, timeout=-1): def __init__(self, id, timeout=-1):
_regex = {} _regex = {}
for key in self.regex: for key in self.regex:
if self.regex[key]['page'] in ('combined', 'releaseinfo'): if self.regex[key]['page'] in ('releaseinfo', 'reference'):
_regex[key] = self.regex[key] _regex[key] = self.regex[key]
self.regex = _regex self.regex = _regex
super(ImdbCombined, self).__init__(id, timeout) super(ImdbCombined, self).__init__(id, timeout)

View file

@ -33,7 +33,7 @@ class SiteParser(dict):
return "%s%s" % (self.baseUrl, page) return "%s%s" % (self.baseUrl, page)
def read_url(self, url, timeout): def read_url(self, url, timeout):
if not url in self._cache: if url not in self._cache:
self._cache[url] = read_url(url, timeout=timeout, unicode=True) self._cache[url] = read_url(url, timeout=timeout, unicode=True)
return self._cache[url] return self._cache[url]