From e480b8dcbf23aba730717e30dff5c458d5007967 Mon Sep 17 00:00:00 2001 From: j Date: Sun, 14 Jan 2018 18:24:29 +0100 Subject: [PATCH] update imdb parser, ticket #3068 --- ox/web/imdb.py | 301 +++++++++++++++++-------------------------- ox/web/siteparser.py | 2 +- 2 files changed, 119 insertions(+), 184 deletions(-) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index c766ecf..b63dc69 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -27,6 +27,52 @@ def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_ def get_url(id): return "http://www.imdb.com/title/tt%s/" % id + +def reference_section(id): + return { + 'page': 'reference', + 're': [ + '

'.format(id=id), + '' + label + '.*?(.*?)', + ], + 'type': type, + } + if more: + conditions['re'] += more + return conditions + + +''' +'posterIds': { + 'page': 'posters', + 're': '/unknown-thumbnail/media/rm(.*?)/tt', + 'type': 'list' +}, +''' + class Imdb(SiteParser): ''' >>> Imdb('0068646')['title'] == text_type(u'The Godfather') @@ -45,49 +91,29 @@ class Imdb(SiteParser): 'type': 'list' }, 'aspectratio': { - 'page': 'combined', - 're': 'Aspect Ratio:

([\d\.]+)', + 'page': 'reference', + 're': 'Aspect Ratio.*?ipl-inline-list__item">\s+([\d\.]+)', 'type': 'float', }, - 'budget': { - 'page': 'business', - 're': [ - '
Budget
\s*?\$(.*?).*?>(.*?)
.*?(.*?)', + ' (.*?)
', + '.*?>(.*?).*?(.*?)', lambda ll: [strip_tags(l) for l in ll] - ], - 'type': 'list' - }, - 'cinematographer': { - 'page': 'combined', - 're': [ - lambda data: data.split('Series Crew')[0], - 'Cinematography by(.*?)', - '(.*?)' ], 'type': 'list' }, + 'cinematographer': reference_section('cinematographers'), 'connections': { 'page': 'movieconnections', 're': '

(.*?)

(.*?)(<\/div>\n
Country:
.*?
', - #'(.*?)', #links changed to work with existing caches, just take all links - '(.*?)', - ], - 'type': 'list' - }, + 'country': zebra_list('Country', more=['(.*?)']), 'creator': { 'page': '', 're': [ @@ -97,44 +123,12 @@ class Imdb(SiteParser): ], 'type': 'list' }, - 'director': { - 'page': 'combined', - 're': [ - lambda data: data.split('Series Crew')[0], - 'Directed by(.*?)', - '(.*?)
', - '(.*?)' - ], - 'type': 'list' - }, - 'composer': { - 'page': 'combined', - 're': [ - lambda data: data.split('Series Crew')[0], - 'Original Music by(.*?)', - '.*?(.*?)', + 'page': 'reference', + 're': '

(.*?)<', 'type': 'string' }, 'filmingLocations': { @@ -145,77 +139,44 @@ class Imdb(SiteParser): ], 'type': 'list' }, - 'genre': { - 'page': 'combined', - 're': [ - '

Genre:
(.*?)(.*?)
' - ], - 'type': 'list' - }, - 'gross': { - 'page': 'business', - 're': [ - '
Gross
\s*?\$(.*?)(.*?)', lambda x: x[0]]), + 'gross': zebra_table('Cumulative Worldwide Gross', more=[ + lambda data: find_re(decode_html(data).replace(',', ''), '\d+') + ], type='int'), 'keyword': { 'page': 'keywords', 're': '', - '
Language:
.*?
', - #'(.*?)', #links changed to work with existing caches, just take all links - '(.*?)', - ], - 'type': 'list' - }, + 'language': zebra_list('Language', more=['(.*?)']), 'originalTitle': { 'page': 'releaseinfo', 're': '\(original title\)\s*(.*?)', 'type': 'string' }, - 'summary': { - 'page': 'plotsummary', - 're': '

(.*?)<\/p>', - 'type': 'string' - }, + 'summary': zebra_table('Plot Summary', more=[ + '

(.*?)', + 'page': 'reference', + 're': '', 'type': 'string' }, - 'posterIds': { - 'page': 'posters', - 're': '/unknown-thumbnail/media/rm(.*?)/tt', - 'type': 'list' - }, - 'producer': { - 'page': 'combined', - 're': [ - lambda data: data.split('Series Crew')[0], - 'Produced by(.*?)', - '(.*?)' - ], - 'type': 'list' - }, + 'producer': reference_section('producers'), 'productionCompany': { - 'page': 'combined', + 'page': 'reference', 're': [ - 'Production Companies

    (.*?)
', + 'Production Companies.*?', '(.*?)' ], 'type': 'list' }, 'rating': { - 'page': 'combined', - 're': '
.*?([\d,.]+?)/10', + 'page': 'reference', + 're': [ + '
(.*?)
', + 'ipl-rating-star__rating">([\d,.]+?)', + ], 'type': 'float' }, 'releasedate': { @@ -226,59 +187,43 @@ class Imdb(SiteParser): ], 'type': 'list' }, - 'reviews': { - 'page': 'externalreviews', - 're': [ - '
    (.*?)
', - '
  • (.*?)
  • ' - ], - 'type': 'list' - }, - 'runtime': { - 'page': 'combined', - 're': '
    Runtime:
    .*?([0-9]+ sec|[0-9]+ min).*?
    ', - 'type': 'string' - }, - 'color': { - 'page': 'combined', - 're': [ - '
    Color:
    (.*?)
    ', - '(.*?)' - ], - 'type': 'list' - }, - 'sound': { - 'page': 'combined', - 're': [ - '
    Sound Mix:
    (.*?)
    ', - '(.*?)' - ], - 'type': 'list' - }, + #FIXME using some /offsite/ redirect now + #'reviews': { + # 'page': 'externalreviews', + # 're': [ + # '
      (.*?)
    ', + # '
  • .*?(.*?).*?
  • ' + # ], + # 'type': 'list' + #}, + 'runtime': zebra_list('Runtime'), + 'color': zebra_list('Color', more=['(.*?)']), + 'sound': zebra_list('Sound Mix', more=['(.*?)', lambda x: x[0]]), + 'season': { - 'page': 'combined', + 'page': 'reference', 're': [ - '
    Original Air Date:
    .*?
    (.*?)
    ', - '\(Season (\d+), Episode \d+\)', + '
      (.*?)
    ', + 'Season (\d+)', ], 'type': 'int' }, 'episode': { - 'page': 'combined', + 'page': 'reference', 're': [ - '
    Original Air Date:
    .*?
    (.*?)
    ', - '\(Season \d+, Episode (\d+)\)', + '
      (.*?)
    ', + 'Episode (\d+)', ], 'type': 'int' }, 'series': { - 'page': 'combined', - 're': '
    TV Series:
    .*?.*?(TV series|TV mini-series) ', + 'page': 'reference', + 're': 'property=\'og:title\'.*?content=".*?(TV series|TV mini-series).*?"', 'type': 'string' }, 'title': { @@ -295,22 +240,17 @@ class Imdb(SiteParser): 'type': 'list', }, 'votes': { - 'page': 'combined', - 're': '([\d,]*?) votes', + 'page': 'reference', + 're': [ + 'class="ipl-rating-star__total-votes">\((.*?)\)', + lambda r: r.replace(',', '') + ], 'type': 'string' }, - 'writer': { - 'page': 'combined', - 're': [ - lambda data: data.split('Series Crew')[0], - 'Writing credits(.*?)', - '(.*?)' - ], - 'type': 'list' - }, + 'writer': reference_section('writers'), 'year': { - 'page': 'combined', - 're': '="og:title" content="[^"]*?\((\d{4}).*?"', + 'page': 'reference', + 're': '=["\']og:title["\'] content="[^"]*?\((\d{4}).*?"', 'type': 'int' }, 'credits': { @@ -335,7 +275,7 @@ class Imdb(SiteParser): self.baseUrl = "http://www.imdb.com/title/tt%s/" % id super(Imdb, self).__init__(timeout) - url = self.baseUrl + 'combined' + url = self.baseUrl + 'reference' page = self.read_url(url, timeout=-1) if 'IMDb: Page not found' in page \ or 'The requested URL was not found on our server.' in page: @@ -353,8 +293,6 @@ class Imdb(SiteParser): if 'country' in self: self['country'] = [normalize_country_name(c) or c for c in self['country']] - if 'sound' in self: - self['sound'] = list(set(self['sound'])) def cleanup_title(title): if title.startswith('"') and title.endswith('"'): @@ -389,6 +327,8 @@ class Imdb(SiteParser): del self['alternativeTitles'] if 'runtime' in self and self['runtime']: + if isinstance(self['runtime'], list): + self['runtime'] = self['runtime'][0] if 'min' in self['runtime']: base = 60 else: @@ -396,8 +336,9 @@ class Imdb(SiteParser): self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base if 'runtime' in self and not self['runtime']: del self['runtime'] - if 'votes' in self: - self['votes'] = self['votes'].replace(',', '') + + if 'sound' in self: + self['sound'] = list(sorted(set(self['sound']))) if 'cast' in self: if isinstance(self['cast'][0], string_types): @@ -405,6 +346,7 @@ class Imdb(SiteParser): self['actor'] = [c[0] for c in self['cast']] def cleanup_character(c): c = c.replace('(uncredited)', '').strip() + c = re.sub('\s+', ' ', c) return c self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])} for x in self['cast']] @@ -428,18 +370,11 @@ class Imdb(SiteParser): return r cc[rel] = list(map(get_conn, re.compile('(.*?)(.*?)<\/div', re.DOTALL).findall(data))) - self['connections'] = cc for key in ('country', 'genre'): if key in self: self[key] = list(filter(lambda x: x.lower() != 'home', self[key])) - #0092999 - if '_director' in self: - if 'series' in self or 'isSeries' in self: - self['creator'] = self.pop('_director') - else: - del self['_director'] if 'isSeries' in self: del self['isSeries'] self['isSeries'] = True @@ -558,7 +493,7 @@ class ImdbCombined(Imdb): def __init__(self, id, timeout=-1): _regex = {} for key in self.regex: - if self.regex[key]['page'] in ('combined', 'releaseinfo'): + if self.regex[key]['page'] in ('releaseinfo', 'reference'): _regex[key] = self.regex[key] self.regex = _regex super(ImdbCombined, self).__init__(id, timeout) diff --git a/ox/web/siteparser.py b/ox/web/siteparser.py index fa21948..61a79bd 100644 --- a/ox/web/siteparser.py +++ b/ox/web/siteparser.py @@ -33,7 +33,7 @@ class SiteParser(dict): return "%s%s" % (self.baseUrl, page) def read_url(self, url, timeout): - if not url in self._cache: + if url not in self._cache: self._cache[url] = read_url(url, timeout=timeout, unicode=True) return self._cache[url]