update imdb parser, ticket #3068

2018-01-14 18:24:29 +01:00 · 2018-01-14 18:24:29 +01:00 · e480b8dcbf
commit e480b8dcbf
parent 2d4bf9212a
2 changed files with 119 additions and 184 deletions
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -27,6 +27,52 @@ def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_
 def get_url(id):
    return "http://www.imdb.com/title/tt%s/" % id

+
+def reference_section(id):  # build the rule dict for one section of the 'reference' page; NOTE: the parameter shadows the builtin id()
+    return {
+        'page': 'reference',
+        're': [  # presumably applied in sequence by SiteParser (each step on the previous result) - confirm
+            '<h4 name="{id}" id="{id}".*?<table(.*?)</table>'.format(id=id),  # id is substituted into both the name= and id= attributes; captures the <table> that follows that <h4>
+            '<a href="/name/.*?>(.*?)</a>'  # captures the link text of each /name/ anchor
+        ],
+        'type': 'list'
+    }
+
+
+def zebra_list(label, more=None):  # build a 'list' rule for the 'reference' page keyed on a row label; more: optional list of extra 're' steps
+    conditions = {
+        'page': 'reference',
+        're': [
+            label + '</td>.*?<ul(.*?)</ul>',  # label is concatenated unescaped, so regex metacharacters in it stay active; captures the <ul> after the label cell
+            '<li.*?>(.*?)</li>'  # captures the contents of each <li>
+        ],
+        'type': 'list',
+    }
+    if more:  # None or an empty list leaves the two default patterns unchanged
+        conditions['re'] += more  # extends the list built above in place; the caller's `more` list itself is not modified
+    return conditions
+
+def zebra_table(label, more=None, type='string'):  # build a rule for the 'reference' page keyed on a row label; NOTE: `type` shadows the builtin type()
+    conditions = {
+        'page': 'reference',
+        're': [
+            '_label">' + label + '</td>.*?<td>(.*?)</td>',  # label is concatenated unescaped; captures the contents of the <td> after the label cell
+        ],
+        'type': type,  # stored as given by the caller; defaults to 'string'
+    }
+    if more:  # None or an empty list leaves the single default pattern unchanged
+        conditions['re'] += more  # extends the list built above in place; the caller's `more` list itself is not modified
+    return conditions
+
+
+'''
+'posterIds': {
+    'page': 'posters',
+    're': '/unknown-thumbnail/media/rm(.*?)/tt',
+    'type': 'list'
+},
+'''
+
 class Imdb(SiteParser):
    '''
    >>> Imdb('0068646')['title'] == text_type(u'The Godfather')
@@ -45,49 +91,29 @@ class Imdb(SiteParser):
            'type': 'list'
        },
        'aspectratio': {
-            'page': 'combined',
-            're': 'Aspect Ratio:</h5><div class="info-content">([\d\.]+)',
+            'page': 'reference',
+            're': 'Aspect Ratio</td>.*?ipl-inline-list__item">\s+([\d\.]+)',
            'type': 'float',
        },
-        'budget': {
-            'page': 'business',
-            're': [
-                '<h5>Budget</h5>\s*?\$(.*?)<br',
+        'budget': zebra_table('Budget', more=[
            lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
-            ],
-            'type': 'int'
-        },
+        ], type='int'),
        'cast': {
-            'page': 'combined',
+            'page': 'reference',
            're': [
-                '<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
+                ' <table class="cast_list">(.*?)</table>',
+                '<td.*?itemprop="actor".*?>.*?>(.*?)</a>.*?<td class="character">(.*?)</td>',
                lambda ll: [strip_tags(l) for l in ll]
            ],
            'type': 'list'
        },
-        'cinematographer': {
-            'page': 'combined',
-            're': [
-                lambda data: data.split('Series Crew')[0],
-                'Cinematography by</a>(.*?)</table>',
-                '<a href="/name/.*?/">(.*?)</a>'
-            ],
-            'type': 'list'
-        },
+        'cinematographer': reference_section('cinematographers'),
        'connections': {
            'page': 'movieconnections',
            're': '<h4 class="li_group">(.*?)</h4>(.*?)(<\/div>\n  <a|<script)',
            'type': 'list'
        },
-        'country': {
-            'page': 'combined',
-            're': [
-                '<div class="info"><h5>Country:</h5>.*?<div class="info">',
-                #'<a href="/country/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
-                '<a.*?>(.*?)</a>',
-            ],
-            'type': 'list'
-        },
+        'country': zebra_list('Country', more=['<a.*?>(.*?)</a>']),
        'creator': {
            'page': '',
            're': [
@@ -97,44 +123,12 @@ class Imdb(SiteParser):
            ],
            'type': 'list'
        },
-        'director': {
-            'page': 'combined',
-            're': [
-                lambda data: data.split('<b>Series Crew</b>')[0],
-                'Directed by</a>(.*?)</table>',
-                '<a href="/name/.*?>(.*?)</a>'
-            ],
-            'type': 'list'
-        },
-        '_director': {
-            'page': 'combined',
-            're': [
-                '<h5>Director:</h5>.*?<div class="info-content">(.*?)</div>',
-                '<a href="/name/.*?>(.*?)</a>'
-            ],
-            'type': 'list'
-        },
-        'editor': {
-            'page': 'combined',
-            're': [
-                lambda data: data.split('Series Crew')[0],
-                'Film Editing by</a>(.*?)</table>',
-                '<a href="/name/.*?>(.*?)</a>'
-            ],
-            'type': 'list'
-        },
-        'composer': {
-            'page': 'combined',
-            're': [
-                lambda data: data.split('Series Crew')[0],
-                'Original Music by</a>(.*?)</table>',
-                '<a href="/name/.*?>(.*?)</a>'
-            ],
-            'type': 'list'
-        },
+        'director': reference_section('directors'),
+        'editor': reference_section('editors'),
+        'composer': reference_section('composers'),
        'episodeTitle': {
-            'page': 'combined',
-            're': '<div id="tn15title">.*?<em>(.*?)</em>',
+            'page': 'reference',
+            're': '<h3 itemprop="name">(.*?)<',
            'type': 'string'
        },
        'filmingLocations': {
@@ -145,77 +139,44 @@ class Imdb(SiteParser):
            ],
            'type': 'list'
        },
-        'genre': {
-            'page': 'combined',
-            're': [
-                '<h5>Genre:</h5>(.*?)<hr',
-                '<a href="/Sections/Genres/.*?/">(.*?)</a>'
-            ],
-            'type': 'list'
-        },
-        'gross': {
-            'page': 'business',
-            're': [
-                '<h5>Gross</h5>\s*?\$(.*?)<br',
-                lambda data: find_re(data.replace(',', ''), '\d+')
-            ],
-            'type': 'int'
-        },
+        'genre': zebra_list('Genres', more=['<a.*?>(.*?)</a>', lambda x: x[0]]),
+        'gross': zebra_table('Cumulative Worldwide Gross', more=[
+            lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
+        ], type='int'),
        'keyword': {
            'page': 'keywords',
            're': '<a href="/keyword/.*?>(.*?)</a>',
            'type': 'list'
        },
-        'language': {
-            'page': 'combined',
-            're': [
-                #'<h5>Language:</h5>.*?<div class="info">',
-                '<h5>Language:</h5>.*?</div>',
-                #'<a href="/language/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
-                '<a.*?>(.*?)</a>',
-            ],
-            'type': 'list'
-        },
+        'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
        'originalTitle': {
            'page': 'releaseinfo',
            're': '<td>\(original title\)</td>\s*<td>(.*?)</td>',
            'type': 'string'
        },
-        'summary': {
-            'page': 'plotsummary',
-            're': '<p class="plotSummary">(.*?)<\/p>',
-            'type': 'string'
-        },
+        'summary': zebra_table('Plot Summary', more=[
+            '<p>(.*?)<em'
+        ]),
        'posterId': {
-            'page': 'combined',
-            're': '<img.*?id="primary-poster".*?src="(.*?)".*?>',
+            'page': 'reference',
+            're': '<img.*?class="titlereference-primary-image".*?src="(.*?)".*?>',
            'type': 'string'
        },
-        'posterIds': {
-            'page': 'posters',
-            're': '/unknown-thumbnail/media/rm(.*?)/tt',
-            'type': 'list'
-        },
-        'producer': {
-            'page': 'combined',
-            're': [
-                lambda data: data.split('Series Crew')[0],
-                'Produced by</a>(.*?)</table>',
-                '<a href="/name/.*?/">(.*?)</a>'
-            ],
-            'type': 'list'
-        },
+        'producer': reference_section('producers'),
        'productionCompany': {
-            'page': 'combined',
+            'page': 'reference',
            're': [
-                'Production Companies</b><ul>(.*?)</ul>',
+                'Production Companies.*?<ul(.*?)</ul>',
                '<a href="/company/.*?/">(.*?)</a>'
            ],
            'type': 'list'
        },
        'rating': {
-            'page': 'combined',
-            're': '<div class="starbar-meta">.*?<b>([\d,.]+?)/10</b>',
+            'page': 'reference',
+            're': [
+                '<div class="ipl-rating-star ">(.*?)</div>',
+                'ipl-rating-star__rating">([\d,.]+?)</span>',
+            ],
            'type': 'float'
        },
        'releasedate': {
@@ -226,59 +187,43 @@ class Imdb(SiteParser):
            ],
            'type': 'list'
        },
-        'reviews': {
-            'page': 'externalreviews',
-            're': [
-                '<ol>(.*?)</ol>',
-                '<li><a href="(http.*?)".*?>(.*?)</a></li>'
-            ],
-            'type': 'list'
-        },
-        'runtime': {
-            'page': 'combined',
-            're': '<h5>Runtime:</h5><div class="info-content">.*?([0-9]+ sec|[0-9]+ min).*?</div>',
-            'type': 'string'
-        },
-        'color': {
-            'page': 'combined',
-            're': [
-                '<h5>Color:</h5><div class="info-content">(.*?)</div>',
-                '<a.*?>(.*?)</a>'
-            ],
-            'type': 'list'
-        },
-        'sound': {
-            'page': 'combined',
-            're': [
-                '<h5>Sound Mix:</h5><div class="info-content">(.*?)</div>',
-                '<a.*?>(.*?)</a>'
-            ],
-            'type': 'list'
-        },
+        #FIXME using some /offsite/ redirect now
+        #'reviews': {
+        #    'page': 'externalreviews',
+        #    're': [
+        #        '<ul class="simpleList">(.*?)</ul>',
+        #        '<li>.*?<a href="(http.*?)".*?>(.*?)</a>.*?</li>'
+        #    ],
+        #    'type': 'list'
+        #},
+        'runtime': zebra_list('Runtime'),
+        'color': zebra_list('Color', more=['<a.*?>(.*?)</a>']),
+        'sound': zebra_list('Sound Mix', more=['<a.*?>(.*?)</a>', lambda x: x[0]]),
+
        'season': {
-            'page': 'combined',
+            'page': 'reference',
            're': [
-                '<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
-                '\(Season (\d+), Episode \d+\)',
+                '<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
+                'Season (\d+)',
             ],
            'type': 'int'
        },
        'episode': {
-            'page': 'combined',
+            'page': 'reference',
            're': [
-                '<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
-                '\(Season \d+, Episode (\d+)\)',
+                '<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
+                'Episode (\d+)',
             ],
            'type': 'int'
        },
        'series': {
-            'page': 'combined',
-            're': '<h5>TV Series:</h5>.*?<a href="/title/tt(\d{7})',
+            'page': 'reference',
+            're': '<h4 itemprop="name">.*?<a href="/title/tt(\d{7})',
            'type': 'string'
        },
        'isSeries': {
-            'page': 'combined',
-            're': '<span class="tv-extra">(TV series|TV mini-series) ',
+            'page': 'reference',
+            're': 'property=\'og:title\'.*?content=".*?(TV series|TV mini-series).*?"',
            'type': 'string'
        },
        'title': {
@@ -295,22 +240,17 @@ class Imdb(SiteParser):
            'type': 'list',
        },
        'votes': {
-            'page': 'combined',
-            're': '<a href="ratings" class="tn15more">([\d,]*?) votes</a>',
+            'page': 'reference',
+            're': [
+                'class="ipl-rating-star__total-votes">\((.*?)\)',
+                lambda r: r.replace(',', '')
+            ],
            'type': 'string'
        },
-        'writer': {
-            'page': 'combined',
-            're': [
-                lambda data: data.split('Series Crew')[0],
-                'Writing credits</a>(.*?)</table>',
-                '<a href="/name/.*?/">(.*?)</a>'
-            ],
-            'type': 'list'
-        },
+        'writer': reference_section('writers'),
        'year': {
-            'page': 'combined',
-            're': '="og:title" content="[^"]*?\((\d{4}).*?"',
+            'page': 'reference',
+            're': '=["\']og:title["\'] content="[^"]*?\((\d{4}).*?"',
            'type': 'int'
        },
        'credits': {
@@ -335,7 +275,7 @@ class Imdb(SiteParser):
        self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
        super(Imdb, self).__init__(timeout)

-        url = self.baseUrl + 'combined' 
+        url = self.baseUrl + 'reference'
        page = self.read_url(url, timeout=-1)
        if '<title>IMDb: Page not found</title>' in page \
            or 'The requested URL was not found on our server.' in page:
@@ -353,8 +293,6 @@ class Imdb(SiteParser):
        if 'country' in self:
            self['country'] = [normalize_country_name(c) or c for c in self['country']]

-        if 'sound' in self:
-            self['sound'] = list(set(self['sound']))

        def cleanup_title(title):
            if title.startswith('"') and title.endswith('"'):
@@ -389,6 +327,8 @@ class Imdb(SiteParser):
                del self['alternativeTitles']

        if 'runtime' in self and self['runtime']:
+            if isinstance(self['runtime'], list):
+                self['runtime'] = self['runtime'][0]
            if 'min' in self['runtime']:
                base = 60
            else:
@@ -396,8 +336,9 @@ class Imdb(SiteParser):
            self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
        if 'runtime' in self and not self['runtime']:
            del self['runtime']
-        if 'votes' in self:
-            self['votes'] = self['votes'].replace(',', '')
+
+        if 'sound' in self:
+            self['sound'] = list(sorted(set(self['sound'])))

        if 'cast' in self:
            if isinstance(self['cast'][0], string_types):
@@ -405,6 +346,7 @@ class Imdb(SiteParser):
            self['actor'] = [c[0] for c in self['cast']]
            def cleanup_character(c):
                c = c.replace('(uncredited)', '').strip()
+                c = re.sub('\s+', ' ', c)
                return c
            self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
                            for x in self['cast']]
@@ -428,18 +370,11 @@ class Imdb(SiteParser):
                    return r
                cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d{7})/?">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))

-
            self['connections'] = cc

        for key in ('country', 'genre'):
            if key in self:
                self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
-        #0092999
-        if '_director' in self:
-            if 'series' in self or 'isSeries' in self:
-                self['creator'] = self.pop('_director')
-            else:
-                del self['_director']
        if 'isSeries' in self:
            del self['isSeries']
            self['isSeries'] = True
@@ -558,7 +493,7 @@ class ImdbCombined(Imdb):
    def __init__(self, id, timeout=-1):
        _regex = {}
        for key in self.regex:
-            if self.regex[key]['page'] in ('combined', 'releaseinfo'):
+            if self.regex[key]['page'] in ('releaseinfo', 'reference'):
                _regex[key] = self.regex[key]
        self.regex = _regex
        super(ImdbCombined, self).__init__(id, timeout)
--- a/ox/web/siteparser.py
+++ b/ox/web/siteparser.py
@@ -33,7 +33,7 @@ class SiteParser(dict):
        return "%s%s" % (self.baseUrl, page)

    def read_url(self, url, timeout):
-        if not url in self._cache:
+        if url not in self._cache:
            self._cache[url] = read_url(url, timeout=timeout, unicode=True)
        return self._cache[url]