From e480b8dcbf23aba730717e30dff5c458d5007967 Mon Sep 17 00:00:00 2001
From: j <j@mailb.org>
Date: Sun, 14 Jan 2018 18:24:29 +0100
Subject: [PATCH] update imdb parser, ticket #3068

---
 ox/web/imdb.py       | 301 +++++++++++++++++--------------------------
 ox/web/siteparser.py |   2 +-
 2 files changed, 119 insertions(+), 184 deletions(-)
diff --git a/ox/web/imdb.py b/ox/web/imdb.py
index c766ecf..b63dc69 100644
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -27,6 +27,52 @@ def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_
 def get_url(id):
     return "http://www.imdb.com/title/tt%s/" % id
 
+
+def reference_section(id):
+    """Return the parser rule for the credits table headed by <h4 id="{id}">.
+
+    're' holds two patterns: one capturing the <table> that follows the
+    heading, one capturing the text of each /name/ link.
+    """
+    table = '<h4 name="{id}" id="{id}".*?<table(.*?)</table>'.format(id=id)
+    person = '<a href="/name/.*?>(.*?)</a>'
+    return {'page': 'reference', 're': [table, person], 'type': 'list'}
+
+
+def zebra_list(label, more=None):
+    """Return the 'reference'-page list rule for the row labelled `label`.
+
+    `more`, when given, supplies extra steps appended to the 're' chain.
+    """
+    steps = [
+        label + '</td>.*?<ul(.*?)</ul>',
+        '<li.*?>(.*?)</li>'
+    ]
+    if more:
+        steps += more
+    return {'page': 'reference', 're': steps, 'type': 'list'}
+
+def zebra_table(label, more=None, type='string'):
+    """Return the 'reference'-page rule for the <td> that follows `label`.
+
+    `more` optionally extends the 're' chain; `type` is stored under 'type'.
+    """
+    chain = ['_label">' + label + '</td>.*?<td>(.*?)</td>']
+    if more:
+        chain += more
+    return {
+        'page': 'reference', 're': chain, 'type': type,
+    }
+
+
+'''
+'posterIds': {
+    'page': 'posters',
+    're': '/unknown-thumbnail/media/rm(.*?)/tt',
+    'type': 'list'
+},
+'''
+
 class Imdb(SiteParser):
     '''
     >>> Imdb('0068646')['title'] == text_type(u'The Godfather')
@@ -45,49 +91,29 @@ class Imdb(SiteParser):
             'type': 'list'
         },
         'aspectratio': {
-            'page': 'combined',
-            're': 'Aspect Ratio:</h5><div class="info-content">([\d\.]+)',
+            'page': 'reference',
+            're': r'Aspect Ratio</td>.*?ipl-inline-list__item">\s+([\d\.]+)',  # raw: \s, \d, \. are regex escapes
             'type': 'float',
         },
-        'budget': {
-            'page': 'business',
-            're': [
-                '<h5>Budget</h5>\s*?\$(.*?)<br',
-                lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
-            ],
-            'type': 'int'
-        },
+        'budget': zebra_table('Budget', more=[
+            lambda data: find_re(decode_html(data).replace(',', ''), r'\d+')  # drop commas, then match digits
+        ], type='int'),
         'cast': {
-            'page': 'combined',
+            'page': 'reference',
             're': [
-                '<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
+                ' <table class="cast_list">(.*?)</table>',
+                '<td.*?itemprop="actor".*?>.*?>(.*?)</a>.*?<td class="character">(.*?)</td>',
                 lambda ll: [strip_tags(l) for l in ll]
-             ],
-            'type': 'list'
-        },
-        'cinematographer': {
-            'page': 'combined',
-            're': [
-                lambda data: data.split('Series Crew')[0],
-                'Cinematography by</a>(.*?)</table>',
-                '<a href="/name/.*?/">(.*?)</a>'
             ],
             'type': 'list'
         },
+        'cinematographer': reference_section('cinematographers'),
         'connections': {
             'page': 'movieconnections',
             're': '<h4 class="li_group">(.*?)</h4>(.*?)(<\/div>\n  <a|<script)',
             'type': 'list'
         },
-        'country': {
-            'page': 'combined',
-            're': [
-                '<div class="info"><h5>Country:</h5>.*?<div class="info">',
-                #'<a href="/country/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
-                '<a.*?>(.*?)</a>',
-            ],
-            'type': 'list'
-        },
+        'country': zebra_list('Country', more=['<a.*?>(.*?)</a>']),
         'creator': {
             'page': '',
             're': [
@@ -97,44 +123,12 @@ class Imdb(SiteParser):
             ],
             'type': 'list'
         },
-        'director': {
-            'page': 'combined',
-            're': [
-                lambda data: data.split('<b>Series Crew</b>')[0],
-                'Directed by</a>(.*?)</table>',
-                '<a href="/name/.*?>(.*?)</a>'
-            ],
-            'type': 'list'
-        },
-        '_director': {
-            'page': 'combined',
-            're': [
-                '<h5>Director:</h5>.*?<div class="info-content">(.*?)</div>',
-                '<a href="/name/.*?>(.*?)</a>'
-            ],
-            'type': 'list'
-        },
-        'editor': {
-            'page': 'combined',
-            're': [
-                lambda data: data.split('Series Crew')[0],
-                'Film Editing by</a>(.*?)</table>',
-                '<a href="/name/.*?>(.*?)</a>'
-            ],
-            'type': 'list'
-        },
-        'composer': {
-            'page': 'combined',
-            're': [
-                lambda data: data.split('Series Crew')[0],
-                'Original Music by</a>(.*?)</table>',
-                '<a href="/name/.*?>(.*?)</a>'
-            ],
-            'type': 'list'
-        },
+        'director': reference_section('directors'),
+        'editor': reference_section('editors'),
+        'composer': reference_section('composers'),
         'episodeTitle': {
-            'page': 'combined',
-            're': '<div id="tn15title">.*?<em>(.*?)</em>',
+            'page': 'reference',
+            're': '<h3 itemprop="name">(.*?)<',
             'type': 'string'
         },
         'filmingLocations': {
@@ -145,77 +139,44 @@ class Imdb(SiteParser):
             ],
             'type': 'list'
         },
-        'genre': {
-            'page': 'combined',
-            're': [
-                '<h5>Genre:</h5>(.*?)<hr',
-                '<a href="/Sections/Genres/.*?/">(.*?)</a>'
-            ],
-            'type': 'list'
-        },
-        'gross': {
-            'page': 'business',
-            're': [
-                '<h5>Gross</h5>\s*?\$(.*?)<br',
-                lambda data: find_re(data.replace(',', ''), '\d+')
-            ],
-            'type': 'int'
-        },
+        'genre': zebra_list('Genres', more=['<a.*?>(.*?)</a>', lambda x: x[0]]),
+        'gross': zebra_table('Cumulative Worldwide Gross', more=[
+            lambda data: find_re(decode_html(data).replace(',', ''), r'\d+')  # drop commas, then match digits
+        ], type='int'),
         'keyword': {
             'page': 'keywords',
             're': '<a href="/keyword/.*?>(.*?)</a>',
             'type': 'list'
         },
-        'language': {
-            'page': 'combined',
-            're': [
-                #'<h5>Language:</h5>.*?<div class="info">',
-                '<h5>Language:</h5>.*?</div>',
-                #'<a href="/language/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
-                '<a.*?>(.*?)</a>',
-            ],
-            'type': 'list'
-        },
+        'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
         'originalTitle': {
             'page': 'releaseinfo',
             're': '<td>\(original title\)</td>\s*<td>(.*?)</td>',
             'type': 'string'
         },
-        'summary': {
-            'page': 'plotsummary',
-            're': '<p class="plotSummary">(.*?)<\/p>',
-            'type': 'string'
-        },
+        'summary': zebra_table('Plot Summary', more=[
+            '<p>(.*?)<em'
+        ]),
         'posterId': {
-            'page': 'combined',
-            're': '<img.*?id="primary-poster".*?src="(.*?)".*?>',
+            'page': 'reference',
+            're': '<img.*?class="titlereference-primary-image".*?src="(.*?)".*?>',
             'type': 'string'
         },
-        'posterIds': {
-            'page': 'posters',
-            're': '/unknown-thumbnail/media/rm(.*?)/tt',
-            'type': 'list'
-        },
-        'producer': {
-            'page': 'combined',
-            're': [
-                lambda data: data.split('Series Crew')[0],
-                'Produced by</a>(.*?)</table>',
-                '<a href="/name/.*?/">(.*?)</a>'
-            ],
-            'type': 'list'
-        },
+        'producer': reference_section('producers'),
         'productionCompany': {
-            'page': 'combined',
+            'page': 'reference',
             're': [
-                'Production Companies</b><ul>(.*?)</ul>',
+                'Production Companies.*?<ul(.*?)</ul>',
                 '<a href="/company/.*?/">(.*?)</a>'
             ],
             'type': 'list'
         },
         'rating': {
-            'page': 'combined',
-            're': '<div class="starbar-meta">.*?<b>([\d,.]+?)/10</b>',
+            'page': 'reference',
+            're': [
+                '<div class="ipl-rating-star ">(.*?)</div>',
+                r'ipl-rating-star__rating">([\d,.]+?)</span>',  # raw: \d is a regex escape
+            ],
             'type': 'float'
         },
         'releasedate': {
@@ -226,59 +187,43 @@ class Imdb(SiteParser):
             ],
             'type': 'list'
         },
-        'reviews': {
-            'page': 'externalreviews',
-            're': [
-                '<ol>(.*?)</ol>',
-                '<li><a href="(http.*?)".*?>(.*?)</a></li>'
-            ],
-            'type': 'list'
-        },
-        'runtime': {
-            'page': 'combined',
-            're': '<h5>Runtime:</h5><div class="info-content">.*?([0-9]+ sec|[0-9]+ min).*?</div>',
-            'type': 'string'
-        },
-        'color': {
-            'page': 'combined',
-            're': [
-                '<h5>Color:</h5><div class="info-content">(.*?)</div>',
-                '<a.*?>(.*?)</a>'
-            ],
-            'type': 'list'
-        },
-        'sound': {
-            'page': 'combined',
-            're': [
-                '<h5>Sound Mix:</h5><div class="info-content">(.*?)</div>',
-                '<a.*?>(.*?)</a>'
-            ],
-            'type': 'list'
-        },
+        #FIXME using some /offsite/ redirect now
+        #'reviews': {
+        #    'page': 'externalreviews',
+        #    're': [
+        #        '<ul class="simpleList">(.*?)</ul>',
+        #        '<li>.*?<a href="(http.*?)".*?>(.*?)</a>.*?</li>'
+        #    ],
+        #    'type': 'list'
+        #},
+        'runtime': zebra_list('Runtime'),
+        'color': zebra_list('Color', more=['<a.*?>(.*?)</a>']),
+        'sound': zebra_list('Sound Mix', more=['<a.*?>(.*?)</a>', lambda x: x[0]]),
+
         'season': {
-            'page': 'combined',
+            'page': 'reference',
             're': [
-                '<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
-                '\(Season (\d+), Episode \d+\)',
+                '<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
+                r'Season (\d+)',  # raw: \d is a regex escape
              ],
             'type': 'int'
         },
         'episode': {
-            'page': 'combined',
+            'page': 'reference',
             're': [
-                '<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
-                '\(Season \d+, Episode (\d+)\)',
+                '<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
+                r'Episode (\d+)',  # raw: \d is a regex escape
              ],
             'type': 'int'
         },
         'series': {
-            'page': 'combined',
-            're': '<h5>TV Series:</h5>.*?<a href="/title/tt(\d{7})',
+            'page': 'reference',
+            're': r'<h4 itemprop="name">.*?<a href="/title/tt(\d{7})',  # raw: \d is a regex escape
             'type': 'string'
         },
         'isSeries': {
-            'page': 'combined',
-            're': '<span class="tv-extra">(TV series|TV mini-series) ',
+            'page': 'reference',
+            're': 'property=\'og:title\'.*?content=".*?(TV series|TV mini-series).*?"',
             'type': 'string'
         },
         'title': {
@@ -295,22 +240,17 @@ class Imdb(SiteParser):
             'type': 'list',
         },
         'votes': {
-            'page': 'combined',
-            're': '<a href="ratings" class="tn15more">([\d,]*?) votes</a>',
+            'page': 'reference',
+            're': [
+                r'class="ipl-rating-star__total-votes">\((.*?)\)',  # raw: \( and \) match literal parentheses
+                lambda r: r.replace(',', '')
+            ],
             'type': 'string'
         },
-        'writer': {
-            'page': 'combined',
-            're': [
-                lambda data: data.split('Series Crew')[0],
-                'Writing credits</a>(.*?)</table>',
-                '<a href="/name/.*?/">(.*?)</a>'
-            ],
-            'type': 'list'
-        },
+        'writer': reference_section('writers'),
         'year': {
-            'page': 'combined',
-            're': '="og:title" content="[^"]*?\((\d{4}).*?"',
+            'page': 'reference',
+            're': r'''=["']og:title["'] content="[^"]*?\((\d{4}).*?"''',  # raw triple-quoted: same pattern, no str escapes
             'type': 'int'
         },
         'credits': {
@@ -335,7 +275,7 @@ class Imdb(SiteParser):
         self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
         super(Imdb, self).__init__(timeout)
 
-        url = self.baseUrl + 'combined' 
+        url = self.baseUrl + 'reference'
         page = self.read_url(url, timeout=-1)
         if '<title>IMDb: Page not found</title>' in page \
             or 'The requested URL was not found on our server.' in page:
@@ -353,8 +293,6 @@ class Imdb(SiteParser):
         if 'country' in self:
             self['country'] = [normalize_country_name(c) or c for c in self['country']]
 
-        if 'sound' in self:
-            self['sound'] = list(set(self['sound']))
 
         def cleanup_title(title):
             if title.startswith('"') and title.endswith('"'):
@@ -389,6 +327,8 @@ class Imdb(SiteParser):
                 del self['alternativeTitles']
 
         if 'runtime' in self and self['runtime']:
+            if isinstance(self['runtime'], list):
+                self['runtime'] = self['runtime'][0]
             if 'min' in self['runtime']:
                 base = 60
             else:
@@ -396,8 +336,9 @@ class Imdb(SiteParser):
             self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
         if 'runtime' in self and not self['runtime']:
             del self['runtime']
-        if 'votes' in self:
-            self['votes'] = self['votes'].replace(',', '')
+
+        if 'sound' in self:
+            self['sound'] = sorted(set(self['sound']))  # dedupe; sorted() already returns a list
 
         if 'cast' in self:
             if isinstance(self['cast'][0], string_types):
@@ -405,6 +346,7 @@ class Imdb(SiteParser):
             self['actor'] = [c[0] for c in self['cast']]
             def cleanup_character(c):
                 c = c.replace('(uncredited)', '').strip()
+                c = re.sub(r'\s+', ' ', c)  # collapse each whitespace run to a single space
                 return c
             self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
                             for x in self['cast']]
@@ -428,18 +370,11 @@ class Imdb(SiteParser):
                     return r
                 cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d{7})/?">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
 
-
             self['connections'] = cc
 
         for key in ('country', 'genre'):
             if key in self:
                 self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
-        #0092999
-        if '_director' in self:
-            if 'series' in self or 'isSeries' in self:
-                self['creator'] = self.pop('_director')
-            else:
-                del self['_director']
         if 'isSeries' in self:
             del self['isSeries']
             self['isSeries'] = True
@@ -558,7 +493,7 @@ class ImdbCombined(Imdb):
     def __init__(self, id, timeout=-1):
         _regex = {}
         for key in self.regex:
-            if self.regex[key]['page'] in ('combined', 'releaseinfo'):
+            if self.regex[key]['page'] in ('releaseinfo', 'reference'):  # keep only rules for these two pages
                 _regex[key] = self.regex[key]
         self.regex = _regex
         super(ImdbCombined, self).__init__(id, timeout)
diff --git a/ox/web/siteparser.py b/ox/web/siteparser.py
index fa21948..61a79bd 100644
--- a/ox/web/siteparser.py
+++ b/ox/web/siteparser.py
@@ -33,7 +33,7 @@ class SiteParser(dict):
         return "%s%s" % (self.baseUrl, page)
 
     def read_url(self, url, timeout):
-        if not url in self._cache:
+        if url not in self._cache:  # fetch only on a miss; the result is stored in self._cache
             self._cache[url] = read_url(url, timeout=timeout, unicode=True)
         return self._cache[url]