Fix IMDb parsing

This commit is contained in:
j 2023-02-03 18:28:49 +01:00
parent e1657994ca
commit a3cef06ad7

View file

@ -2,12 +2,13 @@
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function
import json
import re
import time
import unicodedata
from six.moves.urllib.parse import urlencode
from six import text_type, string_types
from six import string_types
from .. import find_re, strip_tags, decode_html
from .. import cache
@ -106,6 +107,89 @@ def technical(label):
}
def tech_spec(metadata):
    """Extract technical specifications from IMDb's __NEXT_DATA__ metadata.

    Args:
        metadata: parsed __NEXT_DATA__ JSON for the /technical page.

    Returns:
        dict mapping a lowercased spec title (e.g. 'aspectratio', 'sound',
        'runtime') to the list of text values listed for that row.
    """
    # Normalize a couple of IMDb row titles to the keys this module uses.
    key_map = {
        'aspect ratio': 'aspectratio',
        'sound mix': 'sound',
    }
    tech = {}
    for row in metadata['props']['pageProps']['contentData']['section']['items']:
        title = row['rowTitle'].lower()
        title = key_map.get(title, title)
        # Each row lists one or more values; keep their display text.
        tech[title] = [content['text'] for content in row['listContent']]
    return tech
def movie_connections(metadata):
    """Parse movie connections (follows, references, remakes, ...) from
    IMDb's __NEXT_DATA__ metadata for the /movieconnections page.

    Args:
        metadata: parsed __NEXT_DATA__ JSON.

    Returns:
        dict mapping connection category name to a list of
        {'id': <imdb id without 'tt' prefix>, 'title': ..., 'description'?: ...}.
    """
    # Hoisted out of the loop: the same pattern is used for every item.
    title_re = re.compile('<a.*?>(.*?)</a>')
    connections = {}
    for row in metadata['props']['pageProps']['contentData']['categories']:
        title = row['name']
        connections.setdefault(title, [])
        for item in row['section']['items']:
            connection = {
                # item['id'] is 'tt<digits>'; strip the 'tt' prefix.
                'id': item['id'][2:],
                'title': title_re.findall(item['listContent'][0]['html'])[0],
            }
            # A second list entry, when present, holds a free-text description.
            if len(item['listContent']) >= 2:
                connection['description'] = strip_tags(item['listContent'][1]['html'])
            connections[title].append(connection)
    return connections
def get_category_by_id(metadata, id):
    """Return the contentData category dict whose 'id' matches, or None."""
    categories = metadata['props']['pageProps']['contentData']['categories']
    return next((category for category in categories if category['id'] == id), None)
def get_release_date(metadata):
    """Return the earliest release date found in the 'releases' category,
    formatted 'YYYY-MM-DD', or None if no date could be parsed.

    Args:
        metadata: parsed __NEXT_DATA__ JSON for the /releaseinfo page.

    NOTE(review): relies on `datetime` (i.e. `from datetime import datetime`)
    being imported at module level — not visible in this hunk; confirm.
    """
    releases = get_category_by_id(metadata, 'releases')
    # Guard: some titles have no 'releases' category at all.
    if not releases:
        return None

    def parse_date(d):
        # IMDb renders dates with varying precision; try most specific first.
        for fmt in (
            '%B %d, %Y',
            '%d %B %Y',
            '%B %Y',
        ):
            try:
                parsed = datetime.strptime(d, fmt)
            except ValueError:
                continue
            return '%d-%02d-%02d' % (parsed.year, parsed.month, parsed.day)
        return None

    dates = []
    for item in releases['section']['items']:
        date = parse_date(item['listContent'][0]['text'])
        if date:
            dates.append(date)
    # ISO-formatted dates sort lexicographically, so min() is the earliest.
    if dates:
        return min(dates)
def alternative_titles(metadata):
    """Collect alternative (AKA) titles with their country of use."""
    akas = get_category_by_id(metadata, 'akas')
    titles = []
    for row in akas['section']['items']:
        content = row['listContent'][0]
        entry = {
            'title': content['text'],
            'country': row['rowTitle'],
        }
        # Optional qualifier, e.g. '(working title)'.
        subtext = content.get('subText')
        if subtext:
            entry['subText'] = subtext
        titles.append(entry)
    return titles
'''
'posterIds': {
'page': 'posters',
@ -116,18 +200,17 @@ def technical(label):
class Imdb(SiteParser):
'''
>>> Imdb('0068646')['title'] == text_type(u'The Godfather')
>>> Imdb('0068646')['title'] == 'The Godfather'
True
>>> Imdb('0133093')['title'] == text_type(u'The Matrix')
>>> Imdb('0133093')['title'] == 'The Matrix'
True
'''
regex = {
'alternativeTitles': {
'page': 'releaseinfo',
're': [
'<h4[^>]*?id="akas"[^>]*?>(.*?)</table>',
"td[^>]*?>(.*?)</td>.*?<td[^>]*?>(.*?)</td>"
'<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">([^>]+)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>',
],
'type': 'list'
},
@ -152,11 +235,6 @@ class Imdb(SiteParser):
'type': 'list'
},
'cinematographer': reference_section('cinematographers'),
'connections': {
'page': 'movieconnections',
're': '<h4 class="li_group">(.*?)</h4>(.*?)(<\/div>\n <a|<script)',
'type': 'list'
},
'country': zebra_list('Country', more=['<a.*?>(.*?)</a>']),
'director': reference_section('directors'),
'editor': reference_section('editors'),
@ -186,7 +264,7 @@ class Imdb(SiteParser):
'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
'originalTitle': {
'page': 'releaseinfo',
're': '<td.*?>\s*?\(original title\)\s*?</td>\s*<td.*?>(.*?)</td>',
're': '<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">\(original title\)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>',
'type': 'string'
},
'summary': zebra_table('Plot Summary', more=[
@ -219,14 +297,6 @@ class Imdb(SiteParser):
],
'type': 'float'
},
'releasedate': {
'page': 'releaseinfo',
're': [
'<td class="release-date-item__date".*?>(.*?)</td>',
strip_tags,
],
'type': 'list'
},
#FIXME using some /offsite/ redirect now
#'reviews': {
# 'page': 'externalreviews',
@ -242,11 +312,6 @@ class Imdb(SiteParser):
lambda r: r[0] if isinstance(r, list) else r,
strip_tags
]),
'sound': zebra_list('Sound Mix', more=[
'<a.*?>([^(<]+)',
lambda r: r[0] if isinstance(r, list) else r,
strip_tags
]),
'season': {
'page': 'reference',
're': [
@ -275,7 +340,7 @@ class Imdb(SiteParser):
},
'title': {
'page': 'releaseinfo',
're': 'h3 itemprop="name">.*?>(.*?)</a>',
're': '<h2.*?>(.*?)</h2>',
'type': 'string'
},
'trivia': {
@ -314,9 +379,6 @@ class Imdb(SiteParser):
},
'laboratory': technical('Laboratory'),
'camera': technical('Camera'),
'negative format': technical('Negative Format'),
'cinematographic process': technical('Cinematographic Process'),
'printed film format': technical('Printed Film Format'),
}
def read_url(self, url, timeout):
@ -326,9 +388,24 @@ class Imdb(SiteParser):
self._cache[url] = read_url(url, timeout=timeout, unicode=True)
return self._cache[url]
def get_page_data(self, page, timeout=-1):
    """Fetch an IMDb page and return its embedded __NEXT_DATA__ JSON.

    Returns the parsed JSON dict, or {} when the page carries no
    __NEXT_DATA__ script tag.
    """
    html = self.read_url(self.get_url(page), timeout)
    match = re.search(
        '<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        html,
        re.DOTALL,
    )
    if match:
        return json.loads(match.group(1))
    return {}
def __init__(self, id, timeout=-1):
# http://www.imdb.com/help/show_leaf?titlelanguagedisplay
self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
if timeout != 0:
self._cache = {}
url = self.baseUrl + 'releaseinfo'
page = self.read_url(url, timeout=-1)
if '<h2>See also</h2>' in page:
timeout = 0
super(Imdb, self).__init__(timeout)
url = self.baseUrl + 'reference'
@ -417,26 +494,6 @@ class Imdb(SiteParser):
self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
for x in self['cast']]
if 'connections' in self:
cc={}
if len(self['connections']) == 3 and isinstance(self['connections'][0], string_types):
self['connections'] = [self['connections']]
for rel, data, _ in self['connections']:
if isinstance(rel, bytes):
rel = rel.decode('utf-8')
#cc[rel] = re.compile('<a href="/title/tt(\d+)/">(.*?)</a>').findall(data)
def get_conn(c):
r = {
'id': c[0],
'title': cleanup_title(c[1]),
}
description = c[2].split('<br />')
if len(description) == 2 and description[-1].strip() != '-':
r['description'] = description[-1].strip()
return r
cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d+)/?">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
self['connections'] = cc
if 'isSeries' in self:
del self['isSeries']
@ -461,21 +518,10 @@ class Imdb(SiteParser):
if 'budget' in self and 'gross' in self:
self['profit'] = self['gross'] - self['budget']
if 'releasedate' in self:
def parse_date(d):
try:
d = datetime.strptime(d, '%d %B %Y')
except:
try:
d = datetime.strptime(d, '%B %Y')
except:
return 'x'
return '%d-%02d-%02d' % (d.year, d.month, d.day)
self['releasedate'] = min([
parse_date(d) for d in self['releasedate']
])
if self['releasedate'] == 'x':
del self['releasedate']
metadata = self.get_page_data('releaseinfo')
releasedate = get_release_date(metadata)
if releasedate:
self['releasedate'] = releasedate
if 'summary' not in self and 'storyline' in self:
self['summary'] = self.pop('storyline')
@ -483,6 +529,20 @@ class Imdb(SiteParser):
if isinstance(self['summary'], list):
self['summary'] = self['summary'][0]
self['summary'] = strip_tags(self['summary'].split('</p')[0]).split(' Written by\n')[0].strip()
else:
try:
summary = metadata['props']['pageProps']['contentData']['entityMetadata']['plot']['plotText']['plainText']
self['summary'] = summary
except:
pass
self['connections'] = movie_connections(self.get_page_data('movieconnections'))
spec = tech_spec(self.get_page_data('technical'))
for key in spec:
if not self.get(key):
self[key] = spec[key]
if 'credits' in self:
credits = [