fix imdb parsing

2023-02-03 18:28:49 +01:00 · 2023-02-03 18:28:49 +01:00 · a3cef06ad7
commit a3cef06ad7
parent e1657994ca
1 changed files with 123 additions and 63 deletions
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@ -2,12 +2,13 @@
 # vi:si:et:sw=4:sts=4:ts=4
 from __future__ import print_function
 import json
 import re
 import time
 import unicodedata
 from six.moves.urllib.parse import urlencode
-from six import text_type, string_types
+from six import string_types
 from .. import find_re, strip_tags, decode_html
 from .. import cache
@ -106,6 +107,89 @@ def technical(label):
    }
 def tech_spec(metadata):
    '''
    Collect technical specifications from a page's embedded JSON data.
    metadata is the decoded JSON of a page; rows are read from
    props.pageProps.contentData.section.items.
    Returns a dict mapping the lower-cased row title to the list of
    'text' values of that row ('aspect ratio' is stored as 'aspectratio',
    'sound mix' as 'sound'). Returns an empty dict if metadata does not
    contain that path, e.g. if it is an empty dict.
    '''
    try:
        rows = metadata['props']['pageProps']['contentData']['section']['items']
    except (KeyError, TypeError):
        # no embedded data or unexpected layout: nothing to report
        return {}
    # row titles that are stored under a different key
    aliases = {
        'aspect ratio': 'aspectratio',
        'sound mix': 'sound',
    }
    tech = {}
    for row in rows or []:
        label = row['rowTitle'].lower()
        # a later row with the same title replaces the earlier one
        tech[aliases.get(label, label)] = [
            content['text'] for content in row['listContent']
        ]
    return tech
 def movie_connections(metadata):
    '''
    Collect connections to other titles from a page's embedded JSON data.
    metadata is the decoded JSON of a page; categories are read from
    props.pageProps.contentData.categories.
    Returns a dict mapping each category name to a list of dicts with
    'id' (the item id without its first two characters), 'title' and,
    if the item has a second content entry, 'description'. Categories
    with the same name are merged. Returns an empty dict if metadata
    does not contain that path, e.g. if it is an empty dict.
    '''
    try:
        categories = metadata['props']['pageProps']['contentData']['categories']
    except (KeyError, TypeError):
        # no embedded data or unexpected layout: nothing to report
        return {}
    link = re.compile('<a.*?>(.*?)</a>')
    connections = {}
    for row in categories or []:
        entries = connections.setdefault(row['name'], [])
        for item in row['section']['items']:
            content = item['listContent']
            html = content[0]['html']
            linked = link.findall(html)
            entry = {
                'id': item['id'][2:],
                # the title is the text of the first link; fall back to
                # the plain text if the entry is not linked
                'title': linked[0] if linked else strip_tags(html),
            }
            if len(content) >= 2:
                entry['description'] = strip_tags(content[1]['html'])
            entries.append(entry)
    return connections
 def get_category_by_id(metadata, id):
    '''
    Return the category with the given id from a page's embedded JSON data.
    Categories are read from props.pageProps.contentData.categories.
    Returns the first category dict whose 'id' equals id, or None if
    there is no such category or metadata does not contain that path,
    e.g. if it is an empty dict.
    '''
    node = metadata
    for key in ('props', 'pageProps', 'contentData', 'categories'):
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    for category in node or []:
        if category['id'] == id:
            return category
    return None
 def get_release_date(metadata):
    '''
    Return the earliest release date found in a page's embedded JSON data.
    The dates are the 'text' of the first content entry of every item
    in the 'releases' category. Texts that match none of the known
    formats are skipped.
    Returns the date as a 'YYYY-MM-DD' string, or None if there is no
    'releases' category or no text could be parsed.
    '''
    formats = (
        '%B %d, %Y',
        '%d %B %Y',
        '%B %Y',
    )
    def parse_date(d):
        # try each known format, None if none of them matches
        for fmt in formats:
            try:
                parsed = datetime.strptime(d, fmt)
            except (ValueError, TypeError):
                # wrong format (or not a string), try the next one
                continue
            return '%d-%02d-%02d' % (parsed.year, parsed.month, parsed.day)
        return None
    releases = get_category_by_id(metadata, 'releases')
    if not releases:
        return None
    dates = []
    for item in releases['section']['items']:
        date = parse_date(item['listContent'][0]['text'])
        if date:
            dates.append(date)
    # the zero-padded strings sort chronologically
    return min(dates) if dates else None
 def alternative_titles(metadata):
    '''
    Collect alternative titles from a page's embedded JSON data.
    Reads the items of the 'akas' category and returns a list of dicts
    with 'title' (text of the first content entry), 'country' (the row
    title) and, if the entry has a non-empty one, 'subText'.
    Returns an empty list if there is no 'akas' category.
    '''
    akas = get_category_by_id(metadata, 'akas')
    if not akas:
        return []
    titles = []
    for row in akas['section']['items']:
        content = row['listContent'][0]
        title = {
            'title': content['text'],
            'country': row['rowTitle'],
        }
        if content.get('subText'):
            title['subText'] = content['subText']
        titles.append(title)
    return titles
 '''
 'posterIds': {
    'page': 'posters',
@ -116,18 +200,17 @@ def technical(label):
 class Imdb(SiteParser):
    '''
-    >>> Imdb('0068646')['title'] == text_type(u'The Godfather')
+    >>> Imdb('0068646')['title'] == 'The Godfather'
    True
-    >>> Imdb('0133093')['title'] == text_type(u'The Matrix')
+    >>> Imdb('0133093')['title'] == 'The Matrix'
    True
    '''
    regex = {
        'alternativeTitles': {
            'page': 'releaseinfo',
            're': [
-                '<h4[^>]*?id="akas"[^>]*?>(.*?)</table>',
+                '<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">([^>]+)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>',
                "td[^>]*?>(.*?)</td>.*?<td[^>]*?>(.*?)</td>"
            ],
            'type': 'list'
        },
@ -152,11 +235,6 @@ class Imdb(SiteParser):
            'type': 'list'
        },
        'cinematographer': reference_section('cinematographers'),
        'connections': {
            'page': 'movieconnections',
            're': '<h4 class="li_group">(.*?)</h4>(.*?)(<\/div>\n  <a|<script)',
            'type': 'list'
        },
        'country': zebra_list('Country', more=['<a.*?>(.*?)</a>']),
        'director': reference_section('directors'),
        'editor': reference_section('editors'),
@ -186,7 +264,7 @@ class Imdb(SiteParser):
        'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
        'originalTitle': {
            'page': 'releaseinfo',
-            're': '<td.*?>\s*?\(original title\)\s*?</td>\s*<td.*?>(.*?)</td>',
+            're': '<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">\(original title\)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>',
            'type': 'string'
        },
        'summary': zebra_table('Plot Summary', more=[
@ -219,14 +297,6 @@ class Imdb(SiteParser):
            ],
            'type': 'float'
        },
        'releasedate': {
            'page': 'releaseinfo',
            're': [
                '<td class="release-date-item__date".*?>(.*?)</td>',
                strip_tags,
            ],
            'type': 'list'
        },
        #FIXME using some /offsite/ redirect now
        #'reviews': {
        #    'page': 'externalreviews',
@ -242,11 +312,6 @@ class Imdb(SiteParser):
            lambda r: r[0] if isinstance(r, list) else r,
            strip_tags
        ]),
        'sound': zebra_list('Sound Mix', more=[
            '<a.*?>([^(<]+)',
            lambda r: r[0] if isinstance(r, list) else r,
            strip_tags
        ]),
        'season': {
            'page': 'reference',
            're': [
@ -275,7 +340,7 @@ class Imdb(SiteParser):
        },
        'title': {
            'page': 'releaseinfo',
-            're': 'h3 itemprop="name">.*?>(.*?)</a>',
+            're': '<h2.*?>(.*?)</h2>',
            'type': 'string'
        },
        'trivia': {
@ -314,9 +379,6 @@ class Imdb(SiteParser):
        },
        'laboratory': technical('Laboratory'),
        'camera': technical('Camera'),
        'negative format': technical('Negative Format'),
        'cinematographic process': technical('Cinematographic Process'),
        'printed film format': technical('Printed Film Format'),
    }
    def read_url(self, url, timeout):
@ -326,9 +388,24 @@ class Imdb(SiteParser):
            self._cache[url] = read_url(url, timeout=timeout, unicode=True)
        return self._cache[url]
    def get_page_data(self, page, timeout=-1):
        '''
        Return the JSON data embedded in a page as a decoded object.
        The page is read via self.read_url from the url self.get_url(page)
        and the content of the first __NEXT_DATA__ script element is
        decoded. Returns an empty dict if the page has no such element.
        '''
        html = self.read_url(self.get_url(page), timeout)
        match = re.search(
            '<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            html, re.DOTALL)
        if match is None:
            return {}
        return json.loads(match.group(1))
    def __init__(self, id, timeout=-1):
        # http://www.imdb.com/help/show_leaf?titlelanguagedisplay
        self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
        if timeout != 0:
            self._cache = {}
            url = self.baseUrl + 'releaseinfo'
            page = self.read_url(url, timeout=-1)
            if '<h2>See also</h2>' in page:
                timeout = 0
        super(Imdb, self).__init__(timeout)
        url = self.baseUrl + 'reference'
@ -417,26 +494,6 @@ class Imdb(SiteParser):
            self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
                            for x in self['cast']]
        if 'connections' in self:
            cc={}
            if len(self['connections']) == 3 and isinstance(self['connections'][0], string_types):
                self['connections'] = [self['connections']]
            for rel, data, _ in self['connections']:
                if isinstance(rel, bytes):
                    rel = rel.decode('utf-8')
                #cc[rel] = re.compile('<a href="/title/tt(\d+)/">(.*?)</a>').findall(data)
                def get_conn(c):
                    r = {
                        'id': c[0],
                        'title': cleanup_title(c[1]),
                    }
                    description = c[2].split('<br />')
                    if len(description) == 2 and description[-1].strip() != '-':
                        r['description'] = description[-1].strip()
                    return r
                cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d+)/?">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
            self['connections'] = cc
        if 'isSeries' in self:
            del self['isSeries']
@ -461,21 +518,10 @@ class Imdb(SiteParser):
        if 'budget' in self and 'gross' in self:
            self['profit'] = self['gross'] - self['budget']
-        if 'releasedate' in self:
+        metadata = self.get_page_data('releaseinfo')
-            def parse_date(d):
+        releasedate = get_release_date(metadata)
-                try:
+        if releasedate:
-                    d = datetime.strptime(d, '%d %B %Y')
+            self['releasedate'] = releasedate
                except:
                    try:
                        d = datetime.strptime(d, '%B %Y')
                    except:
                        return 'x'
                return '%d-%02d-%02d' % (d.year, d.month, d.day)
            self['releasedate'] = min([
                parse_date(d) for d in self['releasedate']
            ])
            if self['releasedate'] == 'x':
                del self['releasedate']
        if 'summary' not in self and 'storyline' in self:
            self['summary'] = self.pop('storyline')
@ -483,6 +529,20 @@ class Imdb(SiteParser):
            if isinstance(self['summary'], list):
                self['summary'] = self['summary'][0]
            self['summary'] = strip_tags(self['summary'].split('</p')[0]).split('  Written by\n')[0].strip()
        else:
            try:
                summary = metadata['props']['pageProps']['contentData']['entityMetadata']['plot']['plotText']['plainText']
                self['summary'] = summary
            except:
                pass
        self['connections'] = movie_connections(self.get_page_data('movieconnections'))
        spec = tech_spec(self.get_page_data('technical'))
        for key in spec:
            if not self.get(key):
                self[key] = spec[key]
        if 'credits' in self:
            credits = [