# Reconstructed from the collapsed patch "fix imdb parsing" (commit a3cef06)
# against ox/web/imdb.py: the JSON-metadata helpers that patch adds, restored
# to regular Python layout.
#
# The same patch also adds ``import json`` and drops ``text_type`` from the
# ``six`` import; its changes to ``class Imdb`` are cut off in this view and
# are not reproduced here.
#
# NOTE(review): ``re`` and ``strip_tags`` are imported at the top of
# ox/web/imdb.py according to the patch context.  ``datetime`` is not imported
# anywhere in the visible span -- presumably the module imports it elsewhere;
# confirm.


def tech_spec(metadata):
    '''
    Collect the technical specification rows of a parsed metadata dict.

    Reads metadata['props']['pageProps']['contentData']['section']['items']
    and returns a dict mapping each lower-cased 'rowTitle' to the list of
    'text' values found in that row's 'listContent'.  The titles
    'aspect ratio' and 'sound mix' are stored as 'aspectratio' and 'sound'.
    If a title occurs in several rows, the values of the last row win.
    '''
    renamed = {
        'aspect ratio': 'aspectratio',
        'sound mix': 'sound',
    }
    tech = {}
    for row in metadata['props']['pageProps']['contentData']['section']['items']:
        label = row['rowTitle'].lower()
        tech[renamed.get(label, label)] = [
            content['text'] for content in row['listContent']
        ]
    return tech


# Captures the text of the first link in an HTML fragment.
# NOTE(review): the pattern found in the patch text was '(.*?)', which always
# matches the empty string; presumably the surrounding tag markup was lost
# when the patch was extracted -- confirm against upstream.
_LINK_TEXT = r'<a[^>]*>(.*?)</a>'


def movie_connections(metadata):
    '''
    Group connected titles by category name.

    Reads metadata['props']['pageProps']['contentData']['categories'] and
    returns a dict mapping each category 'name' to a list of dicts with:

      'id'          -- the item id without its first two characters
      'title'       -- text of the first link in the first 'listContent'
                       entry's 'html' (the tag-stripped html if it contains
                       no link)
      'description' -- tag-stripped 'html' of the second 'listContent'
                       entry, only present if there is one
    '''
    connections = {}
    for row in metadata['props']['pageProps']['contentData']['categories']:
        entries = connections.setdefault(row['name'], [])
        for item in row['section']['items']:
            contents = item['listContent']
            html = contents[0]['html']
            link = re.search(_LINK_TEXT, html)
            entry = {
                # presumably drops the 'tt' prefix of the id
                'id': item['id'][2:],
                'title': link.group(1) if link else strip_tags(html),
            }
            if len(contents) >= 2:
                entry['description'] = strip_tags(contents[1]['html'])
            entries.append(entry)
    return connections


def get_category_by_id(metadata, id):
    '''
    Return the first entry of
    metadata['props']['pageProps']['contentData']['categories'] whose 'id'
    equals *id*, or None if there is no such entry.
    '''
    for category in metadata['props']['pageProps']['contentData']['categories']:
        if category['id'] == id:
            return category
    return None


# Date layouts tried in order, e.g. 'March 24, 1972', '24 March 1972' and
# 'March 1972' (the last one yields day 01).
_RELEASE_DATE_FORMATS = (
    '%B %d, %Y',
    '%d %B %Y',
    '%B %Y',
)


def _parse_release_date(text):
    '''Return *text* as 'YYYY-MM-DD', or None if no known layout matches.'''
    for fmt in _RELEASE_DATE_FORMATS:
        try:
            parsed = datetime.strptime(text, fmt)
        except (ValueError, TypeError):
            # ValueError: text does not match fmt; TypeError: text is not a
            # string.  Anything else (e.g. a missing import) must surface
            # instead of silently turning every date into None.
            continue
        return '%d-%02d-%02d' % (parsed.year, parsed.month, parsed.day)
    return None


def get_release_date(metadata):
    '''
    Return the earliest parseable release date as 'YYYY-MM-DD'.

    Looks at the 'text' of the first 'listContent' entry of every item in
    the 'releases' category.  Returns None if that category is missing or
    none of its dates can be parsed.
    '''
    releases = get_category_by_id(metadata, 'releases')
    if releases is None:
        return None
    dates = []
    for item in releases['section']['items']:
        date = _parse_release_date(item['listContent'][0]['text'])
        if date:
            dates.append(date)
    # the fixed-width 'YYYY-MM-DD' form sorts chronologically as a string
    return min(dates) if dates else None


def alternative_titles(metadata):
    '''
    Return the alternative titles listed in the 'akas' category.

    Each result is a dict with 'title' (the 'text' of the item's first
    'listContent' entry) and 'country' (the item's 'rowTitle'), plus
    'subText' when that entry has a non-empty one.  Returns an empty list
    if the category is missing.
    '''
    akas = get_category_by_id(metadata, 'akas')
    if akas is None:
        return []
    titles = []
    for row in akas['section']['items']:
        content = row['listContent'][0]
        title = {
            'title': content['text'],
            'country': row['rowTitle'],
        }
        if content.get('subText'):
            title['subText'] = content['subText']
        titles.append(title)
    return titles