From d630f4b19c63808fadd76bc774982f27d1f82d70 Mon Sep 17 00:00:00 2001
From: j
Date: Thu, 6 Jul 2023 18:32:45 +0530
Subject: [PATCH] parse keywords

---
 ox/web/imdb.py | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/ox/web/imdb.py b/ox/web/imdb.py
index e9f1973..755a63e 100644
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -178,6 +178,22 @@ def get_release_date(metadata):
         return min(dates)
 
 
+def get_keywords(metadata):
+    """Return the 'rowTitle' of every item in the keywords page metadata.
+
+    Best effort: an empty list is returned when the expected keys are missing.
+    """
+    try:
+        keywords = [
+            row['rowTitle']
+            for row in metadata['props']['pageProps']['contentData']['section']['items']
+        ]
+    except (KeyError, TypeError):
+        # missing key or non-subscriptable value (e.g. None): no keywords
+        keywords = []
+    return keywords
+
+
 def get_entity_metadata(metadata):
     data = {}
     entity = metadata['props']['pageProps']['contentData']['entityMetadata']
@@ -276,11 +292,6 @@ class Imdb(SiteParser):
         'gross': zebra_table('Cumulative Worldwide Gross', more=[
             lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
         ], type='int'),
-        'keyword': {
-            'page': 'keywords',
-            're': 'data-item-keyword="(.*?)"',
-            'type': 'list'
-        },
         'language': zebra_list('Language', more=['(.*?)']),
         'originalTitle': {
             'page': 'releaseinfo',
@@ -543,6 +554,11 @@ class Imdb(SiteParser):
         if releasedate:
             self['releasedate'] = releasedate
 
+        metadata = self.get_page_data('keywords')
+        keywords = get_keywords(metadata)
+        if keywords:
+            self['keywords'] = keywords
+
         if 'summary' not in self and 'storyline' in self:
             self['summary'] = self.pop('storyline')
         if 'summary' in self: