fix imdb parsing

This commit is contained in:
j 2023-02-03 18:28:49 +01:00
parent e1657994ca
commit a3cef06ad7

View file

@ -2,12 +2,13 @@
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function from __future__ import print_function
import json
import re import re
import time import time
import unicodedata import unicodedata
from six.moves.urllib.parse import urlencode from six.moves.urllib.parse import urlencode
from six import text_type, string_types from six import string_types
from .. import find_re, strip_tags, decode_html from .. import find_re, strip_tags, decode_html
from .. import cache from .. import cache
@ -106,6 +107,89 @@ def technical(label):
} }
def tech_spec(metadata):
    '''
    Collect the technical specification rows from a decoded page payload.

    metadata is the nested dict decoded from a page's embedded JSON; the rows
    are read from metadata['props']['pageProps']['contentData']['section']['items'].

    Returns a dict mapping each lower-cased row title to the list of 'text'
    values found in that row's 'listContent'. Two titles are renamed:
    'aspect ratio' becomes 'aspectratio' and 'sound mix' becomes 'sound'.
    If a title occurs more than once, the last row wins.

    Returns an empty dict when the payload does not contain that section
    (for example when an empty dict is passed in) instead of raising.
    '''
    renamed = {
        'aspect ratio': 'aspectratio',
        'sound mix': 'sound',
    }
    try:
        rows = metadata['props']['pageProps']['contentData']['section']['items']
    except (KeyError, TypeError):
        # payload without the expected section: nothing to report
        return {}
    tech = {}
    for row in rows:
        label = row['rowTitle'].lower()
        tech[renamed.get(label, label)] = [
            content['text'] for content in row['listContent']
        ]
    return tech
def movie_connections(metadata):
    '''
    Group the connection entries of a decoded page payload by category name.

    metadata is the nested dict decoded from a page's embedded JSON; the
    categories are read from metadata['props']['pageProps']['contentData']['categories'].

    Returns a dict mapping each category 'name' to a list of dicts with:
      'id'          - item['id'] without its first two characters
                      (presumably the 'tt' prefix - confirm against the payload)
      'title'       - text of the first link in the first content cell, or the
                      tag-stripped cell when it contains no link
      'description' - tag-stripped second content cell, only if there is one

    Returns an empty dict when the payload has no categories (for example
    when an empty dict is passed in) instead of raising.
    '''
    try:
        categories = metadata['props']['pageProps']['contentData']['categories']
    except (KeyError, TypeError):
        # payload without the expected section: no connections
        return {}
    # compiled once, reused for every item
    link_text = re.compile('<a.*?>(.*?)</a>')
    connections = {}
    for category in categories:
        entries = connections.setdefault(category['name'], [])
        for item in category['section']['items']:
            cells = item['listContent']
            html = cells[0]['html']
            match = link_text.search(html)
            entry = {
                'id': item['id'][2:],
                # a cell without an anchor used to raise IndexError
                'title': match.group(1) if match else strip_tags(html),
            }
            if len(cells) >= 2:
                entry['description'] = strip_tags(cells[1]['html'])
            entries.append(entry)
    return connections
def get_category_by_id(metadata, id):
    '''
    Return the first category of a decoded page payload whose 'id' equals id.

    metadata is the nested dict decoded from a page's embedded JSON; the
    categories are read from metadata['props']['pageProps']['contentData']['categories'].

    Returns the matching category dict, or None when no category matches or
    the payload has no categories at all (for example an empty dict).
    '''
    try:
        categories = metadata['props']['pageProps']['contentData']['categories']
    except (KeyError, TypeError):
        # payload without the expected section: treat like "not found"
        return None
    for category in categories:
        if category['id'] == id:
            return category
    return None
def get_release_date(metadata):
    '''
    Return the earliest parseable release date of a decoded page payload.

    The dates are taken from the 'text' of the first 'listContent' entry of
    every item in the 'releases' category (looked up with
    get_category_by_id). Three layouts are tried per value, in this order:
    '%B %d, %Y', '%d %B %Y' and '%B %Y'; values matching none are ignored.

    Returns a 'YYYY-MM-DD' string (a month-only value gets day 01), or None
    when there is no 'releases' category or no value could be parsed.
    '''
    formats = (
        '%B %d, %Y',
        '%d %B %Y',
        '%B %Y',
    )

    def parse_date(d):
        # first layout that fits wins; only parsing failures are swallowed
        for fmt in formats:
            try:
                parsed = datetime.strptime(d, fmt)
            except (ValueError, TypeError):
                continue
            return '%d-%02d-%02d' % (parsed.year, parsed.month, parsed.day)
        return None

    releases = get_category_by_id(metadata, 'releases')
    if not releases:
        # get_category_by_id returns None when the category is missing
        return None
    dates = []
    for item in releases['section']['items']:
        date = parse_date(item['listContent'][0]['text'])
        if date:
            dates.append(date)
    # zero-padded ISO strings sort chronologically, so min() is the earliest
    return min(dates) if dates else None
def alternative_titles(metadata):
    '''
    List the alternative titles found in a decoded page payload.

    The rows are the items of the 'akas' category (looked up with
    get_category_by_id). For every row the first 'listContent' entry supplies
    the title text and the row's 'rowTitle' is stored as the country.

    Returns a list of dicts with 'title' and 'country', plus 'subText' when
    the content entry carries a non-empty one. Returns an empty list when
    there is no 'akas' category.
    '''
    akas = get_category_by_id(metadata, 'akas')
    if not akas:
        # get_category_by_id returns None when the category is missing
        return []
    titles = []
    for row in akas['section']['items']:
        content = row['listContent'][0]
        entry = {
            'title': content['text'],
            'country': row['rowTitle'],
        }
        if content.get('subText'):
            entry['subText'] = content['subText']
        titles.append(entry)
    return titles
''' '''
'posterIds': { 'posterIds': {
'page': 'posters', 'page': 'posters',
@ -116,18 +200,17 @@ def technical(label):
class Imdb(SiteParser): class Imdb(SiteParser):
''' '''
>>> Imdb('0068646')['title'] == text_type(u'The Godfather') >>> Imdb('0068646')['title'] == 'The Godfather'
True True
>>> Imdb('0133093')['title'] == text_type(u'The Matrix') >>> Imdb('0133093')['title'] == 'The Matrix'
True True
''' '''
regex = { regex = {
'alternativeTitles': { 'alternativeTitles': {
'page': 'releaseinfo', 'page': 'releaseinfo',
're': [ 're': [
'<h4[^>]*?id="akas"[^>]*?>(.*?)</table>', '<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">([^>]+)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>',
"td[^>]*?>(.*?)</td>.*?<td[^>]*?>(.*?)</td>"
], ],
'type': 'list' 'type': 'list'
}, },
@ -152,11 +235,6 @@ class Imdb(SiteParser):
'type': 'list' 'type': 'list'
}, },
'cinematographer': reference_section('cinematographers'), 'cinematographer': reference_section('cinematographers'),
'connections': {
'page': 'movieconnections',
're': '<h4 class="li_group">(.*?)</h4>(.*?)(<\/div>\n <a|<script)',
'type': 'list'
},
'country': zebra_list('Country', more=['<a.*?>(.*?)</a>']), 'country': zebra_list('Country', more=['<a.*?>(.*?)</a>']),
'director': reference_section('directors'), 'director': reference_section('directors'),
'editor': reference_section('editors'), 'editor': reference_section('editors'),
@ -186,7 +264,7 @@ class Imdb(SiteParser):
'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']), 'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
'originalTitle': { 'originalTitle': {
'page': 'releaseinfo', 'page': 'releaseinfo',
're': '<td.*?>\s*?\(original title\)\s*?</td>\s*<td.*?>(.*?)</td>', 're': '<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">\(original title\)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>',
'type': 'string' 'type': 'string'
}, },
'summary': zebra_table('Plot Summary', more=[ 'summary': zebra_table('Plot Summary', more=[
@ -219,14 +297,6 @@ class Imdb(SiteParser):
], ],
'type': 'float' 'type': 'float'
}, },
'releasedate': {
'page': 'releaseinfo',
're': [
'<td class="release-date-item__date".*?>(.*?)</td>',
strip_tags,
],
'type': 'list'
},
#FIXME using some /offsite/ redirect now #FIXME using some /offsite/ redirect now
#'reviews': { #'reviews': {
# 'page': 'externalreviews', # 'page': 'externalreviews',
@ -242,11 +312,6 @@ class Imdb(SiteParser):
lambda r: r[0] if isinstance(r, list) else r, lambda r: r[0] if isinstance(r, list) else r,
strip_tags strip_tags
]), ]),
'sound': zebra_list('Sound Mix', more=[
'<a.*?>([^(<]+)',
lambda r: r[0] if isinstance(r, list) else r,
strip_tags
]),
'season': { 'season': {
'page': 'reference', 'page': 'reference',
're': [ 're': [
@ -275,7 +340,7 @@ class Imdb(SiteParser):
}, },
'title': { 'title': {
'page': 'releaseinfo', 'page': 'releaseinfo',
're': 'h3 itemprop="name">.*?>(.*?)</a>', 're': '<h2.*?>(.*?)</h2>',
'type': 'string' 'type': 'string'
}, },
'trivia': { 'trivia': {
@ -314,9 +379,6 @@ class Imdb(SiteParser):
}, },
'laboratory': technical('Laboratory'), 'laboratory': technical('Laboratory'),
'camera': technical('Camera'), 'camera': technical('Camera'),
'negative format': technical('Negative Format'),
'cinematographic process': technical('Cinematographic Process'),
'printed film format': technical('Printed Film Format'),
} }
def read_url(self, url, timeout): def read_url(self, url, timeout):
@ -326,9 +388,24 @@ class Imdb(SiteParser):
self._cache[url] = read_url(url, timeout=timeout, unicode=True) self._cache[url] = read_url(url, timeout=timeout, unicode=True)
return self._cache[url] return self._cache[url]
def get_page_data(self, page, timeout=-1):
    '''
    Fetch a page and decode the JSON embedded in its __NEXT_DATA__ script tag.

    page is handed to self.get_url() to build the address, which is then
    loaded through self.read_url() with the given timeout.

    Returns the decoded JSON of the first matching script tag, or an empty
    dict when the page contains no such tag.
    '''
    html = self.read_url(self.get_url(page), timeout)
    found = re.search(
        '<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        html,
        re.DOTALL,
    )
    if not found:
        return {}
    return json.loads(found.group(1))
def __init__(self, id, timeout=-1): def __init__(self, id, timeout=-1):
# http://www.imdb.com/help/show_leaf?titlelanguagedisplay # http://www.imdb.com/help/show_leaf?titlelanguagedisplay
self.baseUrl = "http://www.imdb.com/title/tt%s/" % id self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
if timeout != 0:
self._cache = {}
url = self.baseUrl + 'releaseinfo'
page = self.read_url(url, timeout=-1)
if '<h2>See also</h2>' in page:
timeout = 0
super(Imdb, self).__init__(timeout) super(Imdb, self).__init__(timeout)
url = self.baseUrl + 'reference' url = self.baseUrl + 'reference'
@ -417,26 +494,6 @@ class Imdb(SiteParser):
self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])} self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
for x in self['cast']] for x in self['cast']]
if 'connections' in self:
cc={}
if len(self['connections']) == 3 and isinstance(self['connections'][0], string_types):
self['connections'] = [self['connections']]
for rel, data, _ in self['connections']:
if isinstance(rel, bytes):
rel = rel.decode('utf-8')
#cc[rel] = re.compile('<a href="/title/tt(\d+)/">(.*?)</a>').findall(data)
def get_conn(c):
r = {
'id': c[0],
'title': cleanup_title(c[1]),
}
description = c[2].split('<br />')
if len(description) == 2 and description[-1].strip() != '-':
r['description'] = description[-1].strip()
return r
cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d+)/?">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
self['connections'] = cc
if 'isSeries' in self: if 'isSeries' in self:
del self['isSeries'] del self['isSeries']
@ -461,21 +518,10 @@ class Imdb(SiteParser):
if 'budget' in self and 'gross' in self: if 'budget' in self and 'gross' in self:
self['profit'] = self['gross'] - self['budget'] self['profit'] = self['gross'] - self['budget']
if 'releasedate' in self: metadata = self.get_page_data('releaseinfo')
def parse_date(d): releasedate = get_release_date(metadata)
try: if releasedate:
d = datetime.strptime(d, '%d %B %Y') self['releasedate'] = releasedate
except:
try:
d = datetime.strptime(d, '%B %Y')
except:
return 'x'
return '%d-%02d-%02d' % (d.year, d.month, d.day)
self['releasedate'] = min([
parse_date(d) for d in self['releasedate']
])
if self['releasedate'] == 'x':
del self['releasedate']
if 'summary' not in self and 'storyline' in self: if 'summary' not in self and 'storyline' in self:
self['summary'] = self.pop('storyline') self['summary'] = self.pop('storyline')
@ -483,6 +529,20 @@ class Imdb(SiteParser):
if isinstance(self['summary'], list): if isinstance(self['summary'], list):
self['summary'] = self['summary'][0] self['summary'] = self['summary'][0]
self['summary'] = strip_tags(self['summary'].split('</p')[0]).split(' Written by\n')[0].strip() self['summary'] = strip_tags(self['summary'].split('</p')[0]).split(' Written by\n')[0].strip()
else:
try:
summary = metadata['props']['pageProps']['contentData']['entityMetadata']['plot']['plotText']['plainText']
self['summary'] = summary
except:
pass
self['connections'] = movie_connections(self.get_page_data('movieconnections'))
spec = tech_spec(self.get_page_data('technical'))
for key in spec:
if not self.get(key):
self[key] = spec[key]
if 'credits' in self: if 'credits' in self:
credits = [ credits = [