fix imdb parsing
This commit is contained in:
parent
e1657994ca
commit
a3cef06ad7
1 changed file with 123 additions and 63 deletions
186
ox/web/imdb.py
186
ox/web/imdb.py
|
@ -2,12 +2,13 @@
|
||||||
# vi:si:et:sw=4:sts=4:ts=4
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import json
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
|
||||||
from six.moves.urllib.parse import urlencode
|
from six.moves.urllib.parse import urlencode
|
||||||
from six import text_type, string_types
|
from six import string_types
|
||||||
|
|
||||||
from .. import find_re, strip_tags, decode_html
|
from .. import find_re, strip_tags, decode_html
|
||||||
from .. import cache
|
from .. import cache
|
||||||
|
@ -106,6 +107,89 @@ def technical(label):
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def tech_spec(metadata):
    """Collect technical specifications from a parsed page payload.

    Reads the rows found at
    ``metadata['props']['pageProps']['contentData']['section']['items']``
    and maps each lower-cased ``rowTitle`` to the list of ``text`` values of
    that row's ``listContent`` entries. The titles 'aspect ratio' and
    'sound mix' are stored under the keys 'aspectratio' and 'sound'.

    Rows that share a title are merged into one list instead of the later
    row replacing the earlier one.

    Returns a dict of ``title -> [values]``; an empty dict when the payload
    does not contain the expected path (for example an empty dict).
    """
    renames = {
        'aspect ratio': 'aspectratio',
        'sound mix': 'sound',
    }
    try:
        rows = metadata['props']['pageProps']['contentData']['section']['items']
    except (KeyError, TypeError):
        # No usable payload: report "no specs" rather than crash the caller.
        return {}

    tech = {}
    for row in rows:
        label = row['rowTitle'].lower()
        values = tech.setdefault(renames.get(label, label), [])
        values.extend(content['text'] for content in row['listContent'])
    return tech
|
||||||
|
|
||||||
|
|
||||||
|
def movie_connections(metadata):
    """Group a title's connections by category name.

    Reads ``metadata['props']['pageProps']['contentData']['categories']``.
    For every item of a category's ``section.items`` an entry is built with:

    * ``id``: the item's ``id`` with its first two characters removed,
    * ``title``: the text of the first ``<a>`` element in the first
      ``listContent`` cell's ``html``; if that cell contains no anchor the
      cell's html with tags stripped is used instead of raising,
    * ``description``: present only when there is a second ``listContent``
      cell; that cell's html with tags stripped.

    Items whose ``listContent`` is empty are skipped.

    Returns a dict of ``category name -> [entries]``; an empty dict when the
    payload does not contain the expected path (for example an empty dict).
    """
    try:
        categories = metadata['props']['pageProps']['contentData']['categories']
    except (KeyError, TypeError):
        # No usable payload: report "no connections" rather than crash.
        return {}

    # Compiled once; the original rebuilt this pattern for every item.
    anchor_text = re.compile('<a.*?>(.*?)</a>')

    connections = {}
    for row in categories:
        # Category names are used as-is; no renaming is applied.
        entries = connections.setdefault(row['name'], [])
        for item in row['section']['items']:
            cells = item['listContent']
            if not cells:
                continue
            html = cells[0]['html']
            match = anchor_text.search(html)
            entry = {
                'id': item['id'][2:],
                'title': match.group(1) if match else strip_tags(html),
            }
            if len(cells) >= 2:
                entry['description'] = strip_tags(cells[1]['html'])
            entries.append(entry)
    return connections
|
||||||
|
|
||||||
|
|
||||||
|
def get_category_by_id(metadata, id):
    """Return the first category of the page payload whose ``id`` matches.

    Searches ``metadata['props']['pageProps']['contentData']['categories']``.

    Returns the matching category dict, or ``None`` when no category has
    that id or the payload does not contain the expected path (for example
    an empty dict).
    """
    try:
        categories = metadata['props']['pageProps']['contentData']['categories']
    except (KeyError, TypeError):
        # Missing payload is treated the same as "category not found".
        return None
    for category in categories:
        if category.get('id') == id:
            return category
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_release_date(metadata):
    """Return the earliest parseable release date as ``YYYY-MM-DD``.

    Looks up the 'releases' category via ``get_category_by_id`` and parses
    the ``text`` of the first ``listContent`` cell of each item. Accepted
    formats are '%B %d, %Y', '%d %B %Y' and '%B %Y', tried in that order.
    Entries that match none of them, or that have no ``listContent`` cell,
    are ignored.

    Returns the smallest formatted date string, or ``None`` when the payload
    is empty, has no 'releases' category, or yields no parseable date.
    """
    if not metadata:
        return None
    releases = get_category_by_id(metadata, 'releases')
    if not releases:
        # The original subscripted None here when the category was absent.
        return None

    def parse_date(d):
        """Format *d* as YYYY-MM-DD, or return None if no format matches."""
        for fmt in ('%B %d, %Y', '%d %B %Y', '%B %Y'):
            try:
                parsed = datetime.strptime(d, fmt)
            except (ValueError, TypeError):
                # ValueError: text does not match fmt; TypeError: not a string.
                continue
            return '%d-%02d-%02d' % (parsed.year, parsed.month, parsed.day)
        return None

    dates = []
    for item in releases['section']['items']:
        cells = item['listContent']
        if not cells:
            continue
        date = parse_date(cells[0]['text'])
        if date:
            dates.append(date)

    # Zero-padded ISO-style strings order chronologically for 4-digit years.
    return min(dates) if dates else None
|
||||||
|
|
||||||
|
|
||||||
|
def alternative_titles(metadata):
    """List the alternative titles found in the 'akas' category.

    Looks up the 'akas' category via ``get_category_by_id``. Each item
    contributes a dict with ``title`` (the ``text`` of its first
    ``listContent`` cell) and ``country`` (its ``rowTitle``); ``subText`` is
    added only when that cell carries a non-empty ``subText``. Items without
    a ``listContent`` cell are skipped.

    Returns a list of such dicts; an empty list when the payload is empty or
    has no 'akas' category.
    """
    if not metadata:
        return []
    akas = get_category_by_id(metadata, 'akas')
    if not akas:
        # The original subscripted None here when the category was absent.
        return []

    titles = []
    for row in akas['section']['items']:
        cells = row['listContent']
        if not cells:
            continue
        content = cells[0]
        entry = {
            'title': content['text'],
            'country': row['rowTitle'],
        }
        if content.get('subText'):
            entry['subText'] = content['subText']
        titles.append(entry)
    return titles
|
||||||
|
|
||||||
|
|
||||||
'''
|
'''
|
||||||
'posterIds': {
|
'posterIds': {
|
||||||
'page': 'posters',
|
'page': 'posters',
|
||||||
|
@ -116,18 +200,17 @@ def technical(label):
|
||||||
|
|
||||||
class Imdb(SiteParser):
|
class Imdb(SiteParser):
|
||||||
'''
|
'''
|
||||||
>>> Imdb('0068646')['title'] == text_type(u'The Godfather')
|
>>> Imdb('0068646')['title'] == 'The Godfather'
|
||||||
True
|
True
|
||||||
|
|
||||||
>>> Imdb('0133093')['title'] == text_type(u'The Matrix')
|
>>> Imdb('0133093')['title'] == 'The Matrix'
|
||||||
True
|
True
|
||||||
'''
|
'''
|
||||||
regex = {
|
regex = {
|
||||||
'alternativeTitles': {
|
'alternativeTitles': {
|
||||||
'page': 'releaseinfo',
|
'page': 'releaseinfo',
|
||||||
're': [
|
're': [
|
||||||
'<h4[^>]*?id="akas"[^>]*?>(.*?)</table>',
|
'<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">([^>]+)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>',
|
||||||
"td[^>]*?>(.*?)</td>.*?<td[^>]*?>(.*?)</td>"
|
|
||||||
],
|
],
|
||||||
'type': 'list'
|
'type': 'list'
|
||||||
},
|
},
|
||||||
|
@ -152,11 +235,6 @@ class Imdb(SiteParser):
|
||||||
'type': 'list'
|
'type': 'list'
|
||||||
},
|
},
|
||||||
'cinematographer': reference_section('cinematographers'),
|
'cinematographer': reference_section('cinematographers'),
|
||||||
'connections': {
|
|
||||||
'page': 'movieconnections',
|
|
||||||
're': '<h4 class="li_group">(.*?)</h4>(.*?)(<\/div>\n <a|<script)',
|
|
||||||
'type': 'list'
|
|
||||||
},
|
|
||||||
'country': zebra_list('Country', more=['<a.*?>(.*?)</a>']),
|
'country': zebra_list('Country', more=['<a.*?>(.*?)</a>']),
|
||||||
'director': reference_section('directors'),
|
'director': reference_section('directors'),
|
||||||
'editor': reference_section('editors'),
|
'editor': reference_section('editors'),
|
||||||
|
@ -186,7 +264,7 @@ class Imdb(SiteParser):
|
||||||
'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
|
'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
|
||||||
'originalTitle': {
|
'originalTitle': {
|
||||||
'page': 'releaseinfo',
|
'page': 'releaseinfo',
|
||||||
're': '<td.*?>\s*?\(original title\)\s*?</td>\s*<td.*?>(.*?)</td>',
|
're': '<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">\(original title\)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>',
|
||||||
'type': 'string'
|
'type': 'string'
|
||||||
},
|
},
|
||||||
'summary': zebra_table('Plot Summary', more=[
|
'summary': zebra_table('Plot Summary', more=[
|
||||||
|
@ -219,14 +297,6 @@ class Imdb(SiteParser):
|
||||||
],
|
],
|
||||||
'type': 'float'
|
'type': 'float'
|
||||||
},
|
},
|
||||||
'releasedate': {
|
|
||||||
'page': 'releaseinfo',
|
|
||||||
're': [
|
|
||||||
'<td class="release-date-item__date".*?>(.*?)</td>',
|
|
||||||
strip_tags,
|
|
||||||
],
|
|
||||||
'type': 'list'
|
|
||||||
},
|
|
||||||
#FIXME using some /offsite/ redirect now
|
#FIXME using some /offsite/ redirect now
|
||||||
#'reviews': {
|
#'reviews': {
|
||||||
# 'page': 'externalreviews',
|
# 'page': 'externalreviews',
|
||||||
|
@ -242,11 +312,6 @@ class Imdb(SiteParser):
|
||||||
lambda r: r[0] if isinstance(r, list) else r,
|
lambda r: r[0] if isinstance(r, list) else r,
|
||||||
strip_tags
|
strip_tags
|
||||||
]),
|
]),
|
||||||
'sound': zebra_list('Sound Mix', more=[
|
|
||||||
'<a.*?>([^(<]+)',
|
|
||||||
lambda r: r[0] if isinstance(r, list) else r,
|
|
||||||
strip_tags
|
|
||||||
]),
|
|
||||||
'season': {
|
'season': {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': [
|
're': [
|
||||||
|
@ -275,7 +340,7 @@ class Imdb(SiteParser):
|
||||||
},
|
},
|
||||||
'title': {
|
'title': {
|
||||||
'page': 'releaseinfo',
|
'page': 'releaseinfo',
|
||||||
're': 'h3 itemprop="name">.*?>(.*?)</a>',
|
're': '<h2.*?>(.*?)</h2>',
|
||||||
'type': 'string'
|
'type': 'string'
|
||||||
},
|
},
|
||||||
'trivia': {
|
'trivia': {
|
||||||
|
@ -314,9 +379,6 @@ class Imdb(SiteParser):
|
||||||
},
|
},
|
||||||
'laboratory': technical('Laboratory'),
|
'laboratory': technical('Laboratory'),
|
||||||
'camera': technical('Camera'),
|
'camera': technical('Camera'),
|
||||||
'negative format': technical('Negative Format'),
|
|
||||||
'cinematographic process': technical('Cinematographic Process'),
|
|
||||||
'printed film format': technical('Printed Film Format'),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def read_url(self, url, timeout):
|
def read_url(self, url, timeout):
|
||||||
|
@ -326,9 +388,24 @@ class Imdb(SiteParser):
|
||||||
self._cache[url] = read_url(url, timeout=timeout, unicode=True)
|
self._cache[url] = read_url(url, timeout=timeout, unicode=True)
|
||||||
return self._cache[url]
|
return self._cache[url]
|
||||||
|
|
||||||
|
def get_page_data(self, page, timeout=-1):
    """Return the JSON payload embedded in a page's ``__NEXT_DATA__`` script.

    Builds the URL with ``self.get_url(page)``, fetches it with
    ``self.read_url(url, timeout)`` and decodes the contents of the first
    ``<script id="__NEXT_DATA__" type="application/json">`` element.

    Returns the decoded object, or an empty dict when the page has no such
    script element or its contents are not valid JSON.
    """
    url = self.get_url(page)
    data = self.read_url(url, timeout)
    match = re.search(
        '<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        data,
        re.DOTALL,
    )
    if not match:
        return {}
    try:
        return json.loads(match.group(1))
    except ValueError:
        # Truncated or malformed payload: same fallback as "no payload".
        return {}
|
||||||
|
|
||||||
def __init__(self, id, timeout=-1):
|
def __init__(self, id, timeout=-1):
|
||||||
# http://www.imdb.com/help/show_leaf?titlelanguagedisplay
|
# http://www.imdb.com/help/show_leaf?titlelanguagedisplay
|
||||||
self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
|
self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
|
||||||
|
if timeout != 0:
|
||||||
|
self._cache = {}
|
||||||
|
url = self.baseUrl + 'releaseinfo'
|
||||||
|
page = self.read_url(url, timeout=-1)
|
||||||
|
if '<h2>See also</h2>' in page:
|
||||||
|
timeout = 0
|
||||||
super(Imdb, self).__init__(timeout)
|
super(Imdb, self).__init__(timeout)
|
||||||
|
|
||||||
url = self.baseUrl + 'reference'
|
url = self.baseUrl + 'reference'
|
||||||
|
@ -417,26 +494,6 @@ class Imdb(SiteParser):
|
||||||
self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
|
self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
|
||||||
for x in self['cast']]
|
for x in self['cast']]
|
||||||
|
|
||||||
if 'connections' in self:
|
|
||||||
cc={}
|
|
||||||
if len(self['connections']) == 3 and isinstance(self['connections'][0], string_types):
|
|
||||||
self['connections'] = [self['connections']]
|
|
||||||
for rel, data, _ in self['connections']:
|
|
||||||
if isinstance(rel, bytes):
|
|
||||||
rel = rel.decode('utf-8')
|
|
||||||
#cc[rel] = re.compile('<a href="/title/tt(\d+)/">(.*?)</a>').findall(data)
|
|
||||||
def get_conn(c):
|
|
||||||
r = {
|
|
||||||
'id': c[0],
|
|
||||||
'title': cleanup_title(c[1]),
|
|
||||||
}
|
|
||||||
description = c[2].split('<br />')
|
|
||||||
if len(description) == 2 and description[-1].strip() != '-':
|
|
||||||
r['description'] = description[-1].strip()
|
|
||||||
return r
|
|
||||||
cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d+)/?">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
|
|
||||||
|
|
||||||
self['connections'] = cc
|
|
||||||
|
|
||||||
if 'isSeries' in self:
|
if 'isSeries' in self:
|
||||||
del self['isSeries']
|
del self['isSeries']
|
||||||
|
@ -461,21 +518,10 @@ class Imdb(SiteParser):
|
||||||
if 'budget' in self and 'gross' in self:
|
if 'budget' in self and 'gross' in self:
|
||||||
self['profit'] = self['gross'] - self['budget']
|
self['profit'] = self['gross'] - self['budget']
|
||||||
|
|
||||||
if 'releasedate' in self:
|
metadata = self.get_page_data('releaseinfo')
|
||||||
def parse_date(d):
|
releasedate = get_release_date(metadata)
|
||||||
try:
|
if releasedate:
|
||||||
d = datetime.strptime(d, '%d %B %Y')
|
self['releasedate'] = releasedate
|
||||||
except:
|
|
||||||
try:
|
|
||||||
d = datetime.strptime(d, '%B %Y')
|
|
||||||
except:
|
|
||||||
return 'x'
|
|
||||||
return '%d-%02d-%02d' % (d.year, d.month, d.day)
|
|
||||||
self['releasedate'] = min([
|
|
||||||
parse_date(d) for d in self['releasedate']
|
|
||||||
])
|
|
||||||
if self['releasedate'] == 'x':
|
|
||||||
del self['releasedate']
|
|
||||||
|
|
||||||
if 'summary' not in self and 'storyline' in self:
|
if 'summary' not in self and 'storyline' in self:
|
||||||
self['summary'] = self.pop('storyline')
|
self['summary'] = self.pop('storyline')
|
||||||
|
@ -483,6 +529,20 @@ class Imdb(SiteParser):
|
||||||
if isinstance(self['summary'], list):
|
if isinstance(self['summary'], list):
|
||||||
self['summary'] = self['summary'][0]
|
self['summary'] = self['summary'][0]
|
||||||
self['summary'] = strip_tags(self['summary'].split('</p')[0]).split(' Written by\n')[0].strip()
|
self['summary'] = strip_tags(self['summary'].split('</p')[0]).split(' Written by\n')[0].strip()
|
||||||
|
else:
|
||||||
|
|
||||||
|
try:
|
||||||
|
summary = metadata['props']['pageProps']['contentData']['entityMetadata']['plot']['plotText']['plainText']
|
||||||
|
self['summary'] = summary
|
||||||
|
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
self['connections'] = movie_connections(self.get_page_data('movieconnections'))
|
||||||
|
spec = tech_spec(self.get_page_data('technical'))
|
||||||
|
for key in spec:
|
||||||
|
if not self.get(key):
|
||||||
|
self[key] = spec[key]
|
||||||
|
|
||||||
if 'credits' in self:
|
if 'credits' in self:
|
||||||
credits = [
|
credits = [
|
||||||
|
|
Loading…
Reference in a new issue