fix imdb parsing
This commit is contained in:
parent
e1657994ca
commit
a3cef06ad7
1 changed file with 123 additions and 63 deletions
186
ox/web/imdb.py
186
ox/web/imdb.py
|
@ -2,12 +2,13 @@
|
|||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from __future__ import print_function
|
||||
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
import unicodedata
|
||||
|
||||
from six.moves.urllib.parse import urlencode
|
||||
from six import text_type, string_types
|
||||
from six import string_types
|
||||
|
||||
from .. import find_re, strip_tags, decode_html
|
||||
from .. import cache
|
||||
|
@ -106,6 +107,89 @@ def technical(label):
|
|||
}
|
||||
|
||||
|
||||
def tech_spec(metadata):
    '''
    Collect the technical specifications from decoded page data.

    metadata is the decoded JSON of a page; the rows are read from
    metadata['props']['pageProps']['contentData']['section']['items'].
    Each row contributes one entry: the key is the lower-cased row title
    ('aspect ratio' and 'sound mix' are renamed to 'aspectratio' and
    'sound'), the value is the list of the 'text' of every listContent
    entry. A later row with the same title replaces an earlier one.

    Returns an empty dict if metadata does not contain that path, e.g.
    when the page had no embedded data and metadata is {}.
    '''
    # row titles that are stored under a different key
    aliases = {
        'aspect ratio': 'aspectratio',
        'sound mix': 'sound',
    }
    try:
        rows = metadata['props']['pageProps']['contentData']['section']['items']
    except (KeyError, TypeError):
        # page without the expected structure: nothing to report
        return {}

    tech = {}
    for row in rows or []:
        label = row['rowTitle'].lower()
        tech[aliases.get(label, label)] = [
            content['text'] for content in row['listContent']
        ]
    return tech
|
||||
|
||||
|
||||
def movie_connections(metadata):
    '''
    Group the connected titles found in decoded page data by category.

    metadata is the decoded JSON of a page; the categories are read from
    metadata['props']['pageProps']['contentData']['categories'].
    Returns a dict mapping each category name to a list of dicts with
    'id' (the item id without its first two characters, presumably the
    'tt' prefix), 'title' (the text of the first link in the first
    listContent entry) and, if the item has a second listContent entry,
    'description' (that entry with tags stripped).

    Returns an empty dict if metadata does not contain the categories
    path, e.g. when the page had no embedded data and metadata is {}.
    '''
    try:
        categories = metadata['props']['pageProps']['contentData']['categories']
    except (KeyError, TypeError):
        return {}

    # compiled once instead of once per item
    link_text = re.compile('<a.*?>(.*?)</a>')

    connections = {}
    for row in categories or []:
        entries = connections.setdefault(row['name'], [])
        for item in row['section']['items']:
            contents = item['listContent']
            html = contents[0]['html']
            found = link_text.search(html)
            entry = {
                'id': item['id'][2:],
                # entries without a link fall back to the plain text
                'title': found.group(1) if found else strip_tags(html),
            }
            if len(contents) >= 2:
                entry['description'] = strip_tags(contents[1]['html'])
            entries.append(entry)
    return connections
|
||||
|
||||
|
||||
def get_category_by_id(metadata, id):
    '''
    Return the first category in decoded page data whose 'id' equals id.

    The categories are read from
    metadata['props']['pageProps']['contentData']['categories'].
    Returns None if no category matches or if metadata does not contain
    that path (e.g. metadata is {}).
    '''
    try:
        categories = metadata['props']['pageProps']['contentData']['categories']
    except (KeyError, TypeError):
        return None
    for category in categories or []:
        if category.get('id') == id:
            return category
    return None
|
||||
|
||||
|
||||
def get_release_date(metadata):
    '''
    Return the earliest release date in decoded page data as 'YYYY-MM-DD'.

    The dates are taken from the 'text' of the first listContent entry of
    every item in the category with id 'releases'. Texts are tried against
    the formats '%B %d, %Y', '%d %B %Y' and '%B %Y' (strptime fills in
    day 1 when the format has no day); texts matching none are ignored.

    Returns None if there is no 'releases' category or no date could be
    parsed.
    '''
    def parse_date(d):
        # first matching format wins
        for fmt in (
            '%B %d, %Y',
            '%d %B %Y',
            '%B %Y',
        ):
            try:
                parsed = datetime.strptime(d, fmt)
            except (ValueError, TypeError):
                # ValueError: text does not match fmt, TypeError: not a string
                continue
            return '%d-%02d-%02d' % (parsed.year, parsed.month, parsed.day)
        return None

    releases = get_category_by_id(metadata, 'releases')
    if not releases:
        return None

    dates = []
    for item in releases['section']['items']:
        if not item['listContent']:
            continue
        date = parse_date(item['listContent'][0]['text'])
        if date:
            dates.append(date)

    # zero padded strings, so the smallest string is the earliest date
    return min(dates) if dates else None
|
||||
|
||||
|
||||
def alternative_titles(metadata):
    '''
    List the alternative titles found in decoded page data.

    Reads the items of the category with id 'akas'. Returns a list of
    dicts with 'title' (the 'text' of the item's first listContent entry),
    'country' (the item's 'rowTitle') and, if the entry has a non-empty
    'subText', 'subText'.

    Returns an empty list if there is no 'akas' category.
    '''
    akas = get_category_by_id(metadata, 'akas')
    if not akas:
        return []

    titles = []
    for row in akas['section']['items']:
        if not row['listContent']:
            # nothing to take a title from
            continue
        content = row['listContent'][0]
        entry = {
            'title': content['text'],
            'country': row['rowTitle'],
        }
        if content.get('subText'):
            entry['subText'] = content['subText']
        titles.append(entry)
    return titles
|
||||
|
||||
|
||||
'''
|
||||
'posterIds': {
|
||||
'page': 'posters',
|
||||
|
@ -116,18 +200,17 @@ def technical(label):
|
|||
|
||||
class Imdb(SiteParser):
|
||||
'''
|
||||
>>> Imdb('0068646')['title'] == text_type(u'The Godfather')
|
||||
>>> Imdb('0068646')['title'] == 'The Godfather'
|
||||
True
|
||||
|
||||
>>> Imdb('0133093')['title'] == text_type(u'The Matrix')
|
||||
>>> Imdb('0133093')['title'] == 'The Matrix'
|
||||
True
|
||||
'''
|
||||
regex = {
|
||||
'alternativeTitles': {
|
||||
'page': 'releaseinfo',
|
||||
're': [
|
||||
'<h4[^>]*?id="akas"[^>]*?>(.*?)</table>',
|
||||
"td[^>]*?>(.*?)</td>.*?<td[^>]*?>(.*?)</td>"
|
||||
'<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">([^>]+)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>',
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
|
@ -152,11 +235,6 @@ class Imdb(SiteParser):
|
|||
'type': 'list'
|
||||
},
|
||||
'cinematographer': reference_section('cinematographers'),
|
||||
'connections': {
|
||||
'page': 'movieconnections',
|
||||
're': '<h4 class="li_group">(.*?)</h4>(.*?)(<\/div>\n <a|<script)',
|
||||
'type': 'list'
|
||||
},
|
||||
'country': zebra_list('Country', more=['<a.*?>(.*?)</a>']),
|
||||
'director': reference_section('directors'),
|
||||
'editor': reference_section('editors'),
|
||||
|
@ -186,7 +264,7 @@ class Imdb(SiteParser):
|
|||
'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
|
||||
'originalTitle': {
|
||||
'page': 'releaseinfo',
|
||||
're': '<td.*?>\s*?\(original title\)\s*?</td>\s*<td.*?>(.*?)</td>',
|
||||
're': '<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">\(original title\)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>',
|
||||
'type': 'string'
|
||||
},
|
||||
'summary': zebra_table('Plot Summary', more=[
|
||||
|
@ -219,14 +297,6 @@ class Imdb(SiteParser):
|
|||
],
|
||||
'type': 'float'
|
||||
},
|
||||
'releasedate': {
|
||||
'page': 'releaseinfo',
|
||||
're': [
|
||||
'<td class="release-date-item__date".*?>(.*?)</td>',
|
||||
strip_tags,
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
#FIXME using some /offsite/ redirect now
|
||||
#'reviews': {
|
||||
# 'page': 'externalreviews',
|
||||
|
@ -242,11 +312,6 @@ class Imdb(SiteParser):
|
|||
lambda r: r[0] if isinstance(r, list) else r,
|
||||
strip_tags
|
||||
]),
|
||||
'sound': zebra_list('Sound Mix', more=[
|
||||
'<a.*?>([^(<]+)',
|
||||
lambda r: r[0] if isinstance(r, list) else r,
|
||||
strip_tags
|
||||
]),
|
||||
'season': {
|
||||
'page': 'reference',
|
||||
're': [
|
||||
|
@ -275,7 +340,7 @@ class Imdb(SiteParser):
|
|||
},
|
||||
'title': {
|
||||
'page': 'releaseinfo',
|
||||
're': 'h3 itemprop="name">.*?>(.*?)</a>',
|
||||
're': '<h2.*?>(.*?)</h2>',
|
||||
'type': 'string'
|
||||
},
|
||||
'trivia': {
|
||||
|
@ -314,9 +379,6 @@ class Imdb(SiteParser):
|
|||
},
|
||||
'laboratory': technical('Laboratory'),
|
||||
'camera': technical('Camera'),
|
||||
'negative format': technical('Negative Format'),
|
||||
'cinematographic process': technical('Cinematographic Process'),
|
||||
'printed film format': technical('Printed Film Format'),
|
||||
}
|
||||
|
||||
def read_url(self, url, timeout):
|
||||
|
@ -326,9 +388,24 @@ class Imdb(SiteParser):
|
|||
self._cache[url] = read_url(url, timeout=timeout, unicode=True)
|
||||
return self._cache[url]
|
||||
|
||||
def get_page_data(self, page, timeout=-1):
    '''
    Return the JSON embedded in a page as decoded Python data.

    The page is fetched with self.read_url(self.get_url(page), timeout)
    and the content of its first
    <script id="__NEXT_DATA__" type="application/json"> element is decoded.

    Returns an empty dict if the page has no such element or its content
    is not valid JSON.
    '''
    url = self.get_url(page)
    data = self.read_url(url, timeout)
    found = re.search(
        '<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        data, re.DOTALL)
    if not found:
        return {}
    try:
        return json.loads(found.group(1))
    except ValueError:
        # broken or truncated JSON is treated like a page without data
        return {}
|
||||
|
||||
def __init__(self, id, timeout=-1):
|
||||
# http://www.imdb.com/help/show_leaf?titlelanguagedisplay
|
||||
self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
|
||||
if timeout != 0:
|
||||
self._cache = {}
|
||||
url = self.baseUrl + 'releaseinfo'
|
||||
page = self.read_url(url, timeout=-1)
|
||||
if '<h2>See also</h2>' in page:
|
||||
timeout = 0
|
||||
super(Imdb, self).__init__(timeout)
|
||||
|
||||
url = self.baseUrl + 'reference'
|
||||
|
@ -417,26 +494,6 @@ class Imdb(SiteParser):
|
|||
self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
|
||||
for x in self['cast']]
|
||||
|
||||
if 'connections' in self:
|
||||
cc={}
|
||||
if len(self['connections']) == 3 and isinstance(self['connections'][0], string_types):
|
||||
self['connections'] = [self['connections']]
|
||||
for rel, data, _ in self['connections']:
|
||||
if isinstance(rel, bytes):
|
||||
rel = rel.decode('utf-8')
|
||||
#cc[rel] = re.compile('<a href="/title/tt(\d+)/">(.*?)</a>').findall(data)
|
||||
def get_conn(c):
|
||||
r = {
|
||||
'id': c[0],
|
||||
'title': cleanup_title(c[1]),
|
||||
}
|
||||
description = c[2].split('<br />')
|
||||
if len(description) == 2 and description[-1].strip() != '-':
|
||||
r['description'] = description[-1].strip()
|
||||
return r
|
||||
cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d+)/?">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
|
||||
|
||||
self['connections'] = cc
|
||||
|
||||
if 'isSeries' in self:
|
||||
del self['isSeries']
|
||||
|
@ -461,21 +518,10 @@ class Imdb(SiteParser):
|
|||
if 'budget' in self and 'gross' in self:
|
||||
self['profit'] = self['gross'] - self['budget']
|
||||
|
||||
if 'releasedate' in self:
|
||||
def parse_date(d):
|
||||
try:
|
||||
d = datetime.strptime(d, '%d %B %Y')
|
||||
except:
|
||||
try:
|
||||
d = datetime.strptime(d, '%B %Y')
|
||||
except:
|
||||
return 'x'
|
||||
return '%d-%02d-%02d' % (d.year, d.month, d.day)
|
||||
self['releasedate'] = min([
|
||||
parse_date(d) for d in self['releasedate']
|
||||
])
|
||||
if self['releasedate'] == 'x':
|
||||
del self['releasedate']
|
||||
metadata = self.get_page_data('releaseinfo')
|
||||
releasedate = get_release_date(metadata)
|
||||
if releasedate:
|
||||
self['releasedate'] = releasedate
|
||||
|
||||
if 'summary' not in self and 'storyline' in self:
|
||||
self['summary'] = self.pop('storyline')
|
||||
|
@ -483,6 +529,20 @@ class Imdb(SiteParser):
|
|||
if isinstance(self['summary'], list):
|
||||
self['summary'] = self['summary'][0]
|
||||
self['summary'] = strip_tags(self['summary'].split('</p')[0]).split(' Written by\n')[0].strip()
|
||||
else:
|
||||
|
||||
try:
|
||||
summary = metadata['props']['pageProps']['contentData']['entityMetadata']['plot']['plotText']['plainText']
|
||||
self['summary'] = summary
|
||||
|
||||
except:
|
||||
pass
|
||||
|
||||
self['connections'] = movie_connections(self.get_page_data('movieconnections'))
|
||||
spec = tech_spec(self.get_page_data('technical'))
|
||||
for key in spec:
|
||||
if not self.get(key):
|
||||
self[key] = spec[key]
|
||||
|
||||
if 'credits' in self:
|
||||
credits = [
|
||||
|
|
Loading…
Reference in a new issue