Compare commits

No commits in common. "a3cef06ad73a1419c01c3552842b52948b178c9b" and "5919345d3dea34050ce151acd6499472da6b62da" have entirely different histories.

a3cef06ad7...5919345d3d

2 changed files with 63 additions and 128 deletions

ox/web/imdb.py (186 changed lines)
@@ -2,13 +2,12 @@
 # vi:si:et:sw=4:sts=4:ts=4
 from __future__ import print_function
 
-import json
 import re
 import time
 import unicodedata
 
 from six.moves.urllib.parse import urlencode
-from six import string_types
+from six import text_type, string_types
 
 from .. import find_re, strip_tags, decode_html
 from .. import cache
@@ -107,89 +106,6 @@ def technical(label):
     }
 
 
-def tech_spec(metadata):
-    tech = {}
-    for row in metadata['props']['pageProps']['contentData']['section']['items']:
-        title = {
-            'aspect ratio': 'aspectratio',
-            'sound mix': 'sound',
-        }.get(row['rowTitle'].lower(), row['rowTitle'].lower())
-        tech[title] = []
-        for content in row['listContent']:
-            value = content['text']
-            tech[title].append(value)
-    return tech
-
-
-def movie_connections(metadata):
-    connections = {}
-    for row in metadata['props']['pageProps']['contentData']['categories']:
-        title = {
-        }.get(row['name'], row['name'])
-        if title not in connections:
-            connections[title] = []
-
-        for item in row['section']['items']:
-            item_ = {
-                'id': item['id'][2:],
-            }
-
-            item_['title'] = re.compile('<a.*?>(.*?)</a>').findall(item['listContent'][0]['html'])[0]
-            if len(item['listContent']) >=2:
-                item_['description'] = strip_tags(item['listContent'][1]['html'])
-            connections[title].append(item_)
-    return connections
-
-
-def get_category_by_id(metadata, id):
-    for category in metadata['props']['pageProps']['contentData']['categories']:
-        if category['id'] == id:
-            return category
-
-
-def get_release_date(metadata):
-    releases = get_category_by_id(metadata, 'releases')
-    def parse_date(d):
-        parsed = None
-        for fmt in (
-            '%B %d, %Y',
-            '%d %B %Y',
-            '%B %Y',
-        ):
-            try:
-                parsed = datetime.strptime(d, fmt)
-                break
-            except:
-                pass
-        if not parsed:
-            return None
-        return '%d-%02d-%02d' % (parsed.year, parsed.month, parsed.day)
-
-    dates = []
-    for item in releases['section']['items']:
-        content = item['listContent'][0]
-        date = parse_date(content['text'])
-        if date:
-            dates.append(date)
-
-    if dates:
-        return min(dates)
-
-
-def alternative_titles(metadata):
-    titles = []
-    akas = get_category_by_id(metadata, 'akas')
-    for row in akas['section']['items']:
-        content = row['listContent'][0]
-        titles.append({
-            'title': content['text'],
-            'country': row['rowTitle'],
-        })
-        if content.get('subText'):
-            titles[-1]['subText'] = content['subText']
-    return titles
-
-
 '''
     'posterIds': {
        'page': 'posters',
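For context on the block removed above: these helpers all walk the JSON blob that newer IMDb pages embed for their Next.js frontend under props.pageProps.contentData. A minimal, self-contained sketch of the date normalization they perform; the sample metadata dict and the release_date helper below are illustrative stand-ins, not code from the library:

from datetime import datetime

# Illustrative subset of the structure found under
# props.pageProps.contentData on an IMDb releaseinfo page.
metadata = {
    'props': {'pageProps': {'contentData': {'categories': [
        {'id': 'releases', 'section': {'items': [
            {'listContent': [{'text': 'March 15, 1972'}]},
            {'listContent': [{'text': '24 March 1972'}]},
        ]}},
    ]}}}
}

def release_date(metadata):
    # Same idea as the removed get_release_date(): try a few date
    # formats per entry and keep the earliest one that parses.
    categories = metadata['props']['pageProps']['contentData']['categories']
    releases = [c for c in categories if c['id'] == 'releases'][0]
    dates = []
    for item in releases['section']['items']:
        text = item['listContent'][0]['text']
        for fmt in ('%B %d, %Y', '%d %B %Y', '%B %Y'):
            try:
                d = datetime.strptime(text, fmt)
                dates.append('%d-%02d-%02d' % (d.year, d.month, d.day))
                break
            except ValueError:
                pass
    return min(dates) if dates else None

print(release_date(metadata))  # 1972-03-15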
@@ -200,17 +116,18 @@ def alternative_titles(metadata):
 
 class Imdb(SiteParser):
     '''
-    >>> Imdb('0068646')['title'] == 'The Godfather'
+    >>> Imdb('0068646')['title'] == text_type(u'The Godfather')
     True
 
-    >>> Imdb('0133093')['title'] == 'The Matrix'
+    >>> Imdb('0133093')['title'] == text_type(u'The Matrix')
     True
     '''
     regex = {
         'alternativeTitles': {
             'page': 'releaseinfo',
             're': [
-                '<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">([^>]+)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>',
+                '<h4[^>]*?id="akas"[^>]*?>(.*?)</table>',
+                "td[^>]*?>(.*?)</td>.*?<td[^>]*?>(.*?)</td>"
             ],
             'type': 'list'
         },
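The restored 'alternativeTitles' entry goes back to a two-step extraction: the first regex narrows the releaseinfo page to the akas table, the second pulls label/title pairs out of it. A standalone illustration of the two patterns on a made-up fragment (it demonstrates only the regexes, not SiteParser's full processing):

import re

html = ('<h4 class="inline" id="akas">Also Known As:</h4><table>'
        '<tr><td>France</td> <td>Le Parrain</td></tr></table>')

# Step 1: isolate the akas table, step 2: extract (label, title) pairs.
table = re.findall('<h4[^>]*?id="akas"[^>]*?>(.*?)</table>', html, re.DOTALL)[0]
pairs = re.findall("td[^>]*?>(.*?)</td>.*?<td[^>]*?>(.*?)</td>", table, re.DOTALL)
print(pairs)  # [('France', 'Le Parrain')]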
@@ -235,6 +152,11 @@ class Imdb(SiteParser):
             'type': 'list'
         },
         'cinematographer': reference_section('cinematographers'),
+        'connections': {
+            'page': 'movieconnections',
+            're': '<h4 class="li_group">(.*?)</h4>(.*?)(<\/div>\n <a|<script)',
+            'type': 'list'
+        },
         'country': zebra_list('Country', more=['<a.*?>(.*?)</a>']),
         'director': reference_section('directors'),
         'editor': reference_section('editors'),
@@ -264,7 +186,7 @@ class Imdb(SiteParser):
         'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
         'originalTitle': {
             'page': 'releaseinfo',
-            're': '<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">\(original title\)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>',
+            're': '<td.*?>\s*?\(original title\)\s*?</td>\s*<td.*?>(.*?)</td>',
             'type': 'string'
         },
         'summary': zebra_table('Plot Summary', more=[
@@ -297,6 +219,14 @@ class Imdb(SiteParser):
             ],
             'type': 'float'
         },
+        'releasedate': {
+            'page': 'releaseinfo',
+            're': [
+                '<td class="release-date-item__date".*?>(.*?)</td>',
+                strip_tags,
+            ],
+            'type': 'list'
+        },
         #FIXME using some /offsite/ redirect now
         #'reviews': {
         # 'page': 'externalreviews',
@@ -312,6 +242,11 @@ class Imdb(SiteParser):
             lambda r: r[0] if isinstance(r, list) else r,
             strip_tags
         ]),
+        'sound': zebra_list('Sound Mix', more=[
+            '<a.*?>([^(<]+)',
+            lambda r: r[0] if isinstance(r, list) else r,
+            strip_tags
+        ]),
         'season': {
             'page': 'reference',
             're': [
@@ -340,7 +275,7 @@ class Imdb(SiteParser):
         },
         'title': {
             'page': 'releaseinfo',
-            're': '<h2.*?>(.*?)</h2>',
+            're': 'h3 itemprop="name">.*?>(.*?)</a>',
             'type': 'string'
         },
         'trivia': {
@@ -379,6 +314,9 @@ class Imdb(SiteParser):
         },
         'laboratory': technical('Laboratory'),
         'camera': technical('Camera'),
+        'negative format': technical('Negative Format'),
+        'cinematographic process': technical('Cinematographic Process'),
+        'printed film format': technical('Printed Film Format'),
     }
 
     def read_url(self, url, timeout):
@@ -388,24 +326,9 @@ class Imdb(SiteParser):
             self._cache[url] = read_url(url, timeout=timeout, unicode=True)
         return self._cache[url]
 
-    def get_page_data(self, page, timeout=-1):
-        url = self.get_url(page)
-        data = self.read_url(url, timeout)
-        pdata = re.compile('<script id="__NEXT_DATA__" type="application/json">(.*?)</script>', re.DOTALL).findall(data)
-        if pdata:
-            pdata = pdata[0]
-            return json.loads(pdata)
-        return {}
-
     def __init__(self, id, timeout=-1):
         # http://www.imdb.com/help/show_leaf?titlelanguagedisplay
         self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
-        if timeout != 0:
-            self._cache = {}
-            url = self.baseUrl + 'releaseinfo'
-            page = self.read_url(url, timeout=-1)
-            if '<h2>See also</h2>' in page:
-                timeout = 0
         super(Imdb, self).__init__(timeout)
 
         url = self.baseUrl + 'reference'
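The get_page_data() method removed above is what fed the JSON helpers deleted earlier: it pulls the __NEXT_DATA__ script blob out of the fetched HTML and decodes it. A rough, self-contained sketch of that extraction; the html fragment and the page_data helper are made up for illustration:

import json
import re

# Made-up page fragment; real IMDb pages embed a much larger blob.
html = '<script id="__NEXT_DATA__" type="application/json">{"props": {"pageProps": {}}}</script>'

def page_data(html):
    # Same approach as the removed method: grab the JSON embedded for
    # the Next.js frontend and decode it, falling back to {}.
    match = re.search(
        '<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        html, re.DOTALL)
    return json.loads(match.group(1)) if match else {}

print(page_data(html))  # {'props': {'pageProps': {}}}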
@@ -494,6 +417,26 @@ class Imdb(SiteParser):
             self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
                             for x in self['cast']]
 
+        if 'connections' in self:
+            cc={}
+            if len(self['connections']) == 3 and isinstance(self['connections'][0], string_types):
+                self['connections'] = [self['connections']]
+            for rel, data, _ in self['connections']:
+                if isinstance(rel, bytes):
+                    rel = rel.decode('utf-8')
+                #cc[rel] = re.compile('<a href="/title/tt(\d+)/">(.*?)</a>').findall(data)
+                def get_conn(c):
+                    r = {
+                        'id': c[0],
+                        'title': cleanup_title(c[1]),
+                    }
+                    description = c[2].split('<br />')
+                    if len(description) == 2 and description[-1].strip() != '-':
+                        r['description'] = description[-1].strip()
+                    return r
+                cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d+)/?">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
+
+            self['connections'] = cc
 
         if 'isSeries' in self:
             del self['isSeries']
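The connections post-processing added above applies one regex per relation block and then splits an optional blurb off each match. A standalone illustration of that regex on a made-up movieconnections fragment (cleanup_title is omitted here, and the sample string is not real IMDb output):

import re

data = '<a href="/title/tt0071562/">The Godfather Part II</a> (followed by)<br /> -</div'

pattern = re.compile(r'<a href="/title/tt(\d+)/?">(.*?)</a>(.*?)<\/div', re.DOTALL)
for imdb_id, title, rest in pattern.findall(data):
    entry = {'id': imdb_id, 'title': title}
    # A trailing "<br /> -" means there is no description for this link.
    description = rest.split('<br />')
    if len(description) == 2 and description[-1].strip() != '-':
        entry['description'] = description[-1].strip()
    print(entry)  # {'id': '0071562', 'title': 'The Godfather Part II'}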
@@ -518,10 +461,21 @@ class Imdb(SiteParser):
         if 'budget' in self and 'gross' in self:
             self['profit'] = self['gross'] - self['budget']
 
-        metadata = self.get_page_data('releaseinfo')
-        releasedate = get_release_date(metadata)
-        if releasedate:
-            self['releasedate'] = releasedate
+        if 'releasedate' in self:
+            def parse_date(d):
+                try:
+                    d = datetime.strptime(d, '%d %B %Y')
+                except:
+                    try:
+                        d = datetime.strptime(d, '%B %Y')
+                    except:
+                        return 'x'
+                return '%d-%02d-%02d' % (d.year, d.month, d.day)
+            self['releasedate'] = min([
+                parse_date(d) for d in self['releasedate']
+            ])
+            if self['releasedate'] == 'x':
+                del self['releasedate']
 
         if 'summary' not in self and 'storyline' in self:
             self['summary'] = self.pop('storyline')
@@ -529,20 +483,6 @@ class Imdb(SiteParser):
             if isinstance(self['summary'], list):
                 self['summary'] = self['summary'][0]
             self['summary'] = strip_tags(self['summary'].split('</p')[0]).split(' Written by\n')[0].strip()
-        else:
-
-            try:
-                summary = metadata['props']['pageProps']['contentData']['entityMetadata']['plot']['plotText']['plainText']
-                self['summary'] = summary
-
-            except:
-                pass
-
-        self['connections'] = movie_connections(self.get_page_data('movieconnections'))
-        spec = tech_spec(self.get_page_data('technical'))
-        for key in spec:
-            if not self.get(key):
-                self[key] = spec[key]
 
         if 'credits' in self:
             credits = [
ox/web/siteparser.py (5 changed lines)
@@ -1,7 +1,6 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
 import re
-import json
 from multiprocessing.pool import ThreadPool
 
 from six import string_types
@@ -78,10 +77,6 @@ class SiteParser(dict):
             elif self.regex[key]['type'] == 'date':
                 parse_date = lambda d: d and datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d')
                 data = apply_f(parse_date, data)
-            elif self.regex[key]['type'] == 'json':
-                if isinstance(data, list) and len(data) == 1:
-                    data = data[0]
-                data = json.loads(data)
             if data:
                 self[key] = data
 
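Each entry in these regex tables names a page, a pattern (or a chain of patterns and callables such as strip_tags), and a result type that SiteParser coerces: float, date, list and, before this change, json. A rough standalone approximation of how a chained entry like the 'releasedate' one above yields its value; the strip_tags stand-in and the html fragment are made up, and this is not the parser's actual code:

import re

def strip_tags(text):
    # Stand-in for ox.strip_tags, only for this sketch.
    return re.sub('<[^>]+>', '', text).strip()

entry = {
    'page': 'releaseinfo',
    're': ['<td class="release-date-item__date".*?>(.*?)</td>', strip_tags],
    'type': 'list',
}

# Made-up fragment standing in for the fetched releaseinfo page.
html = '<td class="release-date-item__date">15 <a href="/x">March</a> 1972</td>'

data = re.findall(entry['re'][0], html)      # first step: the regex
for step in entry['re'][1:]:                 # later steps: callables
    if callable(step):
        data = [step(d) for d in data]

print(data)  # ['15 March 1972']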