simple title detection (imdb)
This commit is contained in:
parent
b7b4b09f0f
commit
35534254c3
2 changed files with 27 additions and 131 deletions
|
@ -16,11 +16,11 @@ from chardet.universaldetector import UniversalDetector
|
||||||
DEBUG = False
|
DEBUG = False
|
||||||
# Default headers for HTTP requests.
|
# Default headers for HTTP requests.
|
||||||
DEFAULT_HEADERS = {
|
DEFAULT_HEADERS = {
|
||||||
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0',
|
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0',
|
||||||
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
|
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
|
||||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||||
'Accept-Language': 'en-us,en;q=0.5',
|
'Accept-Language': 'en-US,en;q=0.8,fr;q=0.6,de;q=0.4',
|
||||||
'Accept-Encoding': 'gzip'
|
'Accept-Encoding': 'gzip',
|
||||||
}
|
}
|
||||||
|
|
||||||
def status(url, data=None, headers=None):
|
def status(url, data=None, headers=None):
|
||||||
|
|
152
ox/web/imdb.py
152
ox/web/imdb.py
|
@ -20,6 +20,8 @@ from ..geo import normalize_country_name
|
||||||
|
|
||||||
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
|
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
|
||||||
headers = headers.copy()
|
headers = headers.copy()
|
||||||
|
# https://webapps.stackexchange.com/questions/11003/how-can-i-disable-reconfigure-imdbs-automatic-geo-location-so-it-does-not-defau
|
||||||
|
headers['X-Forwarded-For'] = '72.21.206.80'
|
||||||
return cache.read_url(url, data, headers, timeout, unicode=unicode)
|
return cache.read_url(url, data, headers, timeout, unicode=unicode)
|
||||||
|
|
||||||
def get_url(id):
|
def get_url(id):
|
||||||
|
@ -174,6 +176,11 @@ class Imdb(SiteParser):
|
||||||
],
|
],
|
||||||
'type': 'list'
|
'type': 'list'
|
||||||
},
|
},
|
||||||
|
'originalTitle': {
|
||||||
|
'page': 'combined',
|
||||||
|
're': '<span class="title-extra">(.*?) <i>\(original title\)</i>',
|
||||||
|
'type': 'string'
|
||||||
|
},
|
||||||
'summary': {
|
'summary': {
|
||||||
'page': 'plotsummary',
|
'page': 'plotsummary',
|
||||||
're': '<p class="plotSummary">(.*?)<\/p>',
|
're': '<p class="plotSummary">(.*?)<\/p>',
|
||||||
|
@ -318,14 +325,14 @@ class Imdb(SiteParser):
|
||||||
}
|
}
|
||||||
|
|
||||||
def read_url(self, url, timeout):
|
def read_url(self, url, timeout):
|
||||||
if not url in self._cache:
|
if url not in self._cache:
|
||||||
self._cache[url] = read_url(url, timeout=timeout, unicode=True)
|
self._cache[url] = read_url(url, timeout=timeout, unicode=True)
|
||||||
return self._cache[url]
|
return self._cache[url]
|
||||||
|
|
||||||
def __init__(self, id, timeout=-1):
|
def __init__(self, id, timeout=-1):
|
||||||
# use akas.imdb.com to always get original title:
|
# use akas.imdb.com to always get original title:
|
||||||
# http://www.imdb.com/help/show_leaf?titlelanguagedisplay
|
# http://www.imdb.com/help/show_leaf?titlelanguagedisplay
|
||||||
self.baseUrl = "http://akas.imdb.com/title/tt%s/" % id
|
self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
|
||||||
super(Imdb, self).__init__(timeout)
|
super(Imdb, self).__init__(timeout)
|
||||||
|
|
||||||
url = self.baseUrl + 'combined'
|
url = self.baseUrl + 'combined'
|
||||||
|
@ -349,113 +356,6 @@ class Imdb(SiteParser):
|
||||||
if 'sound' in self:
|
if 'sound' in self:
|
||||||
self['sound'] = list(set(self['sound']))
|
self['sound'] = list(set(self['sound']))
|
||||||
|
|
||||||
types = {}
|
|
||||||
stop_words = [
|
|
||||||
'alternative spelling',
|
|
||||||
'alternative title',
|
|
||||||
'alternative transliteration',
|
|
||||||
'closing credits title',
|
|
||||||
'complete title',
|
|
||||||
'IMAX version',
|
|
||||||
'informal short title',
|
|
||||||
'International (Spanish title)',
|
|
||||||
'Japan (imdb display title)',
|
|
||||||
'longer version',
|
|
||||||
'new title',
|
|
||||||
'original subtitled version',
|
|
||||||
'pre-release title',
|
|
||||||
'promotional abbreviation',
|
|
||||||
'recut version',
|
|
||||||
'reissue title',
|
|
||||||
'restored version',
|
|
||||||
'script title',
|
|
||||||
'short title',
|
|
||||||
'(subtitle)',
|
|
||||||
'TV title',
|
|
||||||
'working title',
|
|
||||||
'World-wide (Spanish title)',
|
|
||||||
]
|
|
||||||
#ignore english japanese titles
|
|
||||||
#for movies that are not only from japan
|
|
||||||
if ['Japan'] != self.get('country', []):
|
|
||||||
stop_words += [
|
|
||||||
'Japan (English title)'
|
|
||||||
]
|
|
||||||
for t in self.get('alternativeTitles', []):
|
|
||||||
for type in t[0].split('/'):
|
|
||||||
type = type.strip()
|
|
||||||
stop_word = False
|
|
||||||
for key in stop_words:
|
|
||||||
if key in type:
|
|
||||||
stop_word = True
|
|
||||||
break
|
|
||||||
if not stop_word:
|
|
||||||
if not type in types:
|
|
||||||
types[type] = []
|
|
||||||
types[type].append(t[1])
|
|
||||||
titles = {}
|
|
||||||
for type in types:
|
|
||||||
for title in types[type]:
|
|
||||||
if not title in titles:
|
|
||||||
titles[title] = []
|
|
||||||
titles[title].append(type)
|
|
||||||
def select_title(type):
|
|
||||||
title = types[type][0]
|
|
||||||
count = 0
|
|
||||||
if len(types[type]) > 1:
|
|
||||||
for t in types[type]:
|
|
||||||
if len(titles[t]) > count:
|
|
||||||
count = len(titles[t])
|
|
||||||
title = t
|
|
||||||
return title
|
|
||||||
|
|
||||||
#FIXME: does not work in python2.6, possible to import from __future__?
|
|
||||||
#types = {type: select_title(type) for type in types}
|
|
||||||
_types = {}
|
|
||||||
for type in types:
|
|
||||||
_types[type] = select_title(type)
|
|
||||||
types = _types
|
|
||||||
|
|
||||||
regexps = [
|
|
||||||
"^.+ \(imdb display title\) \(English title\)$",
|
|
||||||
"^USA \(imdb display title\)$",
|
|
||||||
"^International \(English title\)$",
|
|
||||||
"^International \(English title\)$",
|
|
||||||
"^UK \(imdb display title\)$",
|
|
||||||
"^International \(.+\) \(English title\)$",
|
|
||||||
"^World-wide \(English title\)$",
|
|
||||||
]
|
|
||||||
if 'Hong Kong' in self.get('country', []):
|
|
||||||
regexps += [
|
|
||||||
"Hong Kong \(English title\)"
|
|
||||||
]
|
|
||||||
english_countries = (
|
|
||||||
'USA', 'UK', 'United States', 'United Kingdom',
|
|
||||||
'Australia', 'New Zealand'
|
|
||||||
)
|
|
||||||
if not list(filter(lambda c: c in english_countries, self.get('country', []))):
|
|
||||||
regexps += [
|
|
||||||
"^[^(]+ \(English title\)$",
|
|
||||||
"^.+ \(.+\) \(English title\)$",
|
|
||||||
"^USA$",
|
|
||||||
"^UK$",
|
|
||||||
"^USA \(.+\)$",
|
|
||||||
"^UK \(.+\)$",
|
|
||||||
"^Australia \(.+\)$",
|
|
||||||
"World-wide \(English title\)",
|
|
||||||
"\(literal English title\)",
|
|
||||||
"^International \(.+ title\)$",
|
|
||||||
"^International \(.+\) \(.+ title\)$",
|
|
||||||
]
|
|
||||||
for regexp in regexps:
|
|
||||||
for type in types:
|
|
||||||
if re.compile(regexp).findall(type):
|
|
||||||
#print(types[type], type)
|
|
||||||
self['internationalTitle'] = types[type]
|
|
||||||
break
|
|
||||||
if 'internationalTitle' in self:
|
|
||||||
break
|
|
||||||
|
|
||||||
def cleanup_title(title):
|
def cleanup_title(title):
|
||||||
if title.startswith('"') and title.endswith('"'):
|
if title.startswith('"') and title.endswith('"'):
|
||||||
title = title[1:-1]
|
title = title[1:-1]
|
||||||
|
@ -464,44 +364,40 @@ class Imdb(SiteParser):
|
||||||
title = re.sub('\(\#[.\d]+\)', '', title)
|
title = re.sub('\(\#[.\d]+\)', '', title)
|
||||||
return title.strip()
|
return title.strip()
|
||||||
|
|
||||||
for t in ('title', 'internationalTitle'):
|
for t in ('title', 'originalTitle'):
|
||||||
if t in self:
|
if t in self:
|
||||||
self[t] = cleanup_title(self[t])
|
self[t] = cleanup_title(self[t])
|
||||||
|
|
||||||
if 'internationalTitle' in self and \
|
|
||||||
self.get('title', '').lower() == self['internationalTitle'].lower():
|
|
||||||
del self['internationalTitle']
|
|
||||||
|
|
||||||
if 'alternativeTitles' in self:
|
if 'alternativeTitles' in self:
|
||||||
alt = {}
|
alt = {}
|
||||||
for t in self['alternativeTitles']:
|
for t in self['alternativeTitles']:
|
||||||
title = cleanup_title(t[1])
|
title = cleanup_title(t[1])
|
||||||
if title not in (self.get('title'), self.get('internationalTitle')):
|
if title.lower() not in (self.get('title', '').lower(), self.get('originalTitle', '').lower()):
|
||||||
if title not in alt:
|
if title not in alt:
|
||||||
alt[title] = []
|
alt[title] = []
|
||||||
for c in t[0].split('/'):
|
for c in t[0].split('/'):
|
||||||
if not '(working title)' in c:
|
for cleanup in ('International', '(working title)', 'World-wide'):
|
||||||
c = c.replace('International', '').replace('World-wide', '').split('(')[0].strip()
|
c = c.replace(cleanup, '')
|
||||||
if c:
|
c = c.split('(')[0].strip()
|
||||||
alt[title].append(c)
|
if c:
|
||||||
|
alt[title].append(c)
|
||||||
self['alternativeTitles'] = []
|
self['alternativeTitles'] = []
|
||||||
for t in sorted(alt, key=lambda a: sorted(alt[a])):
|
for t in sorted(alt, key=lambda a: sorted(alt[a])):
|
||||||
countries = sorted([normalize_country_name(c) or c for c in alt[t]])
|
countries = sorted(set([normalize_country_name(c) or c for c in alt[t]]))
|
||||||
self['alternativeTitles'].append((t, countries))
|
self['alternativeTitles'].append((t, countries))
|
||||||
if not self['alternativeTitles']:
|
if not self['alternativeTitles']:
|
||||||
del self['alternativeTitles']
|
del self['alternativeTitles']
|
||||||
|
|
||||||
if 'internationalTitle' in self:
|
|
||||||
self['originalTitle'] = self['title']
|
|
||||||
self['title'] = self.pop('internationalTitle')
|
|
||||||
|
|
||||||
if 'runtime' in self and self['runtime']:
|
if 'runtime' in self and self['runtime']:
|
||||||
if 'min' in self['runtime']: base=60
|
if 'min' in self['runtime']:
|
||||||
else: base=1
|
base = 60
|
||||||
|
else:
|
||||||
|
base = 1
|
||||||
self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
|
self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
|
||||||
if 'runtime' in self and not self['runtime']:
|
if 'runtime' in self and not self['runtime']:
|
||||||
del self['runtime']
|
del self['runtime']
|
||||||
if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
|
if 'votes' in self:
|
||||||
|
self['votes'] = self['votes'].replace(',', '')
|
||||||
|
|
||||||
if 'cast' in self:
|
if 'cast' in self:
|
||||||
if isinstance(self['cast'][0], string_types):
|
if isinstance(self['cast'][0], string_types):
|
||||||
|
@ -829,7 +725,7 @@ def get_episodes(imdbId, season=None):
|
||||||
url += '?season=%d' % season
|
url += '?season=%d' % season
|
||||||
data = cache.read_url(url)
|
data = cache.read_url(url)
|
||||||
for e in re.compile('<div data-const="tt(\d{7})".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
|
for e in re.compile('<div data-const="tt(\d{7})".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
|
||||||
episodes['S%02dE%02d' %(int(e[1]), int(e[2]))] = e[0]
|
episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0]
|
||||||
else:
|
else:
|
||||||
data = cache.read_url(url)
|
data = cache.read_url(url)
|
||||||
match = re.compile('<strong>Season (\d+)</strong>').findall(data)
|
match = re.compile('<strong>Season (\d+)</strong>').findall(data)
|
||||||
|
|
Loading…
Reference in a new issue