simple title detection (imdb)
parent b7b4b09f0f
commit 35534254c3
2 changed files with 27 additions and 131 deletions
@@ -16,11 +16,11 @@ from chardet.universaldetector import UniversalDetector
 DEBUG = False
 # Default headers for HTTP requests.
 DEFAULT_HEADERS = {
-    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0',
+    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0',
     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-    'Accept-Language': 'en-us,en;q=0.5',
-    'Accept-Encoding': 'gzip'
+    'Accept-Language': 'en-US,en;q=0.8,fr;q=0.6,de;q=0.4',
+    'Accept-Encoding': 'gzip',
 }

 def status(url, data=None, headers=None):
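Since the defaults advertise 'Accept-Encoding: gzip', a caller that bypasses ox's own helpers has to decompress the body itself. A minimal standard-library sketch; fetch_example and its URL handling are illustrative names, not code from this repository:

import gzip
import urllib.request

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.8,fr;q=0.6,de;q=0.4',
    'Accept-Encoding': 'gzip',
}

def fetch_example(url):
    # Send the default headers and transparently unpack a gzip'd response body.
    request = urllib.request.Request(url, headers=HEADERS)
    with urllib.request.urlopen(request) as response:
        body = response.read()
        if response.headers.get('Content-Encoding') == 'gzip':
            body = gzip.decompress(body)
    return body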
ox/web/imdb.py (146 changed lines)
@@ -20,6 +20,8 @@ from ..geo import normalize_country_name

 def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
     headers = headers.copy()
+    # https://webapps.stackexchange.com/questions/11003/how-can-i-disable-reconfigure-imdbs-automatic-geo-location-so-it-does-not-defau
+    headers['X-Forwarded-For'] = '72.21.206.80'
     return cache.read_url(url, data, headers, timeout, unicode=unicode)

 def get_url(id):
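One detail worth noting above: the headers argument defaults to the shared cache.DEFAULT_HEADERS dict, so the headers.copy() line is what keeps the X-Forwarded-For override from leaking into every later request. A small standalone sketch of that pitfall, with illustrative names rather than repository code:

DEFAULTS = {'Accept-Encoding': 'gzip'}

def read_url_leaky(url, headers=DEFAULTS):
    # No copy: this writes straight into the module-level DEFAULTS dict.
    headers['X-Forwarded-For'] = '72.21.206.80'
    return url, headers

def read_url_safe(url, headers=DEFAULTS):
    headers = headers.copy()  # per-call copy, the shared defaults stay untouched
    headers['X-Forwarded-For'] = '72.21.206.80'
    return url, headers

read_url_leaky('http://example.com')
assert 'X-Forwarded-For' in DEFAULTS      # the default dict was mutated
DEFAULTS.pop('X-Forwarded-For')
read_url_safe('http://example.com')
assert 'X-Forwarded-For' not in DEFAULTS  # the copy kept it clean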
@@ -174,6 +176,11 @@ class Imdb(SiteParser):
             ],
             'type': 'list'
         },
+        'originalTitle': {
+            'page': 'combined',
+            're': '<span class="title-extra">(.*?) <i>\(original title\)</i>',
+            'type': 'string'
+        },
         'summary': {
             'page': 'plotsummary',
             're': '<p class="plotSummary">(.*?)<\/p>',
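As a rough illustration of how the new 'originalTitle' pattern behaves: applied to a fragment shaped like the old combined page, the lazy group captures everything up to the ' (original title)' marker. The sample markup below is invented for the example:

import re

html = '<span class="title-extra">Le Samouraï <i>(original title)</i></span>'
pattern = re.compile(r'<span class="title-extra">(.*?) <i>\(original title\)</i>')
print(pattern.findall(html))  # ['Le Samouraï']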
@@ -318,14 +325,14 @@ class Imdb(SiteParser):
     }

     def read_url(self, url, timeout):
-        if not url in self._cache:
+        if url not in self._cache:
             self._cache[url] = read_url(url, timeout=timeout, unicode=True)
         return self._cache[url]

     def __init__(self, id, timeout=-1):
         # use akas.imdb.com to always get original title:
         # http://www.imdb.com/help/show_leaf?titlelanguagedisplay
-        self.baseUrl = "http://akas.imdb.com/title/tt%s/" % id
+        self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
         super(Imdb, self).__init__(timeout)

         url = self.baseUrl + 'combined'
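The read_url method in this hunk is a small per-instance memoization: each Imdb object keeps a dict keyed by URL, so every sub-page of one title is fetched at most once per parse. A generic sketch of the same pattern, with placeholder names:

class PageCache:
    # Toy version of the per-URL memoization above; fetch is any callable
    # that takes a URL and returns its body.
    def __init__(self, fetch):
        self._fetch = fetch
        self._cache = {}

    def read_url(self, url):
        if url not in self._cache:
            self._cache[url] = self._fetch(url)
        return self._cache[url]

cache = PageCache(fetch=lambda url: 'body of %s' % url)
cache.read_url('http://example.com/combined')  # fetches
cache.read_url('http://example.com/combined')  # served from the cache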
@@ -349,113 +356,6 @@ class Imdb(SiteParser):
         if 'sound' in self:
             self['sound'] = list(set(self['sound']))

-        types = {}
-        stop_words = [
-            'alternative spelling',
-            'alternative title',
-            'alternative transliteration',
-            'closing credits title',
-            'complete title',
-            'IMAX version',
-            'informal short title',
-            'International (Spanish title)',
-            'Japan (imdb display title)',
-            'longer version',
-            'new title',
-            'original subtitled version',
-            'pre-release title',
-            'promotional abbreviation',
-            'recut version',
-            'reissue title',
-            'restored version',
-            'script title',
-            'short title',
-            '(subtitle)',
-            'TV title',
-            'working title',
-            'World-wide (Spanish title)',
-        ]
-        #ignore english japanese titles
-        #for movies that are not only from japan
-        if ['Japan'] != self.get('country', []):
-            stop_words += [
-                'Japan (English title)'
-            ]
-        for t in self.get('alternativeTitles', []):
-            for type in t[0].split('/'):
-                type = type.strip()
-                stop_word = False
-                for key in stop_words:
-                    if key in type:
-                        stop_word = True
-                        break
-                if not stop_word:
-                    if not type in types:
-                        types[type] = []
-                    types[type].append(t[1])
-        titles = {}
-        for type in types:
-            for title in types[type]:
-                if not title in titles:
-                    titles[title] = []
-                titles[title].append(type)
-        def select_title(type):
-            title = types[type][0]
-            count = 0
-            if len(types[type]) > 1:
-                for t in types[type]:
-                    if len(titles[t]) > count:
-                        count = len(titles[t])
-                        title = t
-            return title
-
-        #FIXME: does work in python2.6, possible to import from __future__?
-        #types = {type: select_title(type) for type in types}
-        _types = {}
-        for type in types:
-            _types[type] = select_title(type)
-        types = _types
-
-        regexps = [
-            "^.+ \(imdb display title\) \(English title\)$",
-            "^USA \(imdb display title\)$",
-            "^International \(English title\)$",
-            "^International \(English title\)$",
-            "^UK \(imdb display title\)$",
-            "^International \(.+\) \(English title\)$",
-            "^World-wide \(English title\)$",
-        ]
-        if 'Hong Kong' in self.get('country', []):
-            regexps += [
-                "Hong Kong \(English title\)"
-            ]
-        english_countries = (
-            'USA', 'UK', 'United States', 'United Kingdom',
-            'Australia', 'New Zealand'
-        )
-        if not list(filter(lambda c: c in english_countries, self.get('country', []))):
-            regexps += [
-                "^[^(]+ \(English title\)$",
-                "^.+ \(.+\) \(English title\)$",
-                "^USA$",
-                "^UK$",
-                "^USA \(.+\)$",
-                "^UK \(.+\)$",
-                "^Australia \(.+\)$",
-                "World-wide \(English title\)",
-                "\(literal English title\)",
-                "^International \(.+ title\)$",
-                "^International \(.+\) \(.+ title\)$",
-            ]
-        for regexp in regexps:
-            for type in types:
-                if re.compile(regexp).findall(type):
-                    #print(types[type], type)
-                    self['internationalTitle'] = types[type]
-                    break
-            if 'internationalTitle' in self:
-                break

         def cleanup_title(title):
             if title.startswith('"') and title.endswith('"'):
                 title = title[1:-1]
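cleanup_title, whose body continues in the next hunk, strips surrounding quotes and removes episode markers such as '(#1.5)'. A standalone illustration with invented sample titles; the helper below mirrors the one in the diff, using raw strings for the regex:

import re

def cleanup_title(title):
    # Drop surrounding quotes, then remove episode-number markers like "(#2.7)".
    if title.startswith('"') and title.endswith('"'):
        title = title[1:-1]
    title = re.sub(r'\(\#[.\d]+\)', '', title)
    return title.strip()

print(cleanup_title('"Twin Peaks"'))  # Twin Peaks
print(cleanup_title('Pilot (#1.1)'))  # Pilot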
@@ -464,44 +364,40 @@ class Imdb(SiteParser):
             title = re.sub('\(\#[.\d]+\)', '', title)
             return title.strip()

-        for t in ('title', 'internationalTitle'):
+        for t in ('title', 'originalTitle'):
             if t in self:
                 self[t] = cleanup_title(self[t])

-        if 'internationalTitle' in self and \
-            self.get('title', '').lower() == self['internationalTitle'].lower():
-            del self['internationalTitle']
-
         if 'alternativeTitles' in self:
             alt = {}
             for t in self['alternativeTitles']:
                 title = cleanup_title(t[1])
-                if title not in (self.get('title'), self.get('internationalTitle')):
+                if title.lower() not in (self.get('title', '').lower(), self.get('originalTitle', '').lower()):
                     if title not in alt:
                         alt[title] = []
                     for c in t[0].split('/'):
-                        if not '(working title)' in c:
-                            c = c.replace('International', '').replace('World-wide', '').split('(')[0].strip()
+                        for cleanup in ('International', '(working title)', 'World-wide'):
+                            c = c.replace(cleanup, '')
+                        c = c.split('(')[0].strip()
                         if c:
                             alt[title].append(c)
             self['alternativeTitles'] = []
             for t in sorted(alt, key=lambda a: sorted(alt[a])):
-                countries = sorted([normalize_country_name(c) or c for c in alt[t]])
+                countries = sorted(set([normalize_country_name(c) or c for c in alt[t]]))
                 self['alternativeTitles'].append((t, countries))
             if not self['alternativeTitles']:
                 del self['alternativeTitles']

-        if 'internationalTitle' in self:
-            self['originalTitle'] = self['title']
-            self['title'] = self.pop('internationalTitle')
-
         if 'runtime' in self and self['runtime']:
-            if 'min' in self['runtime']: base=60
-            else: base=1
+            if 'min' in self['runtime']:
+                base = 60
+            else:
+                base = 1
             self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
         if 'runtime' in self and not self['runtime']:
             del self['runtime']
-        if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
+        if 'votes' in self:
+            self['votes'] = self['votes'].replace(',', '')

         if 'cast' in self:
             if isinstance(self['cast'][0], string_types):
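Two small worked examples of the normalisation above, with invented sample values: a runtime containing 'min' is multiplied by 60 (so '142 min' becomes 8520), and votes simply lose their thousands separators. find_first_number below is a stand-in for the find_re(value, '([0-9]+)') call used in the diff:

import re

def find_first_number(value):
    # Stand-in for find_re: return the first run of digits, or '' if none.
    match = re.search('([0-9]+)', value)
    return match.group(1) if match else ''

runtime = '142 min'
base = 60 if 'min' in runtime else 1
print(int(find_first_number(runtime)) * base)  # 8520

votes = '1,234,567'
print(votes.replace(',', ''))  # 1234567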