From 35534254c348362205c92fc6ff483d084a863e55 Mon Sep 17 00:00:00 2001
From: j
Date: Wed, 2 Aug 2017 16:48:22 +0200
Subject: [PATCH] simple title detection (imdb)

---
 ox/net.py      |   6 +-
 ox/web/imdb.py | 152 ++++++++-----------------------------------------
 2 files changed, 27 insertions(+), 131 deletions(-)

diff --git a/ox/net.py b/ox/net.py
index 485fd3f..02c7156 100644
--- a/ox/net.py
+++ b/ox/net.py
@@ -16,11 +16,11 @@ from chardet.universaldetector import UniversalDetector
 DEBUG = False
 # Default headers for HTTP requests.
 DEFAULT_HEADERS = {
-    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0',
+    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0',
     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-    'Accept-Language': 'en-us,en;q=0.5',
-    'Accept-Encoding': 'gzip'
+    'Accept-Language': 'en-US,en;q=0.8,fr;q=0.6,de;q=0.4',
+    'Accept-Encoding': 'gzip',
 }
 
 def status(url, data=None, headers=None):
diff --git a/ox/web/imdb.py b/ox/web/imdb.py
index 2fa3024..cf93bef 100644
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -20,6 +20,8 @@ from ..geo import normalize_country_name
 
 def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
     headers = headers.copy()
+    # https://webapps.stackexchange.com/questions/11003/how-can-i-disable-reconfigure-imdbs-automatic-geo-location-so-it-does-not-defau
+    headers['X-Forwarded-For'] = '72.21.206.80'
     return cache.read_url(url, data, headers, timeout, unicode=unicode)
 
 def get_url(id):
@@ -174,6 +176,11 @@ class Imdb(SiteParser):
             ],
             'type': 'list'
         },
+        'originalTitle': {
+            'page': 'combined',
+            're': '(.*?) \(original title\)',
+            'type': 'string'
+        },
         'summary': {
             'page': 'plotsummary',
             're': '(.*?)<\/p>',
@@ -318,14 +325,14 @@ class Imdb(SiteParser):
     }
 
     def read_url(self, url, timeout):
-        if not url in self._cache:
+        if url not in self._cache:
             self._cache[url] = read_url(url, timeout=timeout, unicode=True)
         return self._cache[url]
 
     def __init__(self, id, timeout=-1):
         # use akas.imdb.com to always get original title:
         # http://www.imdb.com/help/show_leaf?titlelanguagedisplay
-        self.baseUrl = "http://akas.imdb.com/title/tt%s/" % id
+        self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
         super(Imdb, self).__init__(timeout)
 
         url = self.baseUrl + 'combined'
@@ -349,113 +356,6 @@ class Imdb(SiteParser):
         if 'sound' in self:
             self['sound'] = list(set(self['sound']))
 
-        types = {}
-        stop_words = [
-            'alternative spelling',
-            'alternative title',
-            'alternative transliteration',
-            'closing credits title',
-            'complete title',
-            'IMAX version',
-            'informal short title',
-            'International (Spanish title)',
-            'Japan (imdb display title)',
-            'longer version',
-            'new title',
-            'original subtitled version',
-            'pre-release title',
-            'promotional abbreviation',
-            'recut version',
-            'reissue title',
-            'restored version',
-            'script title',
-            'short title',
-            '(subtitle)',
-            'TV title',
-            'working title',
-            'World-wide (Spanish title)',
-        ]
-        #ignore english japanese titles
-        #for movies that are not only from japan
-        if ['Japan'] != self.get('country', []):
-            stop_words += [
-                'Japan (English title)'
-            ]
-        for t in self.get('alternativeTitles', []):
-            for type in t[0].split('/'):
-                type = type.strip()
-                stop_word = False
-                for key in stop_words:
-                    if key in type:
-                        stop_word = True
-                        break
-                if not stop_word:
-                    if not type in types:
-                        types[type] = []
-                    types[type].append(t[1])
-        titles = {}
-        for type in types:
-            for title in types[type]:
-                if not title in titles:
-                    titles[title] = []
-                titles[title].append(type)
-        def select_title(type):
-            title = types[type][0]
-            count = 0
-            if len(types[type]) > 1:
-                for t in types[type]:
-                    if len(titles[t]) > count:
-                        count = len(titles[t])
-                        title = t
-            return title
-
-        #FIXME: does work in python2.6, possible to import from __future__?
-        #types = {type: select_title(type) for type in types}
-        _types = {}
-        for type in types:
-            _types[type] = select_title(type)
-        types = _types
-
-        regexps = [
-            "^.+ \(imdb display title\) \(English title\)$",
-            "^USA \(imdb display title\)$",
-            "^International \(English title\)$",
-            "^International \(English title\)$",
-            "^UK \(imdb display title\)$",
-            "^International \(.+\) \(English title\)$",
-            "^World-wide \(English title\)$",
-        ]
-        if 'Hong Kong' in self.get('country', []):
-            regexps += [
-                "Hong Kong \(English title\)"
-            ]
-        english_countries = (
-            'USA', 'UK', 'United States', 'United Kingdom',
-            'Australia', 'New Zealand'
-        )
-        if not list(filter(lambda c: c in english_countries, self.get('country', []))):
-            regexps += [
-                "^[^(]+ \(English title\)$",
-                "^.+ \(.+\) \(English title\)$",
-                "^USA$",
-                "^UK$",
-                "^USA \(.+\)$",
-                "^UK \(.+\)$",
-                "^Australia \(.+\)$",
-                "World-wide \(English title\)",
-                "\(literal English title\)",
-                "^International \(.+ title\)$",
-                "^International \(.+\) \(.+ title\)$",
-            ]
-        for regexp in regexps:
-            for type in types:
-                if re.compile(regexp).findall(type):
-                    #print(types[type], type)
-                    self['internationalTitle'] = types[type]
-                    break
-            if 'internationalTitle' in self:
-                break
-
         def cleanup_title(title):
             if title.startswith('"') and title.endswith('"'):
                 title = title[1:-1]
@@ -464,44 +364,40 @@
             title = re.sub('\(\#[.\d]+\)', '', title)
             return title.strip()
 
-        for t in ('title', 'internationalTitle'):
+        for t in ('title', 'originalTitle'):
             if t in self:
                 self[t] = cleanup_title(self[t])
 
-        if 'internationalTitle' in self and \
-            self.get('title', '').lower() == self['internationalTitle'].lower():
-            del self['internationalTitle']
-
         if 'alternativeTitles' in self:
             alt = {}
             for t in self['alternativeTitles']:
                 title = cleanup_title(t[1])
-                if title not in (self.get('title'), self.get('internationalTitle')):
+                if title.lower() not in (self.get('title', '').lower(), self.get('originalTitle', '').lower()):
                     if title not in alt:
                         alt[title] = []
                     for c in t[0].split('/'):
-                        if not '(working title)' in c:
-                            c = c.replace('International', '').replace('World-wide', '').split('(')[0].strip()
-                            if c:
-                                alt[title].append(c)
+                        for cleanup in ('International', '(working title)', 'World-wide'):
+                            c = c.replace(cleanup, '')
+                        c = c.split('(')[0].strip()
+                        if c:
+                            alt[title].append(c)
             self['alternativeTitles'] = []
             for t in sorted(alt, key=lambda a: sorted(alt[a])):
-                countries = sorted([normalize_country_name(c) or c for c in alt[t]])
+                countries = sorted(set([normalize_country_name(c) or c for c in alt[t]]))
                 self['alternativeTitles'].append((t, countries))
             if not self['alternativeTitles']:
                 del self['alternativeTitles']
 
-        if 'internationalTitle' in self:
-            self['originalTitle'] = self['title']
-            self['title'] = self.pop('internationalTitle')
-
         if 'runtime' in self and self['runtime']:
-            if 'min' in self['runtime']: base=60
-            else: base=1
+            if 'min' in self['runtime']:
+                base = 60
+            else:
+                base = 1
             self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
         if 'runtime' in self and not self['runtime']:
             del self['runtime']
-        if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
+        if 'votes' in self:
+            self['votes'] = self['votes'].replace(',', '')
 
         if 'cast' in self:
             if isinstance(self['cast'][0], string_types):
@@ -829,7 +725,7 @@ def get_episodes(imdbId, season=None):
         url += '?season=%d' % season
         data = cache.read_url(url)
         for e in re.compile('.*?S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
-            episodes['S%02dE%02d' %(int(e[1]), int(e[2]))] = e[0]
+            episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0]
     else:
         data = cache.read_url(url)
         match = re.compile('Season (\d+)').findall(data)
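
Note on the X-Forwarded-For change in read_url(): 72.21.206.80 is a US address,
so IMDb geo-locates the request to the US and serves original rather than
locally translated pages (see the linked webapps.stackexchange question). A
minimal stdlib sketch of the same technique, independent of ox's caching layer;
the title id is a placeholder:

    from urllib.request import Request, urlopen

    def read_url_us(url):
        # Claim the request was forwarded for a US client so IMDb
        # geo-locates it to the US and keeps titles untranslated.
        req = Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0',
            'X-Forwarded-For': '72.21.206.80',  # same US IP as the patch
        })
        return urlopen(req).read()

    html = read_url_us('http://www.imdb.com/title/tt0000001/combined')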
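The big deleted block guessed an internationalTitle by scoring alternative
titles against stop words and per-country regexps; the patch replaces all of
that with one originalTitle field scraped from the combined page plus the
existing cleanup_title() normalization. A sketch of how the kept pieces fit
together, using made-up parse results rather than real IMDb data:

    import re

    def cleanup_title(title):
        # Same normalization the patch keeps: strip surrounding
        # quotes, then serial markers like (#1.5).
        if title.startswith('"') and title.endswith('"'):
            title = title[1:-1]
        title = re.sub('\(\#[.\d]+\)', '', title)
        return title.strip()

    meta = {'title': 'The Lives of Others',
            'originalTitle': 'Das Leben der Anderen'}  # hypothetical values
    for t in ('title', 'originalTitle'):
        if t in meta:
            meta[t] = cleanup_title(meta[t])

    # Alternative titles that duplicate either field are dropped,
    # per the case-insensitive check the patch introduces:
    candidate = cleanup_title('"Das Leben der Anderen"')
    is_duplicate = candidate.lower() in (meta.get('title', '').lower(),
                                         meta.get('originalTitle', '').lower())
    assert is_duplicate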
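The runtime hunk only reshapes the old one-liners into blocks; behavior is
unchanged: IMDb reports strings like '96 min', and the first number is scaled
by 60 when the value is in minutes, so the stored figure ends up in seconds.
A worked sketch (ox's find_re reduces to the first regex match, approximated
here with re directly; the input string is hypothetical):

    import re

    runtime = '96 min'
    base = 60 if 'min' in runtime else 1
    seconds = int(re.compile('([0-9]+)').findall(runtime)[0]) * base
    assert seconds == 5760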
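In the last hunk, get_episodes() keys its result as S%02dE%02d, zero-padded so
keys sort naturally, with e[0] (the regex's first capture group) as the value.
A toy sketch with made-up tuples in the shape the loop expects:

    episodes = {}
    # (value, season, episode) tuples standing in for regex matches:
    for e in [('tt0000001', '1', '1'), ('tt0000002', '1', '2'), ('tt0000003', '2', '10')]:
        episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0]
    assert episodes == {'S01E01': 'tt0000001',
                        'S01E02': 'tt0000002',
                        'S02E10': 'tt0000003'}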