simple title detection (imdb)

2017-08-02 16:48:22 +02:00 · 2017-08-02 16:48:22 +02:00 · 35534254c3
commit 35534254c3
parent b7b4b09f0f
2 changed files with 27 additions and 131 deletions
--- a/ox/net.py
+++ b/ox/net.py
@@ -16,11 +16,11 @@ from chardet.universaldetector import UniversalDetector
 DEBUG = False
 # Default headers for HTTP requests.
 DEFAULT_HEADERS = {
-    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0',
+    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-    'Accept-Language': 'en-us,en;q=0.5',
+    'Accept-Language': 'en-US,en;q=0.8,fr;q=0.6,de;q=0.4',
-    'Accept-Encoding': 'gzip'
+    'Accept-Encoding': 'gzip',
 }
 def status(url, data=None, headers=None):
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -20,6 +20,8 @@ from ..geo import normalize_country_name
 def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
    headers = headers.copy()
    # https://webapps.stackexchange.com/questions/11003/how-can-i-disable-reconfigure-imdbs-automatic-geo-location-so-it-does-not-defau
    headers['X-Forwarded-For'] = '72.21.206.80'
    return cache.read_url(url, data, headers, timeout, unicode=unicode)
 def get_url(id):
@@ -174,6 +176,11 @@ class Imdb(SiteParser):
            ],
            'type': 'list'
        },
        'originalTitle': {
            'page': 'combined',
            're': '<span class="title-extra">(.*?) <i>\(original title\)</i>',
            'type': 'string'
        },
        'summary': {
            'page': 'plotsummary',
            're': '<p class="plotSummary">(.*?)<\/p>',
@@ -318,14 +325,14 @@ class Imdb(SiteParser):
    }
    def read_url(self, url, timeout):
-        if not url in self._cache:
+        if url not in self._cache:
            self._cache[url] = read_url(url, timeout=timeout, unicode=True)
        return self._cache[url]
    def __init__(self, id, timeout=-1):
        # use akas.imdb.com to always get original title:
        # http://www.imdb.com/help/show_leaf?titlelanguagedisplay
-        self.baseUrl = "http://akas.imdb.com/title/tt%s/" % id
+        self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
        super(Imdb, self).__init__(timeout)
        url = self.baseUrl + 'combined' 
@@ -349,113 +356,6 @@ class Imdb(SiteParser):
        if 'sound' in self:
            self['sound'] = list(set(self['sound']))
        types = {}
        stop_words = [ 
            'alternative spelling',
            'alternative title',
            'alternative transliteration',
            'closing credits title',
            'complete title',
            'IMAX version',
            'informal short title',
            'International (Spanish title)',
            'Japan (imdb display title)',
            'longer version',
            'new title',
            'original subtitled version',
            'pre-release title',
            'promotional abbreviation',
            'recut version',
            'reissue title',
            'restored version',
            'script title',
            'short title',
            '(subtitle)',
            'TV title',
            'working title',
            'World-wide (Spanish title)',
        ]
        #ignore english japanese titles
        #for movies that are not only from japan
        if ['Japan'] != self.get('country', []):
            stop_words += [
                'Japan (English title)'
            ]
        for t in self.get('alternativeTitles', []):
            for type in t[0].split('/'):
                type = type.strip()
                stop_word = False
                for key in stop_words:
                    if key in type:
                        stop_word = True
                        break
                if not stop_word:
                    if not type in types:
                        types[type] = []
                    types[type].append(t[1])
        titles = {}
        for type in types:
            for title in types[type]:
                if not title in titles:
                    titles[title] = []
                titles[title].append(type)
        def select_title(type):
            title = types[type][0]
            count = 0
            if len(types[type]) > 1:
                for t in types[type]:
                    if len(titles[t]) > count:
                        count = len(titles[t])
                        title = t
            return title
        #FIXME: does work in python2.6, possible to import from __future__?
        #types = {type: select_title(type) for type in types}
        _types = {}
        for type in types:
            _types[type] = select_title(type)
        types = _types
        regexps = [
            "^.+ \(imdb display title\) \(English title\)$",
            "^USA \(imdb display title\)$",
            "^International \(English title\)$",
            "^International \(English title\)$",
            "^UK \(imdb display title\)$",
            "^International \(.+\) \(English title\)$",
            "^World-wide \(English title\)$",
        ]
        if 'Hong Kong' in self.get('country', []):
            regexps += [
                "Hong Kong \(English title\)"
            ]
        english_countries = (
            'USA', 'UK', 'United States', 'United Kingdom',
            'Australia', 'New Zealand'
        )
        if not list(filter(lambda c: c in english_countries, self.get('country', []))):
            regexps += [
                "^[^(]+ \(English title\)$",
                "^.+ \(.+\) \(English title\)$",
                "^USA$",
                "^UK$",
                "^USA \(.+\)$",
                "^UK \(.+\)$",
                "^Australia \(.+\)$",
                "World-wide \(English title\)",
                "\(literal English title\)",
                "^International \(.+ title\)$",
                "^International \(.+\) \(.+ title\)$",
            ]
        for regexp in regexps:
            for type in types:
                if re.compile(regexp).findall(type):
                    #print(types[type], type)
                    self['internationalTitle'] = types[type]
                    break
            if 'internationalTitle' in self:
                break
        def cleanup_title(title):
            if title.startswith('"') and title.endswith('"'):
                title = title[1:-1]
@@ -464,44 +364,40 @@ class Imdb(SiteParser):
            title = re.sub('\(\#[.\d]+\)', '', title)
            return title.strip()
-        for t in ('title', 'internationalTitle'):
+        for t in ('title', 'originalTitle'):
            if t in self:
                self[t] = cleanup_title(self[t])
        if 'internationalTitle' in self and \
            self.get('title', '').lower() == self['internationalTitle'].lower():
            del self['internationalTitle']
        if 'alternativeTitles' in self:
            alt = {}
            for t in self['alternativeTitles']:
                title = cleanup_title(t[1])
-                if title not in (self.get('title'), self.get('internationalTitle')):
+                if title.lower() not in (self.get('title', '').lower(), self.get('originalTitle', '').lower()):
                    if title not in alt:
                        alt[title] = []
                    for c in t[0].split('/'):
-                        if not '(working title)' in c:
+                        for cleanup in ('International', '(working title)', 'World-wide'):
-                            c = c.replace('International', '').replace('World-wide', '').split('(')[0].strip()
+                            c = c.replace(cleanup, '')
-                            if c:
+                        c = c.split('(')[0].strip()
-                                alt[title].append(c)
+                        if c:
                            alt[title].append(c)
            self['alternativeTitles'] = []
            for t in sorted(alt, key=lambda a: sorted(alt[a])):
-                countries = sorted([normalize_country_name(c) or c for c in alt[t]])
+                countries = sorted(set([normalize_country_name(c) or c for c in alt[t]]))
                self['alternativeTitles'].append((t, countries))
            if not self['alternativeTitles']:
                del self['alternativeTitles']
        if 'internationalTitle' in self:
            self['originalTitle'] = self['title']
            self['title'] = self.pop('internationalTitle')
        if 'runtime' in self and self['runtime']:
-            if 'min' in self['runtime']: base=60
+            if 'min' in self['runtime']:
-            else: base=1
+                base = 60
            else:
                base = 1
            self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
        if 'runtime' in self and not self['runtime']:
            del self['runtime']
-        if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
+        if 'votes' in self:
            self['votes'] = self['votes'].replace(',', '')
        if 'cast' in self:
            if isinstance(self['cast'][0], string_types):
@@ -829,7 +725,7 @@ def get_episodes(imdbId, season=None):
        url += '?season=%d' % season
        data = cache.read_url(url)
        for e in re.compile('<div data-const="tt(\d{7})".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
-            episodes['S%02dE%02d' %(int(e[1]), int(e[2]))] = e[0]
+            episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0]
    else:
        data = cache.read_url(url)
        match = re.compile('<strong>Season (\d+)</strong>').findall(data)