From 35534254c348362205c92fc6ff483d084a863e55 Mon Sep 17 00:00:00 2001
From: j
Date: Wed, 2 Aug 2017 16:48:22 +0200
Subject: [PATCH] simple title detection (imdb)

---
 ox/net.py      |   6 +-
 ox/web/imdb.py | 152 ++++++++-----------------------------------------
 2 files changed, 27 insertions(+), 131 deletions(-)

diff --git a/ox/net.py b/ox/net.py
index 485fd3f..02c7156 100644
--- a/ox/net.py
+++ b/ox/net.py
@@ -16,11 +16,11 @@ from chardet.universaldetector import UniversalDetector
 DEBUG = False
 # Default headers for HTTP requests.
 DEFAULT_HEADERS = {
-    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0',
+    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0',
     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-    'Accept-Language': 'en-us,en;q=0.5',
-    'Accept-Encoding': 'gzip'
+    'Accept-Language': 'en-US,en;q=0.8,fr;q=0.6,de;q=0.4',
+    'Accept-Encoding': 'gzip',
 }
 
 def status(url, data=None, headers=None):
diff --git a/ox/web/imdb.py b/ox/web/imdb.py
index 2fa3024..cf93bef 100644
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -20,6 +20,8 @@ from ..geo import normalize_country_name
 
 def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
     headers = headers.copy()
+    # https://webapps.stackexchange.com/questions/11003/how-can-i-disable-reconfigure-imdbs-automatic-geo-location-so-it-does-not-defau
+    headers['X-Forwarded-For'] = '72.21.206.80'
     return cache.read_url(url, data, headers, timeout, unicode=unicode)
 
 def get_url(id):
@@ -174,6 +176,11 @@ class Imdb(SiteParser):
             ],
             'type': 'list'
         },
+        'originalTitle': {
+            'page': 'combined',
+            're': '(.*?) \(original title\)',
+            'type': 'string'
+        },
         'summary': {
             'page': 'plotsummary',
             're': '(.*?)<\/p>',
@@ -318,14 +325,14 @@ class Imdb(SiteParser):
     }
 
     def read_url(self, url, timeout):
-        if not url in self._cache:
+        if url not in self._cache:
             self._cache[url] = read_url(url, timeout=timeout, unicode=True)
         return self._cache[url]
 
     def __init__(self, id, timeout=-1):
         # use akas.imdb.com to always get original title:
         # http://www.imdb.com/help/show_leaf?titlelanguagedisplay
-        self.baseUrl = "http://akas.imdb.com/title/tt%s/" % id
+        self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
         super(Imdb, self).__init__(timeout)
 
         url = self.baseUrl + 'combined'
@@ -349,113 +356,6 @@ class Imdb(SiteParser):
         if 'sound' in self:
             self['sound'] = list(set(self['sound']))
 
-        types = {}
-        stop_words = [
-            'alternative spelling',
-            'alternative title',
-            'alternative transliteration',
-            'closing credits title',
-            'complete title',
-            'IMAX version',
-            'informal short title',
-            'International (Spanish title)',
-            'Japan (imdb display title)',
-            'longer version',
-            'new title',
-            'original subtitled version',
-            'pre-release title',
-            'promotional abbreviation',
-            'recut version',
-            'reissue title',
-            'restored version',
-            'script title',
-            'short title',
-            '(subtitle)',
-            'TV title',
-            'working title',
-            'World-wide (Spanish title)',
-        ]
-        #ignore english japanese titles
-        #for movies that are not only from japan
-        if ['Japan'] != self.get('country', []):
-            stop_words += [
-                'Japan (English title)'
-            ]
-        for t in self.get('alternativeTitles', []):
-            for type in t[0].split('/'):
-                type = type.strip()
-                stop_word = False
-                for key in stop_words:
-                    if key in type:
-                        stop_word = True
-                        break
-                if not stop_word:
-                    if not type in types:
-                        types[type] = []
-                    types[type].append(t[1])
-        titles = {}
-        for type in types:
-            for title in types[type]:
-                if not title in titles:
-                    titles[title] = []
-                titles[title].append(type)
-        def select_title(type):
-            title = types[type][0]
-            count = 0
-            if len(types[type]) > 1:
-                for t in types[type]:
-                    if len(titles[t]) > count:
-                        count = len(titles[t])
-                        title = t
-            return title
-
-        #FIXME: does work in python2.6, possible to import from __future__?
-        #types = {type: select_title(type) for type in types}
-        _types = {}
-        for type in types:
-            _types[type] = select_title(type)
-        types = _types
-
-        regexps = [
-            "^.+ \(imdb display title\) \(English title\)$",
-            "^USA \(imdb display title\)$",
-            "^International \(English title\)$",
-            "^International \(English title\)$",
-            "^UK \(imdb display title\)$",
-            "^International \(.+\) \(English title\)$",
-            "^World-wide \(English title\)$",
-        ]
-        if 'Hong Kong' in self.get('country', []):
-            regexps += [
-                "Hong Kong \(English title\)"
-            ]
-        english_countries = (
-            'USA', 'UK', 'United States', 'United Kingdom',
-            'Australia', 'New Zealand'
-        )
-        if not list(filter(lambda c: c in english_countries, self.get('country', []))):
-            regexps += [
-                "^[^(]+ \(English title\)$",
-                "^.+ \(.+\) \(English title\)$",
-                "^USA$",
-                "^UK$",
-                "^USA \(.+\)$",
-                "^UK \(.+\)$",
-                "^Australia \(.+\)$",
-                "World-wide \(English title\)",
-                "\(literal English title\)",
-                "^International \(.+ title\)$",
-                "^International \(.+\) \(.+ title\)$",
-            ]
-        for regexp in regexps:
-            for type in types:
-                if re.compile(regexp).findall(type):
-                    #print(types[type], type)
-                    self['internationalTitle'] = types[type]
-                    break
-            if 'internationalTitle' in self:
-                break
-
         def cleanup_title(title):
             if title.startswith('"') and title.endswith('"'):
                 title = title[1:-1]
@@ -464,44 +364,40 @@
             title = re.sub('\(\#[.\d]+\)', '', title)
             return title.strip()
 
-        for t in ('title', 'internationalTitle'):
+        for t in ('title', 'originalTitle'):
             if t in self:
                 self[t] = cleanup_title(self[t])
 
-        if 'internationalTitle' in self and \
-            self.get('title', '').lower() == self['internationalTitle'].lower():
-            del self['internationalTitle']
-
         if 'alternativeTitles' in self:
             alt = {}
             for t in self['alternativeTitles']:
                 title = cleanup_title(t[1])
-                if title not in (self.get('title'), self.get('internationalTitle')):
+                if title.lower() not in (self.get('title', '').lower(), self.get('originalTitle', '').lower()):
                     if title not in alt:
                         alt[title] = []
                     for c in t[0].split('/'):
-                        if not '(working title)' in c:
-                            c = c.replace('International', '').replace('World-wide', '').split('(')[0].strip()
-                            if c:
-                                alt[title].append(c)
+                        for cleanup in ('International', '(working title)', 'World-wide'):
+                            c = c.replace(cleanup, '')
+                        c = c.split('(')[0].strip()
+                        if c:
+                            alt[title].append(c)
             self['alternativeTitles'] = []
             for t in sorted(alt, key=lambda a: sorted(alt[a])):
-                countries = sorted([normalize_country_name(c) or c for c in alt[t]])
+                countries = sorted(set([normalize_country_name(c) or c for c in alt[t]]))
                 self['alternativeTitles'].append((t, countries))
             if not self['alternativeTitles']:
                 del self['alternativeTitles']
 
-        if 'internationalTitle' in self:
-            self['originalTitle'] = self['title']
-            self['title'] = self.pop('internationalTitle')
-
         if 'runtime' in self and self['runtime']:
-            if 'min' in self['runtime']: base=60
-            else: base=1
+            if 'min' in self['runtime']:
+                base = 60
+            else:
+                base = 1
             self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
         if 'runtime' in self and not self['runtime']:
             del self['runtime']
-        if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
+        if 'votes' in self:
+            self['votes'] = self['votes'].replace(',', '')
 
         if 'cast' in self:
             if isinstance(self['cast'][0], string_types):
@@ -829,7 +725,7 @@ def get_episodes(imdbId, season=None):
         url += '?season=%d' % season
         data = cache.read_url(url)
         for e in re.compile('.*?S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
-            episodes['S%02dE%02d' %(int(e[1]), int(e[2]))] = e[0]
+            episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0]
     else:
         data = cache.read_url(url)
         match = re.compile('Season (\d+)').findall(data)
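
Note on the X-Forwarded-For change in read_url(): 72.21.206.80 is a US address,
so IMDb geo-locates the request to the US and serves original rather than
locally translated pages (see the linked webapps.stackexchange question). A
minimal stdlib sketch of the same technique, independent of ox's caching layer;
the title id is a placeholder:

    from urllib.request import Request, urlopen

    def read_url_us(url):
        # Claim the request was forwarded for a US client so IMDb
        # geo-locates it to the US and keeps titles untranslated.
        req = Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0',
            'X-Forwarded-For': '72.21.206.80',  # same US IP as the patch
        })
        return urlopen(req).read()

    html = read_url_us('http://www.imdb.com/title/tt0000001/combined')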
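The big deleted block guessed an internationalTitle by scoring alternative
titles against stop words and per-country regexps; the patch replaces all of
that with one originalTitle field scraped from the combined page plus the
existing cleanup_title() normalization. A sketch of how the kept pieces fit
together, using made-up parse results rather than real IMDb data:

    import re

    def cleanup_title(title):
        # Same normalization the patch keeps: strip surrounding
        # quotes, then serial markers like (#1.5).
        if title.startswith('"') and title.endswith('"'):
            title = title[1:-1]
        title = re.sub('\(\#[.\d]+\)', '', title)
        return title.strip()

    meta = {'title': 'The Lives of Others',
            'originalTitle': 'Das Leben der Anderen'}  # hypothetical values
    for t in ('title', 'originalTitle'):
        if t in meta:
            meta[t] = cleanup_title(meta[t])

    # Alternative titles that duplicate either field are dropped,
    # per the case-insensitive check the patch introduces:
    candidate = cleanup_title('"Das Leben der Anderen"')
    is_duplicate = candidate.lower() in (meta.get('title', '').lower(),
                                         meta.get('originalTitle', '').lower())
    assert is_duplicate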
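The runtime hunk only reshapes the old one-liners into blocks; behavior is
unchanged: IMDb reports strings like '96 min', and the first number is scaled
by 60 when the value is in minutes, so the stored figure ends up in seconds.
A worked sketch (ox's find_re reduces to the first regex match, approximated
here with re directly; the input string is hypothetical):

    import re

    runtime = '96 min'
    base = 60 if 'min' in runtime else 1
    seconds = int(re.compile('([0-9]+)').findall(runtime)[0]) * base
    assert seconds == 5760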
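In the last hunk, get_episodes() keys its result as S%02dE%02d, zero-padded so
keys sort naturally, with e[0] (the regex's first capture group) as the value.
A toy sketch with made-up tuples in the shape the loop expects:

    episodes = {}
    # (value, season, episode) tuples standing in for regex matches:
    for e in [('tt0000001', '1', '1'), ('tt0000002', '1', '2'), ('tt0000003', '2', '10')]:
        episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0]
    assert episodes == {'S01E01': 'tt0000001',
                        'S01E02': 'tt0000002',
                        'S02E10': 'tt0000003'}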