simple title detection (imdb)

2017-08-02 16:48:22 +02:00 · 2017-08-02 16:48:22 +02:00 · 35534254c3
commit 35534254c3
parent b7b4b09f0f
2 changed files with 27 additions and 131 deletions
--- a/ox/net.py
+++ b/ox/net.py
@@ -16,11 +16,11 @@ from chardet.universaldetector import UniversalDetector
 DEBUG = False
 # Default headers for HTTP requests.
 DEFAULT_HEADERS = {
-    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0',
+    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-    'Accept-Language': 'en-us,en;q=0.5',
-    'Accept-Encoding': 'gzip'
+    'Accept-Language': 'en-US,en;q=0.8,fr;q=0.6,de;q=0.4',
+    'Accept-Encoding': 'gzip',
 }

 def status(url, data=None, headers=None):
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -20,6 +20,8 @@ from ..geo import normalize_country_name

 def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
    headers = headers.copy()
+    # https://webapps.stackexchange.com/questions/11003/how-can-i-disable-reconfigure-imdbs-automatic-geo-location-so-it-does-not-defau
+    headers['X-Forwarded-For'] = '72.21.206.80'
    return cache.read_url(url, data, headers, timeout, unicode=unicode)

 def get_url(id):
@@ -174,6 +176,11 @@ class Imdb(SiteParser):
            ],
            'type': 'list'
        },
+        'originalTitle': {
+            'page': 'combined',
+            're': '<span class="title-extra">(.*?) <i>\(original title\)</i>',
+            'type': 'string'
+        },
        'summary': {
            'page': 'plotsummary',
            're': '<p class="plotSummary">(.*?)<\/p>',
@@ -318,14 +325,14 @@ class Imdb(SiteParser):
    }

    def read_url(self, url, timeout):
-        if not url in self._cache:
+        if url not in self._cache:
            self._cache[url] = read_url(url, timeout=timeout, unicode=True)
        return self._cache[url]

    def __init__(self, id, timeout=-1):
        # use akas.imdb.com to always get original title:
        # http://www.imdb.com/help/show_leaf?titlelanguagedisplay
-        self.baseUrl = "http://akas.imdb.com/title/tt%s/" % id
+        self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
        super(Imdb, self).__init__(timeout)

        url = self.baseUrl + 'combined' 
@@ -349,113 +356,6 @@ class Imdb(SiteParser):
        if 'sound' in self:
            self['sound'] = list(set(self['sound']))

-        types = {}
-        stop_words = [ 
-            'alternative spelling',
-            'alternative title',
-            'alternative transliteration',
-            'closing credits title',
-            'complete title',
-            'IMAX version',
-            'informal short title',
-            'International (Spanish title)',
-            'Japan (imdb display title)',
-            'longer version',
-            'new title',
-            'original subtitled version',
-            'pre-release title',
-            'promotional abbreviation',
-            'recut version',
-            'reissue title',
-            'restored version',
-            'script title',
-            'short title',
-            '(subtitle)',
-            'TV title',
-            'working title',
-            'World-wide (Spanish title)',
-        ]
-        #ignore english japanese titles
-        #for movies that are not only from japan
-        if ['Japan'] != self.get('country', []):
-            stop_words += [
-                'Japan (English title)'
-            ]
-        for t in self.get('alternativeTitles', []):
-            for type in t[0].split('/'):
-                type = type.strip()
-                stop_word = False
-                for key in stop_words:
-                    if key in type:
-                        stop_word = True
-                        break
-                if not stop_word:
-                    if not type in types:
-                        types[type] = []
-                    types[type].append(t[1])
-        titles = {}
-        for type in types:
-            for title in types[type]:
-                if not title in titles:
-                    titles[title] = []
-                titles[title].append(type)
-        def select_title(type):
-            title = types[type][0]
-            count = 0
-            if len(types[type]) > 1:
-                for t in types[type]:
-                    if len(titles[t]) > count:
-                        count = len(titles[t])
-                        title = t
-            return title
-
-        #FIXME: does work in python2.6, possible to import from __future__?
-        #types = {type: select_title(type) for type in types}
-        _types = {}
-        for type in types:
-            _types[type] = select_title(type)
-        types = _types
-
-        regexps = [
-            "^.+ \(imdb display title\) \(English title\)$",
-            "^USA \(imdb display title\)$",
-            "^International \(English title\)$",
-            "^International \(English title\)$",
-            "^UK \(imdb display title\)$",
-            "^International \(.+\) \(English title\)$",
-            "^World-wide \(English title\)$",
-        ]
-        if 'Hong Kong' in self.get('country', []):
-            regexps += [
-                "Hong Kong \(English title\)"
-            ]
-        english_countries = (
-            'USA', 'UK', 'United States', 'United Kingdom',
-            'Australia', 'New Zealand'
-        )
-        if not list(filter(lambda c: c in english_countries, self.get('country', []))):
-            regexps += [
-                "^[^(]+ \(English title\)$",
-                "^.+ \(.+\) \(English title\)$",
-                "^USA$",
-                "^UK$",
-                "^USA \(.+\)$",
-                "^UK \(.+\)$",
-                "^Australia \(.+\)$",
-                "World-wide \(English title\)",
-                "\(literal English title\)",
-                "^International \(.+ title\)$",
-                "^International \(.+\) \(.+ title\)$",
-            ]
-        for regexp in regexps:
-            for type in types:
-                if re.compile(regexp).findall(type):
-                    #print(types[type], type)
-                    self['internationalTitle'] = types[type]
-                    break
-            if 'internationalTitle' in self:
-                break
-
        def cleanup_title(title):
            if title.startswith('"') and title.endswith('"'):
                title = title[1:-1]
@@ -464,44 +364,40 @@ class Imdb(SiteParser):
            title = re.sub('\(\#[.\d]+\)', '', title)
            return title.strip()

-        for t in ('title', 'internationalTitle'):
+        for t in ('title', 'originalTitle'):
            if t in self:
                self[t] = cleanup_title(self[t])

-        if 'internationalTitle' in self and \
-            self.get('title', '').lower() == self['internationalTitle'].lower():
-            del self['internationalTitle']
-
        if 'alternativeTitles' in self:
            alt = {}
            for t in self['alternativeTitles']:
                title = cleanup_title(t[1])
-                if title not in (self.get('title'), self.get('internationalTitle')):
+                if title.lower() not in (self.get('title', '').lower(), self.get('originalTitle', '').lower()):
                    if title not in alt:
                        alt[title] = []
                    for c in t[0].split('/'):
-                        if not '(working title)' in c:
-                            c = c.replace('International', '').replace('World-wide', '').split('(')[0].strip()
+                        for cleanup in ('International', '(working title)', 'World-wide'):
+                            c = c.replace(cleanup, '')
+                        c = c.split('(')[0].strip()
                        if c:
                            alt[title].append(c)
            self['alternativeTitles'] = []
            for t in sorted(alt, key=lambda a: sorted(alt[a])):
-                countries = sorted([normalize_country_name(c) or c for c in alt[t]])
+                countries = sorted(set([normalize_country_name(c) or c for c in alt[t]]))
                self['alternativeTitles'].append((t, countries))
            if not self['alternativeTitles']:
                del self['alternativeTitles']

-        if 'internationalTitle' in self:
-            self['originalTitle'] = self['title']
-            self['title'] = self.pop('internationalTitle')
-
        if 'runtime' in self and self['runtime']:
-            if 'min' in self['runtime']: base=60
-            else: base=1
+            if 'min' in self['runtime']:
+                base = 60
+            else:
+                base = 1
            self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
        if 'runtime' in self and not self['runtime']:
            del self['runtime']
-        if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
+        if 'votes' in self:
+            self['votes'] = self['votes'].replace(',', '')

        if 'cast' in self:
            if isinstance(self['cast'][0], string_types):