use six to support python 2 and 3

2014-09-30 21:04:46 +02:00 · 2014-09-30 21:04:46 +02:00 · d4d09b56b6
commit d4d09b56b6
parent 1b1dcf1c58
28 changed files with 1730 additions and 1678 deletions
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -1,23 +1,27 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
-import urllib
+from __future__ import print_function
+
 import re
 import time
 import unicodedata

-import ox
-from ox import find_re, strip_tags
-import ox.cache
+from six.moves import urllib
+from six import string_types

-from siteparser import SiteParser
-import duckduckgo

+from .. import find_re, strip_tags, decode_html
+from .. import cache
+
+
+from . siteparser import SiteParser
+from . import duckduckgo
 from ..utils import datetime
 from ..geo import normalize_country_name

-def read_url(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None, unicode=False):
+def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
    headers = headers.copy()
-    return ox.cache.read_url(url, data, headers, timeout, unicode=unicode)
+    return cache.read_url(url, data, headers, timeout, unicode=unicode)

 def get_url(id):
    return "http://www.imdb.com/title/tt%s/" % id
@@ -49,7 +53,7 @@ class Imdb(SiteParser):
            'page': 'business',
            're': [
                '<h5>Budget</h5>\s*?\$(.*?)<br',
-                lambda data: find_re(ox.decode_html(data).replace(',', ''), '\d+')
+                lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
            ],
            'type': 'int'
        },
@@ -211,7 +215,7 @@ class Imdb(SiteParser):
            'page': 'releaseinfo',
            're': [
                '<td class="release_date">(.*?)</td>',
-                ox.strip_tags,
+                strip_tags,
            ],
            'type': 'list'
        },
@@ -326,7 +330,7 @@ class Imdb(SiteParser):

        if 'alternativeTitles' in self:
            if len(self['alternativeTitles']) == 2 and \
-               isinstance(self['alternativeTitles'][0], basestring):
+               isinstance(self['alternativeTitles'][0], string_types):
               self['alternativeTitles'] = [self['alternativeTitles']]

        #normalize country names
@@ -472,7 +476,7 @@ class Imdb(SiteParser):
                            if c:
                                alt[title].append(c)
            self['alternativeTitles'] = []
-            for t in sorted(alt, lambda a, b: cmp(sorted(alt[a]), sorted(alt[b]))):
+            for t in sorted(alt, key=lambda a: sorted(alt[a])):
                if alt[t]:
                    countries = sorted([normalize_country_name(c) or c for c in alt[t]])
                    self['alternativeTitles'].append((t, countries))
@@ -492,7 +496,7 @@ class Imdb(SiteParser):
        if 'votes' in self: self['votes'] = self['votes'].replace(',', '')

        if 'cast' in self:
-            if isinstance(self['cast'][0], basestring):
+            if isinstance(self['cast'][0], string_types):
                self['cast'] = [self['cast']]
            self['actor'] = [c[0] for c in self['cast']]
            def cleanup_character(c):
@@ -503,10 +507,12 @@ class Imdb(SiteParser):

        if 'connections' in self:
            cc={}
-            if len(self['connections']) == 3 and isinstance(self['connections'][0], basestring):
+            if len(self['connections']) == 3 and isinstance(self['connections'][0], string_types):
                self['connections'] = [self['connections']]
            for rel, data, _ in self['connections']:
-                #cc[unicode(rel)] = re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>').findall(data)
+                if isinstance(rel, bytes):
+                    rel = rel.decode('utf-8')
+                #cc[rel] = re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>').findall(data)
                def get_conn(c):
                    r = {
                        'id': c[0],
@@ -516,14 +522,14 @@ class Imdb(SiteParser):
                    if len(description) == 2 and description[-1].strip() != '-':
                        r['description'] = description[-1].strip()
                    return r
-                cc[unicode(rel)] = map(get_conn, re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data))
+                cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))


            self['connections'] = cc

        for key in ('country', 'genre'):
            if key in self:
-                self[key] = filter(lambda x: x.lower() != 'home', self[key])
+                self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
        #0092999
        if '_director' in self:
            if 'series' in self or 'isSeries' in self:
@@ -590,8 +596,8 @@ class Imdb(SiteParser):
            if key in self:
                if isinstance(self[key][0], list):
                    self[key] = [i[0] for i in self[key] if i]
-                self[key] = sorted(list(set(self[key])),
-                                   lambda a, b: self[key].index(a) - self[key].index(b))
+                self[key] = sorted(list(set(self[key])), key=lambda a: self[key].index(a))
+

        if 'budget' in self and 'gross' in self:
            self['profit'] = self['gross'] - self['budget']
@@ -655,7 +661,7 @@ def get_movie_by_title(title, timeout=-1):
    u'0866567'
    '''
    params = {'s':'tt','q': title}
-    if isinstance(title, unicode):
+    if not isinstance(title, bytes):
        try:
            params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
        except:
@@ -731,7 +737,7 @@ def get_movie_id(title, director='', year='', timeout=-1):
    if year:
        params['q'] = u'"%s (%s)" %s' % (title, year, director)
    google_query = "site:imdb.com %s" % params['q']
-    if isinstance(params['q'], unicode):
+    if not isinstance(params['q'], bytes):
        try:
            params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
        except:
@@ -775,7 +781,7 @@ def get_movie_poster(imdbId):
    info = ImdbCombined(imdbId)
    if 'posterId' in info:
        url = "http://www.imdb.com/media/rm%s/tt%s" % (info['posterId'], imdbId)
-        data = read_url(url)
+        data = read_url(url).decode('utf-8', 'ignore')
        poster = find_re(data, 'img.*?id="primary-img".*?src="(.*?)"')
        return poster
    elif 'series' in info:
@@ -787,11 +793,11 @@ def get_episodes(imdbId, season=None):
    url = 'http://www.imdb.com/title/tt%s/episodes' % imdbId
    if season:
        url += '?season=%d' % season
-        data = ox.cache.read_url(url)
+        data = cache.read_url(url)
        for e in re.compile('<div data-const="tt(\d{7})".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
            episodes['S%02dE%02d' %(int(e[1]), int(e[2]))] = e[0]
    else:
-        data = ox.cache.read_url(url)
+        data = cache.read_url(url)
        match = re.compile('<strong>Season (\d+)</strong>').findall(data)
        if match:
            for season in range(1, int(match[0]) + 1):
@@ -800,7 +806,7 @@ def get_episodes(imdbId, season=None):

 def max_votes():
    url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
-    data = ox.cache.read_url(url)
+    data = cache.read_url(url)
    votes = max([int(v.replace(',', ''))
        for v in re.compile('<td class="sort_col">([\d,]+)</td>').findall(data)])
    return votes
@@ -810,6 +816,6 @@ def guess(title, director='', timeout=-1):

 if __name__ == "__main__":
    import json
-    print json.dumps(Imdb('0306414'), indent=2)
+    print(json.dumps(Imdb('0306414'), indent=2))
    #print json.dumps(Imdb('0133093'), indent=2)