Use six to support Python 2 and 3

This commit is contained in:
j 2014-09-30 21:04:46 +02:00
commit d4d09b56b6
28 changed files with 1730 additions and 1678 deletions

View file

@ -2,8 +2,8 @@
# encoding: utf-8
__version__ = '1.0.0'
import imdb
import wikipedia
import google
import piratecinema
import oxdb
from . import imdb
from . import wikipedia
from . import google
from . import piratecinema
from . import oxdb

View file

@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import time
from ox import strip_tags, find_re
from ox.cache import read_url

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from urllib import quote
from six.moves.urllib.parse import quote
from ox import find_re, strip_tags, decode_html
from ox.cache import read_url

View file

@ -1,14 +1,11 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
from urllib import urlencode
import json
import os
import re
from ox import find_re, strip_tags, decode_html
from ox import find_re, strip_tags
from ox.cache import read_url
from ox.net import open_url
def get_data(id, language='en'):
if language == 'de':
@ -57,7 +54,7 @@ def backup(filename):
data = json.load(f)
else:
data = {}
start = ids and max(map(int, data)) or 1
start = max(map(int, data)) or 1
for i in range(start, 11872):
info = get_data(i)
if info:

View file

@ -5,7 +5,7 @@ import re
import ox.cache
from ox.cache import read_url
from ox.html import strip_tags
from ox.text import find_re, remove_special_characters
from ox.text import find_re
import imdb

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from urllib import unquote
from six.moves.urllib.parse import unquote
from ox.cache import read_url

View file

@ -1,17 +1,17 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import urllib
from six.moves import urllib
import ox
from ox import strip_tags, decode_html
from ox.utils import json
from ox.cache import read_url
def find(query, timeout=ox.cache.cache_timeout):
if isinstance(query, unicode):
if not isinstance(query, bytes):
query = query.encode('utf-8')
params = urllib.urlencode({'q': query})
params = urllib.parse.urlencode({'q': query})
url = 'http://duckduckgo.com/html/?' + params
data = read_url(url, timeout=timeout).decode('utf-8')
results = []

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import urllib
from six.moves import urllib
import ox
from ox import strip_tags, decode_html
@ -13,9 +13,9 @@ def read_url(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIM
return ox.cache.read_url(url, data, headers, timeout, unicode=True)
def quote_plus(s):
if not isinstance(s, str):
if not isinstance(s, bytes):
s = s.encode('utf-8')
return urllib.quote_plus(s)
return urllib.parse.quote_plus(s)
def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
"""

View file

@ -1,23 +1,27 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import urllib
from __future__ import print_function
import re
import time
import unicodedata
import ox
from ox import find_re, strip_tags
import ox.cache
from six.moves import urllib
from six import string_types
from siteparser import SiteParser
import duckduckgo
from .. import find_re, strip_tags, decode_html
from .. import cache
from . siteparser import SiteParser
from . import duckduckgo
from ..utils import datetime
from ..geo import normalize_country_name
def read_url(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None, unicode=False):
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
headers = headers.copy()
return ox.cache.read_url(url, data, headers, timeout, unicode=unicode)
return cache.read_url(url, data, headers, timeout, unicode=unicode)
def get_url(id):
return "http://www.imdb.com/title/tt%s/" % id
@ -49,7 +53,7 @@ class Imdb(SiteParser):
'page': 'business',
're': [
'<h5>Budget</h5>\s*?\$(.*?)<br',
lambda data: find_re(ox.decode_html(data).replace(',', ''), '\d+')
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
],
'type': 'int'
},
@ -211,7 +215,7 @@ class Imdb(SiteParser):
'page': 'releaseinfo',
're': [
'<td class="release_date">(.*?)</td>',
ox.strip_tags,
strip_tags,
],
'type': 'list'
},
@ -326,7 +330,7 @@ class Imdb(SiteParser):
if 'alternativeTitles' in self:
if len(self['alternativeTitles']) == 2 and \
isinstance(self['alternativeTitles'][0], basestring):
isinstance(self['alternativeTitles'][0], string_types):
self['alternativeTitles'] = [self['alternativeTitles']]
#normalize country names
@ -472,7 +476,7 @@ class Imdb(SiteParser):
if c:
alt[title].append(c)
self['alternativeTitles'] = []
for t in sorted(alt, lambda a, b: cmp(sorted(alt[a]), sorted(alt[b]))):
for t in sorted(alt, key=lambda a: sorted(alt[a])):
if alt[t]:
countries = sorted([normalize_country_name(c) or c for c in alt[t]])
self['alternativeTitles'].append((t, countries))
@ -492,7 +496,7 @@ class Imdb(SiteParser):
if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
if 'cast' in self:
if isinstance(self['cast'][0], basestring):
if isinstance(self['cast'][0], string_types):
self['cast'] = [self['cast']]
self['actor'] = [c[0] for c in self['cast']]
def cleanup_character(c):
@ -503,10 +507,12 @@ class Imdb(SiteParser):
if 'connections' in self:
cc={}
if len(self['connections']) == 3 and isinstance(self['connections'][0], basestring):
if len(self['connections']) == 3 and isinstance(self['connections'][0], string_types):
self['connections'] = [self['connections']]
for rel, data, _ in self['connections']:
#cc[unicode(rel)] = re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>').findall(data)
if isinstance(rel, bytes):
rel = rel.decode('utf-8')
#cc[rel] = re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>').findall(data)
def get_conn(c):
r = {
'id': c[0],
@ -516,14 +522,14 @@ class Imdb(SiteParser):
if len(description) == 2 and description[-1].strip() != '-':
r['description'] = description[-1].strip()
return r
cc[unicode(rel)] = map(get_conn, re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data))
cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
self['connections'] = cc
for key in ('country', 'genre'):
if key in self:
self[key] = filter(lambda x: x.lower() != 'home', self[key])
self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
#0092999
if '_director' in self:
if 'series' in self or 'isSeries' in self:
@ -590,8 +596,8 @@ class Imdb(SiteParser):
if key in self:
if isinstance(self[key][0], list):
self[key] = [i[0] for i in self[key] if i]
self[key] = sorted(list(set(self[key])),
lambda a, b: self[key].index(a) - self[key].index(b))
self[key] = sorted(list(set(self[key])), key=lambda a: self[key].index(a))
if 'budget' in self and 'gross' in self:
self['profit'] = self['gross'] - self['budget']
@ -655,7 +661,7 @@ def get_movie_by_title(title, timeout=-1):
u'0866567'
'''
params = {'s':'tt','q': title}
if isinstance(title, unicode):
if not isinstance(title, bytes):
try:
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
except:
@ -731,7 +737,7 @@ def get_movie_id(title, director='', year='', timeout=-1):
if year:
params['q'] = u'"%s (%s)" %s' % (title, year, director)
google_query = "site:imdb.com %s" % params['q']
if isinstance(params['q'], unicode):
if not isinstance(params['q'], bytes):
try:
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
except:
@ -775,7 +781,7 @@ def get_movie_poster(imdbId):
info = ImdbCombined(imdbId)
if 'posterId' in info:
url = "http://www.imdb.com/media/rm%s/tt%s" % (info['posterId'], imdbId)
data = read_url(url)
data = read_url(url).decode('utf-8', 'ignore')
poster = find_re(data, 'img.*?id="primary-img".*?src="(.*?)"')
return poster
elif 'series' in info:
@ -787,11 +793,11 @@ def get_episodes(imdbId, season=None):
url = 'http://www.imdb.com/title/tt%s/episodes' % imdbId
if season:
url += '?season=%d' % season
data = ox.cache.read_url(url)
data = cache.read_url(url)
for e in re.compile('<div data-const="tt(\d{7})".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
episodes['S%02dE%02d' %(int(e[1]), int(e[2]))] = e[0]
else:
data = ox.cache.read_url(url)
data = cache.read_url(url)
match = re.compile('<strong>Season (\d+)</strong>').findall(data)
if match:
for season in range(1, int(match[0]) + 1):
@ -800,7 +806,7 @@ def get_episodes(imdbId, season=None):
def max_votes():
url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
data = ox.cache.read_url(url)
data = cache.read_url(url)
votes = max([int(v.replace(',', ''))
for v in re.compile('<td class="sort_col">([\d,]+)</td>').findall(data)])
return votes
@ -810,6 +816,6 @@ def guess(title, director='', timeout=-1):
if __name__ == "__main__":
import json
print json.dumps(Imdb('0306414'), indent=2)
print(json.dumps(Imdb('0306414'), indent=2))
#print json.dumps(Imdb('0133093'), indent=2)

View file

@ -1,5 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function
import re
from ox.net import read_url
@ -13,5 +15,5 @@ def get_poster_url(id):
return ''
if __name__ == '__main__':
print get_poster_url('0749451')
print(get_poster_url('0749451'))

View file

@ -2,22 +2,24 @@
# vi:si:et:sw=4:sts=4:ts=4
import re
from six import string_types
from ..cache import read_url
from .. import strip_tags, decode_html
from .. import decode_html
from ..utils import datetime
def cleanup(key, data, data_type):
if data:
if isinstance(data[0], basestring):
if isinstance(data[0], string_types):
#FIXME: some types need strip_tags
#data = [strip_tags(decode_html(p)).strip() for p in data]
data = [decode_html(p).strip() for p in data]
elif isinstance(data[0], list) or isinstance(data[0], tuple):
data = [cleanup(key, p, data_type) for p in data]
while len(data) == 1 and not isinstance(data, basestring):
while len(data) == 1 and not isinstance(data, string_types):
data = data[0]
if data_type == 'list' and isinstance(data, basestring):
if data_type == 'list' and isinstance(data, string_types):
data = [data, ]
elif data_type != 'list':
data = ''
@ -40,7 +42,7 @@ class SiteParser(dict):
for key in self.regex:
url = self.get_url(self.regex[key]['page'])
data = self.read_url(url, timeout)
if isinstance(self.regex[key]['re'], basestring):
if isinstance(self.regex[key]['re'], string_types):
data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data)
data = cleanup(key, data, self.regex[key]['type'])
elif callable(self.regex[key]['re']):
@ -51,7 +53,7 @@ class SiteParser(dict):
f = r
else:
f = re.compile(r, re.DOTALL).findall
if isinstance(data, basestring):
if isinstance(data, string_types):
data = f(data)
else:
data = [f(d) for d in data]

View file

@ -1,11 +1,14 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function
import re
from urllib import urlencode
from six.moves import urllib
from ox.utils import json
from ox.cache import read_url
from ox import find_re, decode_html
from ox import find_re
def get_id(url):
@ -138,11 +141,11 @@ def get_allmovie_id(wikipedia_url):
def find(query, max_results=10):
query = {'action': 'query', 'list':'search', 'format': 'json',
'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
url = "http://en.wikipedia.org/w/api.php?" + urlencode(query)
url = "http://en.wikipedia.org/w/api.php?" + urllib.parse.urlencode(query)
data = read_url(url)
if not data:
data = read_url(url, timeout=0)
result = json.loads(data)
result = json.loads(data.decode('utf-8'))
results = []
if result and 'query' in result:
for r in result['query']['search']: