Use six to support Python 2 and 3

This commit is contained in:
j 2014-09-30 21:04:46 +02:00
commit d4d09b56b6
28 changed files with 1730 additions and 1678 deletions

View file

@ -2,8 +2,8 @@
# encoding: utf-8
__version__ = '1.0.0'
import imdb
import wikipedia
import google
import piratecinema
import oxdb
from . import imdb
from . import wikipedia
from . import google
from . import piratecinema
from . import oxdb

View file

@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import time
from ox import strip_tags, find_re
from ox.cache import read_url

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from urllib import quote
from six.moves.urllib.parse import quote
from ox import find_re, strip_tags, decode_html
from ox.cache import read_url

View file

@ -1,14 +1,11 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
from urllib import urlencode
import json
import os
import re
from ox import find_re, strip_tags, decode_html
from ox import find_re, strip_tags
from ox.cache import read_url
from ox.net import open_url
def get_data(id, language='en'):
if language == 'de':
@ -57,7 +54,7 @@ def backup(filename):
data = json.load(f)
else:
data = {}
start = ids and max(map(int, data)) or 1
start = max(map(int, data)) or 1
for i in range(start, 11872):
info = get_data(i)
if info:

View file

@ -5,7 +5,7 @@ import re
import ox.cache
from ox.cache import read_url
from ox.html import strip_tags
from ox.text import find_re, remove_special_characters
from ox.text import find_re
import imdb

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from urllib import unquote
from six.moves.urllib.parse import unquote
from ox.cache import read_url

View file

@ -1,17 +1,17 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import urllib
from six.moves import urllib
import ox
from ox import strip_tags, decode_html
from ox.utils import json
from ox.cache import read_url
def find(query, timeout=ox.cache.cache_timeout):
if isinstance(query, unicode):
if not isinstance(query, bytes):
query = query.encode('utf-8')
params = urllib.urlencode({'q': query})
params = urllib.parse.urlencode({'q': query})
url = 'http://duckduckgo.com/html/?' + params
data = read_url(url, timeout=timeout).decode('utf-8')
results = []

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import urllib
from six.moves import urllib
import ox
from ox import strip_tags, decode_html
@ -13,9 +13,9 @@ def read_url(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIM
return ox.cache.read_url(url, data, headers, timeout, unicode=True)
def quote_plus(s):
if not isinstance(s, str):
if not isinstance(s, bytes):
s = s.encode('utf-8')
return urllib.quote_plus(s)
return urllib.parse.quote_plus(s)
def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
"""

View file

@ -1,23 +1,27 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import urllib
from __future__ import print_function
import re
import time
import unicodedata
import ox
from ox import find_re, strip_tags
import ox.cache
from six.moves import urllib
from six import string_types
from siteparser import SiteParser
import duckduckgo
from .. import find_re, strip_tags, decode_html
from .. import cache
from . siteparser import SiteParser
from . import duckduckgo
from ..utils import datetime
from ..geo import normalize_country_name
def read_url(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None, unicode=False):
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
headers = headers.copy()
return ox.cache.read_url(url, data, headers, timeout, unicode=unicode)
return cache.read_url(url, data, headers, timeout, unicode=unicode)
def get_url(id):
return "http://www.imdb.com/title/tt%s/" % id
@ -49,7 +53,7 @@ class Imdb(SiteParser):
'page': 'business',
're': [
'<h5>Budget</h5>\s*?\$(.*?)<br',
lambda data: find_re(ox.decode_html(data).replace(',', ''), '\d+')
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
],
'type': 'int'
},
@ -211,7 +215,7 @@ class Imdb(SiteParser):
'page': 'releaseinfo',
're': [
'<td class="release_date">(.*?)</td>',
ox.strip_tags,
strip_tags,
],
'type': 'list'
},
@ -326,7 +330,7 @@ class Imdb(SiteParser):
if 'alternativeTitles' in self:
if len(self['alternativeTitles']) == 2 and \
isinstance(self['alternativeTitles'][0], basestring):
isinstance(self['alternativeTitles'][0], string_types):
self['alternativeTitles'] = [self['alternativeTitles']]
#normalize country names
@ -472,7 +476,7 @@ class Imdb(SiteParser):
if c:
alt[title].append(c)
self['alternativeTitles'] = []
for t in sorted(alt, lambda a, b: cmp(sorted(alt[a]), sorted(alt[b]))):
for t in sorted(alt, key=lambda a: sorted(alt[a])):
if alt[t]:
countries = sorted([normalize_country_name(c) or c for c in alt[t]])
self['alternativeTitles'].append((t, countries))
@ -492,7 +496,7 @@ class Imdb(SiteParser):
if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
if 'cast' in self:
if isinstance(self['cast'][0], basestring):
if isinstance(self['cast'][0], string_types):
self['cast'] = [self['cast']]
self['actor'] = [c[0] for c in self['cast']]
def cleanup_character(c):
@ -503,10 +507,12 @@ class Imdb(SiteParser):
if 'connections' in self:
cc={}
if len(self['connections']) == 3 and isinstance(self['connections'][0], basestring):
if len(self['connections']) == 3 and isinstance(self['connections'][0], string_types):
self['connections'] = [self['connections']]
for rel, data, _ in self['connections']:
#cc[unicode(rel)] = re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>').findall(data)
if isinstance(rel, bytes):
rel = rel.decode('utf-8')
#cc[rel] = re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>').findall(data)
def get_conn(c):
r = {
'id': c[0],
@ -516,14 +522,14 @@ class Imdb(SiteParser):
if len(description) == 2 and description[-1].strip() != '-':
r['description'] = description[-1].strip()
return r
cc[unicode(rel)] = map(get_conn, re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data))
cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
self['connections'] = cc
for key in ('country', 'genre'):
if key in self:
self[key] = filter(lambda x: x.lower() != 'home', self[key])
self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
#0092999
if '_director' in self:
if 'series' in self or 'isSeries' in self:
@ -590,8 +596,8 @@ class Imdb(SiteParser):
if key in self:
if isinstance(self[key][0], list):
self[key] = [i[0] for i in self[key] if i]
self[key] = sorted(list(set(self[key])),
lambda a, b: self[key].index(a) - self[key].index(b))
self[key] = sorted(list(set(self[key])), key=lambda a: self[key].index(a))
if 'budget' in self and 'gross' in self:
self['profit'] = self['gross'] - self['budget']
@ -655,7 +661,7 @@ def get_movie_by_title(title, timeout=-1):
u'0866567'
'''
params = {'s':'tt','q': title}
if isinstance(title, unicode):
if not isinstance(title, bytes):
try:
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
except:
@ -731,7 +737,7 @@ def get_movie_id(title, director='', year='', timeout=-1):
if year:
params['q'] = u'"%s (%s)" %s' % (title, year, director)
google_query = "site:imdb.com %s" % params['q']
if isinstance(params['q'], unicode):
if not isinstance(params['q'], bytes):
try:
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
except:
@ -775,7 +781,7 @@ def get_movie_poster(imdbId):
info = ImdbCombined(imdbId)
if 'posterId' in info:
url = "http://www.imdb.com/media/rm%s/tt%s" % (info['posterId'], imdbId)
data = read_url(url)
data = read_url(url).decode('utf-8', 'ignore')
poster = find_re(data, 'img.*?id="primary-img".*?src="(.*?)"')
return poster
elif 'series' in info:
@ -787,11 +793,11 @@ def get_episodes(imdbId, season=None):
url = 'http://www.imdb.com/title/tt%s/episodes' % imdbId
if season:
url += '?season=%d' % season
data = ox.cache.read_url(url)
data = cache.read_url(url)
for e in re.compile('<div data-const="tt(\d{7})".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
episodes['S%02dE%02d' %(int(e[1]), int(e[2]))] = e[0]
else:
data = ox.cache.read_url(url)
data = cache.read_url(url)
match = re.compile('<strong>Season (\d+)</strong>').findall(data)
if match:
for season in range(1, int(match[0]) + 1):
@ -800,7 +806,7 @@ def get_episodes(imdbId, season=None):
def max_votes():
url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
data = ox.cache.read_url(url)
data = cache.read_url(url)
votes = max([int(v.replace(',', ''))
for v in re.compile('<td class="sort_col">([\d,]+)</td>').findall(data)])
return votes
@ -810,6 +816,6 @@ def guess(title, director='', timeout=-1):
if __name__ == "__main__":
import json
print json.dumps(Imdb('0306414'), indent=2)
print(json.dumps(Imdb('0306414'), indent=2))
#print json.dumps(Imdb('0133093'), indent=2)

View file

@ -1,5 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function
import re
from ox.net import read_url
@ -13,5 +15,5 @@ def get_poster_url(id):
return ''
if __name__ == '__main__':
print get_poster_url('0749451')
print(get_poster_url('0749451'))

View file

@ -2,22 +2,24 @@
# vi:si:et:sw=4:sts=4:ts=4
import re
from six import string_types
from ..cache import read_url
from .. import strip_tags, decode_html
from .. import decode_html
from ..utils import datetime
def cleanup(key, data, data_type):
if data:
if isinstance(data[0], basestring):
if isinstance(data[0], string_types):
#FIXME: some types need strip_tags
#data = [strip_tags(decode_html(p)).strip() for p in data]
data = [decode_html(p).strip() for p in data]
elif isinstance(data[0], list) or isinstance(data[0], tuple):
data = [cleanup(key, p, data_type) for p in data]
while len(data) == 1 and not isinstance(data, basestring):
while len(data) == 1 and not isinstance(data, string_types):
data = data[0]
if data_type == 'list' and isinstance(data, basestring):
if data_type == 'list' and isinstance(data, string_types):
data = [data, ]
elif data_type != 'list':
data = ''
@ -40,7 +42,7 @@ class SiteParser(dict):
for key in self.regex:
url = self.get_url(self.regex[key]['page'])
data = self.read_url(url, timeout)
if isinstance(self.regex[key]['re'], basestring):
if isinstance(self.regex[key]['re'], string_types):
data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data)
data = cleanup(key, data, self.regex[key]['type'])
elif callable(self.regex[key]['re']):
@ -51,7 +53,7 @@ class SiteParser(dict):
f = r
else:
f = re.compile(r, re.DOTALL).findall
if isinstance(data, basestring):
if isinstance(data, string_types):
data = f(data)
else:
data = [f(d) for d in data]

View file

@ -1,11 +1,14 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function
import re
from urllib import urlencode
from six.moves import urllib
from ox.utils import json
from ox.cache import read_url
from ox import find_re, decode_html
from ox import find_re
def get_id(url):
@ -138,11 +141,11 @@ def get_allmovie_id(wikipedia_url):
def find(query, max_results=10):
query = {'action': 'query', 'list':'search', 'format': 'json',
'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
url = "http://en.wikipedia.org/w/api.php?" + urlencode(query)
url = "http://en.wikipedia.org/w/api.php?" + urllib.parse.urlencode(query)
data = read_url(url)
if not data:
data = read_url(url, timeout=0)
result = json.loads(data)
result = json.loads(data.decode('utf-8'))
results = []
if result and 'query' in result:
for r in result['query']['search']: