Use six to support Python 2 and 3

This commit is contained in:
j 2014-09-30 21:04:46 +02:00
commit d4d09b56b6
28 changed files with 1730 additions and 1678 deletions

View file

@ -1,23 +1,27 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import urllib
from __future__ import print_function
import re
import time
import unicodedata
import ox
from ox import find_re, strip_tags
import ox.cache
from six.moves import urllib
from six import string_types
from siteparser import SiteParser
import duckduckgo
from .. import find_re, strip_tags, decode_html
from .. import cache
from . siteparser import SiteParser
from . import duckduckgo
from ..utils import datetime
from ..geo import normalize_country_name
# read_url: module-level wrapper that delegates the fetch to the package cache's read_url.
# The paired `def` lines and paired `return` lines below look like the before/after
# versions from this diff (`ox.cache.…` first, then the relative `cache.…` import) —
# TODO confirm against the committed file.
# `headers` is copied before being passed on, so the callee receives a copy rather
# than the default headers object itself.
# NOTE(review): `valid` is accepted in the signature but is not forwarded to
# cache.read_url — confirm whether that is intentional.
def read_url(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None, unicode=False):
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
headers = headers.copy()
return ox.cache.read_url(url, data, headers, timeout, unicode=unicode)
return cache.read_url(url, data, headers, timeout, unicode=unicode)
def get_url(id):
    """Return the imdb.com title page URL for the given IMDb id.

    The id is interpolated after the "tt" prefix, e.g. "0133093" gives
    "http://www.imdb.com/title/tt0133093/".
    """
    # %-formatting is kept on purpose so str/unicode handling stays the same
    # on both Python 2 and Python 3.
    title_url = "http://www.imdb.com/title/tt%s/" % id
    return title_url
@ -49,7 +53,7 @@ class Imdb(SiteParser):
'page': 'business',
're': [
'<h5>Budget</h5>\s*?\$(.*?)<br',
lambda data: find_re(ox.decode_html(data).replace(',', ''), '\d+')
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
],
'type': 'int'
},
@ -211,7 +215,7 @@ class Imdb(SiteParser):
'page': 'releaseinfo',
're': [
'<td class="release_date">(.*?)</td>',
ox.strip_tags,
strip_tags,
],
'type': 'list'
},
@ -326,7 +330,7 @@ class Imdb(SiteParser):
if 'alternativeTitles' in self:
if len(self['alternativeTitles']) == 2 and \
isinstance(self['alternativeTitles'][0], basestring):
isinstance(self['alternativeTitles'][0], string_types):
self['alternativeTitles'] = [self['alternativeTitles']]
#normalize country names
@ -472,7 +476,7 @@ class Imdb(SiteParser):
if c:
alt[title].append(c)
self['alternativeTitles'] = []
for t in sorted(alt, lambda a, b: cmp(sorted(alt[a]), sorted(alt[b]))):
for t in sorted(alt, key=lambda a: sorted(alt[a])):
if alt[t]:
countries = sorted([normalize_country_name(c) or c for c in alt[t]])
self['alternativeTitles'].append((t, countries))
@ -492,7 +496,7 @@ class Imdb(SiteParser):
if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
if 'cast' in self:
if isinstance(self['cast'][0], basestring):
if isinstance(self['cast'][0], string_types):
self['cast'] = [self['cast']]
self['actor'] = [c[0] for c in self['cast']]
def cleanup_character(c):
@ -503,10 +507,12 @@ class Imdb(SiteParser):
if 'connections' in self:
cc={}
if len(self['connections']) == 3 and isinstance(self['connections'][0], basestring):
if len(self['connections']) == 3 and isinstance(self['connections'][0], string_types):
self['connections'] = [self['connections']]
for rel, data, _ in self['connections']:
#cc[unicode(rel)] = re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>').findall(data)
if isinstance(rel, bytes):
rel = rel.decode('utf-8')
#cc[rel] = re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>').findall(data)
def get_conn(c):
r = {
'id': c[0],
@ -516,14 +522,14 @@ class Imdb(SiteParser):
if len(description) == 2 and description[-1].strip() != '-':
r['description'] = description[-1].strip()
return r
cc[unicode(rel)] = map(get_conn, re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data))
cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
self['connections'] = cc
for key in ('country', 'genre'):
if key in self:
self[key] = filter(lambda x: x.lower() != 'home', self[key])
self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
#0092999
if '_director' in self:
if 'series' in self or 'isSeries' in self:
@ -590,8 +596,8 @@ class Imdb(SiteParser):
if key in self:
if isinstance(self[key][0], list):
self[key] = [i[0] for i in self[key] if i]
self[key] = sorted(list(set(self[key])),
lambda a, b: self[key].index(a) - self[key].index(b))
self[key] = sorted(list(set(self[key])), key=lambda a: self[key].index(a))
if 'budget' in self and 'gross' in self:
self['profit'] = self['gross'] - self['budget']
@ -655,7 +661,7 @@ def get_movie_by_title(title, timeout=-1):
u'0866567'
'''
params = {'s':'tt','q': title}
if isinstance(title, unicode):
if not isinstance(title, bytes):
try:
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
except:
@ -731,7 +737,7 @@ def get_movie_id(title, director='', year='', timeout=-1):
if year:
params['q'] = u'"%s (%s)" %s' % (title, year, director)
google_query = "site:imdb.com %s" % params['q']
if isinstance(params['q'], unicode):
if not isinstance(params['q'], bytes):
try:
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
except:
@ -775,7 +781,7 @@ def get_movie_poster(imdbId):
info = ImdbCombined(imdbId)
if 'posterId' in info:
url = "http://www.imdb.com/media/rm%s/tt%s" % (info['posterId'], imdbId)
data = read_url(url)
data = read_url(url).decode('utf-8', 'ignore')
poster = find_re(data, 'img.*?id="primary-img".*?src="(.*?)"')
return poster
elif 'series' in info:
@ -787,11 +793,11 @@ def get_episodes(imdbId, season=None):
url = 'http://www.imdb.com/title/tt%s/episodes' % imdbId
if season:
url += '?season=%d' % season
data = ox.cache.read_url(url)
data = cache.read_url(url)
for e in re.compile('<div data-const="tt(\d{7})".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
episodes['S%02dE%02d' %(int(e[1]), int(e[2]))] = e[0]
else:
data = ox.cache.read_url(url)
data = cache.read_url(url)
match = re.compile('<strong>Season (\d+)</strong>').findall(data)
if match:
for season in range(1, int(match[0]) + 1):
@ -800,7 +806,7 @@ def get_episodes(imdbId, season=None):
def max_votes():
url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
data = ox.cache.read_url(url)
data = cache.read_url(url)
votes = max([int(v.replace(',', ''))
for v in re.compile('<td class="sort_col">([\d,]+)</td>').findall(data)])
return votes
@ -810,6 +816,6 @@ def guess(title, director='', timeout=-1):
# Script entry point: when run directly, dump Imdb('0306414') as indented JSON.
# The statement-form `print` and the call-form `print(...)` below appear to be the
# before/after pair from this diff — TODO confirm against the committed file.
if __name__ == "__main__":
import json
print json.dumps(Imdb('0306414'), indent=2)
print(json.dumps(Imdb('0306414'), indent=2))
#print json.dumps(Imdb('0133093'), indent=2)