use six to support python 2 and 3
This commit is contained in:
parent
1b1dcf1c58
commit
d4d09b56b6
28 changed files with 1730 additions and 1678 deletions
|
|
@@ -1,23 +1,27 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import urllib
|
||||
from __future__ import print_function
|
||||
|
||||
import re
|
||||
import time
|
||||
import unicodedata
|
||||
|
||||
import ox
|
||||
from ox import find_re, strip_tags
|
||||
import ox.cache
|
||||
from six.moves import urllib
|
||||
from six import string_types
|
||||
|
||||
from siteparser import SiteParser
|
||||
import duckduckgo
|
||||
|
||||
from .. import find_re, strip_tags, decode_html
|
||||
from .. import cache
|
||||
|
||||
|
||||
from . siteparser import SiteParser
|
||||
from . import duckduckgo
|
||||
from ..utils import datetime
|
||||
from ..geo import normalize_country_name
|
||||
|
||||
def read_url(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None, unicode=False):
|
||||
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
|
||||
headers = headers.copy()
|
||||
return ox.cache.read_url(url, data, headers, timeout, unicode=unicode)
|
||||
return cache.read_url(url, data, headers, timeout, unicode=unicode)
|
||||
|
||||
def get_url(id):
|
||||
return "http://www.imdb.com/title/tt%s/" % id
|
||||
|
|
@@ -49,7 +53,7 @@ class Imdb(SiteParser):
|
|||
'page': 'business',
|
||||
're': [
|
||||
'<h5>Budget</h5>\s*?\$(.*?)<br',
|
||||
lambda data: find_re(ox.decode_html(data).replace(',', ''), '\d+')
|
||||
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
|
||||
],
|
||||
'type': 'int'
|
||||
},
|
||||
|
|
@@ -211,7 +215,7 @@ class Imdb(SiteParser):
|
|||
'page': 'releaseinfo',
|
||||
're': [
|
||||
'<td class="release_date">(.*?)</td>',
|
||||
ox.strip_tags,
|
||||
strip_tags,
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
|
|
@@ -326,7 +330,7 @@ class Imdb(SiteParser):
|
|||
|
||||
if 'alternativeTitles' in self:
|
||||
if len(self['alternativeTitles']) == 2 and \
|
||||
isinstance(self['alternativeTitles'][0], basestring):
|
||||
isinstance(self['alternativeTitles'][0], string_types):
|
||||
self['alternativeTitles'] = [self['alternativeTitles']]
|
||||
|
||||
#normalize country names
|
||||
|
|
@@ -472,7 +476,7 @@ class Imdb(SiteParser):
|
|||
if c:
|
||||
alt[title].append(c)
|
||||
self['alternativeTitles'] = []
|
||||
for t in sorted(alt, lambda a, b: cmp(sorted(alt[a]), sorted(alt[b]))):
|
||||
for t in sorted(alt, key=lambda a: sorted(alt[a])):
|
||||
if alt[t]:
|
||||
countries = sorted([normalize_country_name(c) or c for c in alt[t]])
|
||||
self['alternativeTitles'].append((t, countries))
|
||||
|
|
@@ -492,7 +496,7 @@ class Imdb(SiteParser):
|
|||
if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
|
||||
|
||||
if 'cast' in self:
|
||||
if isinstance(self['cast'][0], basestring):
|
||||
if isinstance(self['cast'][0], string_types):
|
||||
self['cast'] = [self['cast']]
|
||||
self['actor'] = [c[0] for c in self['cast']]
|
||||
def cleanup_character(c):
|
||||
|
|
@@ -503,10 +507,12 @@ class Imdb(SiteParser):
|
|||
|
||||
if 'connections' in self:
|
||||
cc={}
|
||||
if len(self['connections']) == 3 and isinstance(self['connections'][0], basestring):
|
||||
if len(self['connections']) == 3 and isinstance(self['connections'][0], string_types):
|
||||
self['connections'] = [self['connections']]
|
||||
for rel, data, _ in self['connections']:
|
||||
#cc[unicode(rel)] = re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>').findall(data)
|
||||
if isinstance(rel, bytes):
|
||||
rel = rel.decode('utf-8')
|
||||
#cc[rel] = re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>').findall(data)
|
||||
def get_conn(c):
|
||||
r = {
|
||||
'id': c[0],
|
||||
|
|
@@ -516,14 +522,14 @@ class Imdb(SiteParser):
|
|||
if len(description) == 2 and description[-1].strip() != '-':
|
||||
r['description'] = description[-1].strip()
|
||||
return r
|
||||
cc[unicode(rel)] = map(get_conn, re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data))
|
||||
cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
|
||||
|
||||
|
||||
self['connections'] = cc
|
||||
|
||||
for key in ('country', 'genre'):
|
||||
if key in self:
|
||||
self[key] = filter(lambda x: x.lower() != 'home', self[key])
|
||||
self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
|
||||
#0092999
|
||||
if '_director' in self:
|
||||
if 'series' in self or 'isSeries' in self:
|
||||
|
|
@@ -590,8 +596,8 @@ class Imdb(SiteParser):
|
|||
if key in self:
|
||||
if isinstance(self[key][0], list):
|
||||
self[key] = [i[0] for i in self[key] if i]
|
||||
self[key] = sorted(list(set(self[key])),
|
||||
lambda a, b: self[key].index(a) - self[key].index(b))
|
||||
self[key] = sorted(list(set(self[key])), key=lambda a: self[key].index(a))
|
||||
|
||||
|
||||
if 'budget' in self and 'gross' in self:
|
||||
self['profit'] = self['gross'] - self['budget']
|
||||
|
|
@@ -655,7 +661,7 @@ def get_movie_by_title(title, timeout=-1):
|
|||
u'0866567'
|
||||
'''
|
||||
params = {'s':'tt','q': title}
|
||||
if isinstance(title, unicode):
|
||||
if not isinstance(title, bytes):
|
||||
try:
|
||||
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
|
||||
except:
|
||||
|
|
@@ -731,7 +737,7 @@ def get_movie_id(title, director='', year='', timeout=-1):
|
|||
if year:
|
||||
params['q'] = u'"%s (%s)" %s' % (title, year, director)
|
||||
google_query = "site:imdb.com %s" % params['q']
|
||||
if isinstance(params['q'], unicode):
|
||||
if not isinstance(params['q'], bytes):
|
||||
try:
|
||||
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
|
||||
except:
|
||||
|
|
@@ -775,7 +781,7 @@ def get_movie_poster(imdbId):
|
|||
info = ImdbCombined(imdbId)
|
||||
if 'posterId' in info:
|
||||
url = "http://www.imdb.com/media/rm%s/tt%s" % (info['posterId'], imdbId)
|
||||
data = read_url(url)
|
||||
data = read_url(url).decode('utf-8', 'ignore')
|
||||
poster = find_re(data, 'img.*?id="primary-img".*?src="(.*?)"')
|
||||
return poster
|
||||
elif 'series' in info:
|
||||
|
|
@@ -787,11 +793,11 @@ def get_episodes(imdbId, season=None):
|
|||
url = 'http://www.imdb.com/title/tt%s/episodes' % imdbId
|
||||
if season:
|
||||
url += '?season=%d' % season
|
||||
data = ox.cache.read_url(url)
|
||||
data = cache.read_url(url)
|
||||
for e in re.compile('<div data-const="tt(\d{7})".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
|
||||
episodes['S%02dE%02d' %(int(e[1]), int(e[2]))] = e[0]
|
||||
else:
|
||||
data = ox.cache.read_url(url)
|
||||
data = cache.read_url(url)
|
||||
match = re.compile('<strong>Season (\d+)</strong>').findall(data)
|
||||
if match:
|
||||
for season in range(1, int(match[0]) + 1):
|
||||
|
|
@@ -800,7 +806,7 @@ def get_episodes(imdbId, season=None):
|
|||
|
||||
def max_votes():
|
||||
url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
|
||||
data = ox.cache.read_url(url)
|
||||
data = cache.read_url(url)
|
||||
votes = max([int(v.replace(',', ''))
|
||||
for v in re.compile('<td class="sort_col">([\d,]+)</td>').findall(data)])
|
||||
return votes
|
||||
|
|
@@ -810,6 +816,6 @@ def guess(title, director='', timeout=-1):
|
|||
|
||||
if __name__ == "__main__":
|
||||
import json
|
||||
print json.dumps(Imdb('0306414'), indent=2)
|
||||
print(json.dumps(Imdb('0306414'), indent=2))
|
||||
#print json.dumps(Imdb('0133093'), indent=2)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue