changes to imdb.py

* use more oxutils functions
* start migrating to a raw dict: first part, parse full cast with names from imdb
* add getMovieId

This commit is contained in:
parent 7a53ee62b9
commit 69adaeee00

3 changed files with 165 additions and 149 deletions
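A quick sketch of how the new module-level API added below is meant to be used (the import path and example title are illustrative; live Google and IMDb pages are assumed):

    from ox import imdb

    # resolve a title to a seven-digit IMDb id via a site-restricted Google query
    imdbId = imdb.getMovieId('Down by Law', director='Jim Jarmusch', year='1986')

    # parse the main title page into the usual dict (wraps IMDb(imdbId).parse())
    data = imdb.getMovieData(imdbId)
    print data['title'], data['year']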
ox/__init__.py

@@ -4,5 +4,7 @@
 __version__ = '0.1.0'
 
-from net import *
+import imdb
+import wikipedia
+import google
 
ox/google.py

@@ -17,7 +17,6 @@ from oxutils import stripTags
 usage:
 import google
 google.find(query)
-<generator object at 0x833aeac>
 
 for result in google.find(query): result
 
ox/imdb.py (309 lines changed)
@@ -12,19 +12,102 @@ import time
 from BeautifulSoup import BeautifulSoup
 import chardet
 import oxutils
-from oxutils import stripTags, htmldecode
+from oxutils import stripTags, htmldecode, findRegexp
 from oxutils.cache import getUrl, getUrlUnicode
-from oxutils.normalize import normalizeTitle
+from oxutils.normalize import normalizeTitle, normalizeImdbId
 
 import google
 
-def _get_data(url):
-    data = None
-    try:
-        data = getUrl(url)
-    except:
-        print "error reading data from", url
-    return data
+def getMovieId(title, director='', year=''):
+    if year:
+        title = "%s (%s)" % (title, year)
+    if director:
+        query = 'site:imdb.com %s "%s"' % (director, title)
+    else:
+        query = 'site:imdb.com "%s"' % title
+    for (name, url, desc) in google.find(query, 3):
+        if url.startswith('http://www.imdb.com/title/tt'):
+            return url[28:35]
+
+def getMovieData(imdbId):
+    return IMDb(imdbId).parse()
+
+# internal functions below
+def getUrlBase(imdbId):
+    return "http://www.imdb.com/title/tt%s" % imdbId
+
+def getRawMovieData(imdbId):
+    imdbId = normalizeImdbId(imdbId)
+    data = dict()
+    data['credits'] = parseCredits(imdbId)
+    page = getUrl(getUrlBase(imdbId))
+    data['poster'] = findRegexp(page, 'name="poster".*?<img .*?src="(.*?)"')
+    return data
+
+def parseBase(imdbId):
+    data = getUrl(getUrlBase(imdbId))
+    soup = BeautifulSoup(data)
+    info = dict()
+    for i in soup('div', {'class': 'info'}):
+        title = i('h5')
+        if title:
+            title = title[0]
+            txt = title.findNext()
+            title = stripTags(unicode(title))
+            if title.endswith(':'):
+                title = title[:-1]
+            info[title] = htmldecode(stripTags(unicode(txt)))
+    return info
+
+def getTitle(imdbId):
+    title = ''
+    data = getUrl(getUrlBase(imdbId))
+    soup = BeautifulSoup(data)
+    html_title = soup('div', {'id': 'tn15title'})
+    if not html_title:
+        html_title = soup('title')
+    if html_title:
+        html_title = str(html_title[0])
+        html_title = html_title.replace('<br />', ' ').replace('  ', ' ')
+        title = htmldecode(html_title)
+        title = stripTags(title)
+        title = re.sub('\(\d\d\d\d\)', '', title)
+        title = re.sub('\(\d\d\d\d/I*\)', '', title)
+        for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
+            title = title.replace(t, '')
+        title = title.strip()
+        if title.find(u'\xa0') > -1:
+            title = title[:title.find(u'\xa0')]
+    if title.startswith('"') and title.endswith('"'):
+        title = title[1:-1]
+    return title
+
+def creditList(data, section=None):
+    if section == 'cast':
+        credits_ = re.compile('''<tr .*?<td class="nm">(.*?)</td><td class="ddd">.*?</td><td class="char">(.*?)</td></tr>''').findall(data)
+    else:
+        credits_ = re.compile('''<tr>.*?<td valign="top">(.*?)</td><td.*?</td><td valign="top">(.*?)</td></tr>''').findall(data)
+    credits = []
+    for c_ in credits_:
+        c = [c_[0].strip(), c_[1].strip()]
+        if section == 'writers':
+            c[1] = c[1].replace('<br>', '').strip().replace(')', '').replace('(', '')
+            if c[1].endswith(' and'): c[1] = c[1][:-4]
+        credits.append(c)
+    return credits
+
+def parseCredits(imdbId):
+    credits = dict()
+    url = "%s/fullcredits" % getUrlBase(imdbId)
+    data = getUrlUnicode(url)
+    groups = data.split('<h5>')
+    for g in groups:
+        section = re.compile('''name="(.*?)".*? href="/Glossary''').findall(g)
+        if section:
+            credits[section[0]] = creditList(g, section[0])
+    return credits
+
+'''the old code below'''
+
 def get_image(url):
     return getUrl(url)
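The new credits helpers return plain lists keyed by the section names found on IMDb's fullcredits page. A sketch of the expected shape (the id and section names are illustrative, assuming the 2008 page layout):

    from ox import imdb
    from oxutils import stripTags

    credits = imdb.parseCredits('0118694')
    # a dict like {'cast': [[name_html, role_html], ...], 'directors': [...], ...}
    for name, character in credits.get('cast', [])[:5]:
        print stripTags(name), '-', stripTags(character)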
@@ -42,62 +125,36 @@ def _castList(data, regexp):
         return names
     return []
 
-def _getTerm(data, regexp):
-    term = ''
-    try:
-        reg = re.compile(regexp, re.IGNORECASE)
-        m = reg.search(data)
-        if m:
-            term = stripTags(m.group(1)).strip()
-    except:
-        print "waring, parsing failed for", regexp
-    return term.encode('utf8')
-
 class IMDb:
-    def __init__(self, imdb):
-        self.imdb = imdb
-        self.pageSource = None
+    def __init__(self, imdbId):
+        self.imdb = imdbId
         self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
 
-        self.businessSource = None
         self.businessUrl = "%sbusiness" % self.pageUrl
-        self.connectionsSource = None
         self.connectionsUrl = "%smovieconnections" % self.pageUrl
-        self.creditsSource = None
         self.creditsUrl = "%sfullcredits" % self.pageUrl
-        self.episodesSource = None
         self.episodesUrl = "%sepisodes" % self.pageUrl
-        self.keywordSource = None
         self.keywordUrl = "%skeywords" % self.pageUrl
-        self.plotSource = None
         self.plotUrl = "%splotsummary" % self.pageUrl
-        self.releaseinfoSource = None
         self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
-        self.triviaSource = None
         self.triviaUrl = "%strivia" % self.pageUrl
-        self.locationSource = None
         self.locationUrl = "%slocations" % self.pageUrl
-        self.externalreviewsSource = None
         self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
-        self.trailerSource = None
         self.trailerUrl = "%strailers" % self.pageUrl
 
-    def getPage(self, forcereload = False):
-        if forcereload or not self.pageSource:
-            self.pageSource = getUrlUnicode(self.pageUrl)
-        return self.pageSource
+    def getPage(self):
+        return getUrlUnicode(self.pageUrl)
 
     def parse_raw_value(self, key, value):
         if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
             value = unicode(value, 'utf-8')
         value = stripTags(value).strip()
         if key == 'runtime':
-            parsed_value = _getTerm(value, '(.*?) min')
-            parsed_value = _getTerm(parsed_value, '([0-9]+)')
+            parsed_value = findRegexp(value, '(.*?) min')
+            parsed_value = findRegexp(parsed_value, '([0-9]+)')
             if not parsed_value:
-                parsed_value = _getTerm(value, '(.*?) sec')
-                parsed_value = _getTerm(parsed_value, '([0-9]+)')
+                parsed_value = findRegexp(value, '(.*?) sec')
+                parsed_value = findRegexp(parsed_value, '([0-9]+)')
                 if not parsed_value:
                     parsed_value = 0
             else:
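The forcereload flags and per-instance *Source attributes are dropped throughout the class because oxutils.cache.getUrl/getUrlUnicode already memoize downloads. A hypothetical stand-in showing the idea (the real oxutils cache is presumably disk-backed):

    import urllib2

    _cache = {}

    def getUrlUnicode(url):
        # fetch once, serve repeats from the cache; with this in place
        # getPage() can shrink to a single return statement
        if url not in _cache:
            _cache[url] = urllib2.urlopen(url).read().decode('utf-8')
        return _cache[url]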
@@ -141,7 +198,7 @@ class IMDb:
                 print value
             parsed_value = value
         return parsed_value
 
     def parseTitle(self):
         title = ''
         data = self.getPage()
@@ -153,8 +210,8 @@ class IMDb:
             html_title = str(html_title[0])
             html_title = html_title.replace('<br />', ' ').replace('  ', ' ')
             title = stripTags(html_title)
-            title = re.sub('\(\d\d\d\d\)', '', title)
-            title = re.sub('\(\d\d\d\d/I*\)', '', title)
+            title = re.sub('\(\d{4}\)', '', title)
+            title = re.sub('\(\d{4}/I*\)', '', title)
             for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
                 title = title.replace(t, '')
             if title.find(u'\xa0') > -1:
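The \d\d\d\d patterns here and in parseYear below are consolidated to the equivalent \d{4}, for example:

    import re

    print re.sub('\(\d{4}\)', '', 'The Matrix (1999)').strip()  # The Matrix
    print re.compile('\((\d{4})\)').findall('Brazil (1985)')    # ['1985']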
@@ -172,7 +229,7 @@ class IMDb:
         else:
             title = normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:]
         return normalizeTitle(title)
 
     def parseYear(self):
         year = ''
         data = self.getPage()
@@ -183,25 +240,25 @@ class IMDb:
         if html_title:
             html_title = str(html_title[0])
             html_title = stripTags(html_title)
-            year = re.compile('\((\d\d\d\d)\)').findall(html_title)
+            year = re.compile('\((\d{4})\)').findall(html_title)
             if not year:
-                year = re.compile('\((\d\d\d\d)/').findall(html_title)
+                year = re.compile('\((\d{4})/').findall(html_title)
             if year:
                 year = year[0]
             else: year = ''
         return year
 
     def parse(self):
         data = self.getPage()
         IMDbDict ={}
         #Poster
-        IMDbDict['poster'] = _getTerm(data, 'name="poster".*?<img .*?src="(.*?)"')
+        IMDbDict['poster'] = findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')
         if not IMDbDict['poster']:
             IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
         #Title, Year
         IMDbDict['year'] = self.parseYear()
         IMDbDict['title'] = self.parseTitle()
 
         #Rating
         m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
         if m:
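_getTerm (removed above) is replaced by oxutils.findRegexp here and in parse_raw_value. A hypothetical equivalent, assuming it returns the first group of the first match or an empty string:

    import re

    def findRegexp(data, regexp):
        # assumed contract of oxutils.findRegexp
        m = re.compile(regexp, re.IGNORECASE | re.DOTALL).search(data)
        if m:
            return m.group(1).strip()
        return ''

    print findRegexp('Runtime: 107 min', '(.*?) min')  # Runtime: 107
    print findRegexp('Runtime: 107', '([0-9]+)')       # 107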
@@ -251,49 +308,32 @@ class IMDb:
         IMDbDict['stills'] = getMovieStills(self.imdb)
         #IMDbDict['trailer'] = self.parseTrailer()
         self.IMDbDict = IMDbDict
 
         if IMDbDict['episode_of']:
             episode_of =IMDb(IMDbDict['episode_of']).parse()
             for key in ('country', 'language'):
                 if not IMDbDict[key]:
                     IMDbDict[key] = episode_of[key]
         return self.IMDbDict
 
-    def getCredits(self, forcereload = False):
-        if forcereload or not self.creditsSource:
-            self.creditsSource = getUrlUnicode(self.creditsUrl)
-        return self.creditsSource
-
     def parseCredits(self):
-        data = self.getCredits()
+        raw_credits = parseCredits(self.imdb)
         credits = {}
-        credits['director'] = _castList(data, 'Directed by.*?(<tr>.*?)</table>')
-        credits['writer'] = _castList(data, 'Writing credits.*?(<tr>.*?)</table>')
-        credits['producer'] = _castList(data, 'Produced by.*?(<tr>.*?)</table>')
-        #credits['cast'] = _castList(data, 'Cast</b>.*?(<tr.*?)</table>')
-        credits['cast'] = []
-        soup = re.compile('Cast</b>.*?(<tr.*?)</table>').findall(data)
-        soup = BeautifulSoup(data)
-        cast = soup('table', {'class': 'cast'})
-        if cast:
-            cast = str(cast[0]).replace(u'\xa0', ' ')
-            names = re.compile('<a href="/name/nm.*?/">(.*?)</a>.*?</td><td class="char">(.*?)</td></tr>').findall(cast)
-            for name in names:
-                real_name = name[0]
-                role_name = name[1]
-                if role_name:
-                    role_name = role_name.split('(')[0].replace('/ ...','')
-                credits['cast'].append((stripTags(real_name), stripTags(role_name)))
+
+        def getNames(creditList):
+            return [stripTags(c[0]) for c in creditList]
+
+        credits['director'] = getNames(raw_credits['directors'])
+        credits['writer'] = getNames(raw_credits['writers'])
+        credits['producer'] = getNames(raw_credits['producers'])
+        credits['cast'] = [(stripTags(c[0]), stripTags(c[1])) for c in raw_credits['cast']]
         self.credits = credits
         return self.credits
 
-    def getPlot(self, forcereload = False):
-        if forcereload or not self.plotSource:
-            self.plotSource = getUrlUnicode(self.plotUrl)
-        return self.plotSource
-
     def parsePlot(self):
-        soup = BeautifulSoup(self.getPlot())
+        data = getUrlUnicode(self.plotUrl)
+        soup = BeautifulSoup(data)
         plot = soup('p', {'class':'plotpar'})
         if plot:
             plot = unicode(plot[0]).split('<i>')[0]
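After the hunk above, parseCredits (the method) just reshapes the raw sections from parseCredits (the module function) into the legacy keys. With hypothetical raw data, the mapping works like this:

    raw_credits = {
        'directors': [['Jim Jarmusch', '']],
        'writers': [['Jim Jarmusch', 'written by']],
        'producers': [['Alan Kleinberg', 'producer']],
        'cast': [['Tom Waits', 'Zack'], ['John Lurie', 'Jack']],
    }

    def getNames(creditList):
        return [c[0] for c in creditList]

    credits = {
        'director': getNames(raw_credits['directors']),
        'writer': getNames(raw_credits['writers']),
        'producer': getNames(raw_credits['producers']),
        'cast': [(c[0], c[1]) for c in raw_credits['cast']],
    }
    print credits['director']  # ['Jim Jarmusch']
    print credits['cast'][0]   # ('Tom Waits', 'Zack')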
@@ -302,15 +342,11 @@ class IMDb:
             plot = stripTags(plot).strip()
         self.plot = plot
         return plot
 
-    def getEpisodes(self, forcereload = False):
-        if forcereload or not self.episodesSource:
-            self.episodesSource = getUrlUnicode(self.episodesUrl)
-        return self.episodesSource
-
     def parseEpisodes(self):
         episodes = {}
-        cdata = self.getEpisodes().replace('\r\n', ' ')
+        data = getUrlUnicode(self.episodesUrl)
+        cdata = data.replace('\r\n', ' ')
         regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
         reg = re.compile(regexp, re.IGNORECASE)
         m = reg.findall(cdata)
@@ -340,26 +376,18 @@ class IMDb:
         self.episodes = episodes
         return self.episodes
 
-    def getLocations(self, forcereload = False):
-        if forcereload or not self.locationSource:
-            self.keywordSource = getUrlUnicode(self.locationUrl)
-        return self.keywordSource
-
     def parseLocations(self):
-        soup = BeautifulSoup(self.getLocations())
+        data = getUrlUnicode(self.locationUrl)
+        soup = BeautifulSoup(data)
         locations = []
         for key in soup('a', {'href': re.compile('^/List')}):
             locations.append(htmldecode(key.string))
         self.locations = locations
         return self.locations
 
-    def getKeywords(self, forcereload = False):
-        if forcereload or not self.keywordSource:
-            self.keywordSource = getUrlUnicode(self.keywordUrl)
-        return self.keywordSource
-
     def parseKeywords(self):
-        soup = BeautifulSoup(self.getKeywords())
+        data = getUrlUnicode(self.keywordUrl)
+        soup = BeautifulSoup(data)
         keywords = []
         for key in soup('a', {'href': re.compile('^/keyword/')}):
             k = htmldecode(key.string)
@@ -368,28 +396,23 @@ class IMDb:
         self.keywords = keywords
         return self.keywords
 
-    def getTrivia(self, forcereload = False):
-        if forcereload or not self.triviaSource:
-            self.triviaSource = getUrlUnicode(self.triviaUrl)
-        return self.triviaSource
-
     def parseTrivia(self):
+        data = getUrlUnicode(self.triviaUrl)
+        soup = BeautifulSoup(data)
+
         trivia = []
-        soup = BeautifulSoup(self.getTrivia())
         triviaList = []
         for i in soup('ul', {'class': "trivia"}):
             for t in i('li'):
                 t = str(t).replace('<br />', '').strip()
                 if t.startswith('<li>') and t.endswith('</li>'):
                     t = t[4:-5].strip()
                 trivia.append(t)
         self.trivia = trivia
         return self.trivia
 
-    def getConnections(self, forcereload = False):
-        if forcereload or not self.connectionsSource:
-            self.connectionsSource = getUrlUnicode(self.connectionsUrl)
-        return self.connectionsSource
+    def getConnections(self):
+        return getUrlUnicode(self.connectionsUrl)
 
     def parseConnections(self):
         connections = {}
@@ -404,10 +427,8 @@ class IMDb:
             connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})]
         return connections
 
-    def getReleaseinfo(self, forcereload = False):
-        if forcereload or not self.releaseinfoSource:
-            self.releaseinfoSource = getUrlUnicode(self.releaseinfoUrl)
-        return self.releaseinfoSource
+    def getReleaseinfo(self):
+        return getUrlUnicode(self.releaseinfoUrl)
 
     def parseReleaseinfo(self):
         soup = BeautifulSoup(self.getReleaseinfo())
@@ -424,12 +445,10 @@ class IMDb:
             except:
                 pass
         return None
 
-    def getBusiness(self, forcereload = False):
-        if forcereload or not self.businessSource:
-            self.businessSource = getUrlUnicode(self.businessUrl)
-        return self.businessSource
+    def getBusiness(self):
+        return getUrlUnicode(self.businessUrl)
 
     def parseBusiness(self):
         soup = BeautifulSoup(self.getBusiness())
         business = {'budget': 0, 'gross': 0, 'profit': 0}
@@ -449,12 +468,10 @@ class IMDb:
         if business['budget'] and business['gross']:
             business['profit'] = business['gross'] - business['budget']
         return business
 
-    def getExternalreviews(self, forcereload = False):
-        if forcereload or not self.externalreviewsSource:
-            self.externalreviewsSource = getUrlUnicode(self.externalreviewsUrl)
-        return self.externalreviewsSource
+    def getExternalreviews(self):
+        return getUrlUnicode(self.externalreviewsUrl)
 
     def parseExternalreviews(self):
         soup = BeautifulSoup(self.getExternalreviews())
         ol = soup('ol')
@@ -471,12 +488,10 @@ class IMDb:
                 pass
             return ret
         return {}
 
-    def getTrailer(self, forcereload = False):
-        if forcereload or not self.trailerSource:
-            self.trailerSource = getUrlUnicode(self.trailerUrl)
-        return self.trailerSource
+    def getTrailer(self):
+        return getUrlUnicode(self.trailerUrl)
 
     def parseTrailer(self):
         ret = {}
         soup = BeautifulSoup(self.getTrailer())
@@ -519,8 +534,8 @@ def guess(title, director=''):
         return None
     if return_url.startswith('http://www.imdb.com/title/tt'):
         return return_url[28:35]
     if data:
-        imdb_id = _getTerm(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
+        imdb_id = findRegexp(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
         if imdb_id:
             return imdb_id
 
@@ -538,7 +553,7 @@ def guess(title, director=''):
 def getEpisodeData(title, episode, show_url = None):
     '''
     Collect information about an episode.
 
     Returns dict with title, show, description and episode
     '''
     episodeData = {
@@ -559,13 +574,13 @@ def getEpisodeData(title, episode, show_url = None):
         episodeData['imdb'] = i['episodes'][episode]['imdb']
     return episodeData
 
-def getMovieStills(id):
-    data = getUrl("http://imdb.com/gallery/ss/%s" % id)
-    s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://i.imdb.com/Photos/Ss/%s/th-(.*?).jpg"''' % id).findall(data)
+def getMovieStills(imdbId):
+    data = getUrl("http://imdb.com/gallery/ss/%s" % imdbId)
+    s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://i.imdb.com/Photos/Ss/%s/th-(.*?).jpg"''' % imdbId).findall(data)
     stills = []
     for s in s_:
         if int(s[0]) > int(s[1]):
-            stills.append("http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (id, s[2]))
+            stills.append("http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (imdbId, s[2]))
     if not stills:
         s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://(.*?)p.jpg"''').findall(data)
         stills = []
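getMovieStills keeps only images whose width exceeds their height, which presumably filters landscape film stills from portrait photos. An illustrative call (import path and id are hypothetical; output depends on IMDb's gallery pages):

    from ox import imdb

    for url in imdb.getMovieStills('0118694'):
        print url  # e.g. http://i.imdb.com/Photos/Ss/0118694/....jpg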