add one dailymotion function

commit 7668ceafc1, parent 40185f89ab
2 changed files with 114 additions and 81 deletions
ox/dailymotion.py  (new file, 15 lines)

@@ -0,0 +1,15 @@
+import re
+from urllib import unquote
+from oxutils.cache import getUrl
+
+def getVideoUrl(url):
+    data = getUrl(url)
+    video = re.compile('''video", "(.*?)"''').findall(data)
+    for v in video:
+        v = unquote(v).split('@@')[0]
+        return "http://www.dailymotion.com" + v
+    return ''
+
+if __name__ == '__main__':
+    print getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms')
+    print getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms')
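Note: getVideoUrl scrapes the watch page rather than calling an API. getUrl fetches the HTML, the regex captures the percent-encoded value Dailymotion embeds after `video", "`, and the first hit is unquoted, stripped of its '@@' suffix, and joined back onto the site root. A minimal sketch of the decoding step (the captured value is made up for illustration):

    >>> from urllib import unquote
    >>> raw = '%2Fget%2F16%2F320x240%2Fflv%2F1234567.flv@@spark'  # hypothetical capture
    >>> unquote(raw).split('@@')[0]
    '/get/16/320x240/flv/1234567.flv'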
ox/imdb.py  (180 changed lines)

@@ -38,14 +38,22 @@ def getUrlBase(imdbId):
 
 def getRawMovieData(imdbId):
     imdbId = normalizeImdbId(imdbId)
-    data = dict()
-    data['title'] = getTitle(imdbId)
-    data['credits'] = getCredits(imdbId)
-    data['poster'] = getPoster(imdbId)
-    data['trailers'] = getMovieTrailers(imdbId)
-    data['companyCredits'] = getMovieCompanyCredits(imdbId)
+    data = getMovieInfo(imdbId)
+    data['credits'] = getMovieCredits(imdbId)
+    data['poster'] = getMoviePoster(imdbId)
+    data['connections'] = getMovieConnections(imdbId)
+    data['company credits'] = getMovieCompanyCredits(imdbId)
+    data['filming locations'] = getMovieLocations(imdbId)
+    data['movie connections'] = getMovieConnections(imdbId)
+    data['external reviews'] = getMovieExternalReviews(imdbId)
+    data['trivia'] = getMovieTrivia(imdbId)
+    data['keywords'] = getMovieKeywords(imdbId)
+    data['media'] = {}
+    data['media']['images'] = getMovieImages(imdbId)
+    data['media']['trailers'] = getMovieTrailers(imdbId)
+    return data
 
-def parseBase(imdbId):
+def getMovieInfo(imdbId):
     data = getUrl(getUrlBase(imdbId))
     soup = BeautifulSoup(data)
     info = dict()
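With this hunk getRawMovieData seeds the result from getMovieInfo() (the parsed base page, which now carries title and year) and layers the per-section scrapers on top, so the function finally returns the dict it builds. A hypothetical usage sketch (the id and printed fields are assumptions, not part of the diff):

    data = getRawMovieData('0133093')   # any 7-digit IMDb id
    print data['title'], data['year']
    print data['media'].keys()          # ['images', 'trailers']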
@@ -76,6 +84,7 @@ def parseBase(imdbId):
 
     #get Title
     title = ''
+    year = ''
     html_title = soup('div', {'id': 'tn15title'})
     if not html_title:
         html_title = soup('title')
@@ -84,8 +93,11 @@ def parseBase(imdbId):
     html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
     title = htmldecode(html_title)
     title = stripTags(title)
-    title = re.sub('\(\d\d\d\d\)', '', title)
-    title = re.sub('\(\d\d\d\d/I*\)', '', title)
+    year = findRegexp(title, '\((\d{4})\)')
+    if not year:
+        year = findRegexp(title, '\((\d{4})')
+    title = re.sub('\(\d{4}\)', '', title)
+    title = re.sub('\(\d{4}/I*\)', '', title)
     for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
         title = title.replace(t, '')
     title = title.strip()
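The year is now captured before the parenthesised year is stripped from the title: findRegexp (an oxutils helper) first tries a closed '(1999)' group and falls back to an open '(1999' so variants like '(1999/I)' still yield a year. A rough plain-re equivalent on a made-up title:

    import re
    title = 'Some Title (1999/I)'
    m = re.search('\((\d{4})\)', title) or re.search('\((\d{4})', title)
    year = m and m.group(1) or ''   # -> '1999'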
@@ -94,14 +106,34 @@ def parseBase(imdbId):
     if title.startswith('"') and title.endswith('"'):
         title = title[1:-1]
     info['title'] = title
+    info['year'] = year
+    '''
+    #Rating
+    rating = findRegexp(data, '<b>(.*?)/10</b>')
+    if rating:
+        info['rating'] = int(float(rating) * 1000)
+    else:
+        info['rating'] = -1
+
+    #Votes
+    votes = findRegexp(data, '<small>\(<a href="ratings">(.*?) votes</a>\)</small>')
+    if votes:
+        info['votes'] = int(votes.replace(',', ''))
+    else:
+        info['votes'] = -1
+    '''
     return info
 
-def getPoster(imdbId):
-    info = parseBase(imdbId)
+def getMoviePoster(imdbId):
+    info = getMovieInfo(imdbId)
     return info['poster']
 
-def getTitle(imdbId):
-    info = parseBase(imdbId)
+def getMovieYear(imdbId):
+    info = getMovieInfo(imdbId)
+    return info['year']
+
+def getMovieTitle(imdbId):
+    info = getMovieInfo(imdbId)
     return info['title']
 
 def creditList(data, section=None):
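The renamed getters stay deliberately thin: each one re-runs getMovieInfo and returns a single field, and since oxutils.cache.getUrl caches fetches, a repeated call only costs a re-parse. Hypothetical usage (id assumed):

    print getMovieTitle('0133093')   # fetched once, served from cache afterwards
    print getMovieYear('0133093')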
@@ -118,7 +150,7 @@ def creditList(data, section=None):
         credits.append(c)
     return credits
 
-def getCredits(imdbId):
+def getMovieCredits(imdbId):
     credits = dict()
     url = "%s/fullcredits" % getUrlBase(imdbId)
     data = getUrlUnicode(url)
@@ -216,6 +248,52 @@ def getMovieTrivia(imdbId):
         trivia.append(t)
     return trivia
 
+def getMovieConnections(imdbId):
+    url = "%s/movieconnections" % getUrlBase(imdbId)
+    data = getUrlUnicode(url)
+    soup = BeautifulSoup(data)
+    connections = {}
+    content = soup('div', {'id': 'tn15content'})[0]
+    blocks = str(content).split('<h5>')[1:]
+    for c in blocks:
+        connection = c.split('</h5>')[0]
+        cs = BeautifulSoup(c)
+        if connection:
+            #relation -> list of imdb ids
+            connections[connection] = [findRegexp(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})]
+    return connections
+
+def getMovieKeywords(imdbId):
+    url = "%s/keywords" % getUrlBase(imdbId)
+    data = getUrlUnicode(url)
+    soup = BeautifulSoup(data)
+    keywords = []
+    for key in soup('a', {'href': re.compile('^/keyword/')}):
+        k = htmldecode(key.string)
+        k = k.replace(u'\xa0', ' ')
+        keywords.append(k)
+    return keywords
+
+
+def getMovieExternalReviews(imdbId):
+    url = "%s/externalreviews" % getUrlBase(imdbId)
+    data = getUrlUnicode(url)
+    soup = BeautifulSoup(data)
+    ol = soup('ol')
+    if ol:
+        ol = ol[0]
+        ret = {}
+        for li in ol('li'):
+            try:
+                a = li('a')[0]
+                href = a.get('href')
+                txt = a.contents[0]
+                ret[href] = txt
+            except:
+                pass
+        return ret
+    return {}
+
 '''the old code below'''
 
 class IMDb:
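Note the id extraction in the new getMovieConnections: findRegexp(a.get('href'), "\d{7}") pulls the seven-digit title id wherever it appears in the href, whereas the old parseConnections (removed below) sliced a.get('href')[-8:-1], which only works when the href ends directly after the id. A comparison on illustrative hrefs, with a stand-in for the oxutils helper:

    import re
    findRegexp = lambda s, p: (re.compile(p).findall(s) or [''])[0]  # stand-in, assumed behaviour
    print findRegexp('/title/tt0094765/', "\d{7}")        # 0094765
    print '/title/tt0094765/trivia'[-8:-1]                # 5/trivi  (slice misfires)
    print findRegexp('/title/tt0094765/trivia', "\d{7}")  # 0094765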
@@ -224,14 +302,10 @@ class IMDb:
         self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
 
         self.businessUrl = "%sbusiness" % self.pageUrl
-        self.connectionsUrl = "%smovieconnections" % self.pageUrl
         self.creditsUrl = "%sfullcredits" % self.pageUrl
         self.episodesUrl = "%sepisodes" % self.pageUrl
-        self.keywordUrl = "%skeywords" % self.pageUrl
         self.plotUrl = "%splotsummary" % self.pageUrl
         self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
-        self.locationUrl = "%slocations" % self.pageUrl
-        self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
 
     def getPage(self):
         return getUrlUnicode(self.pageUrl)
@@ -293,7 +367,7 @@ class IMDb:
         return parsed_value
 
     def parseTitle(self):
-        title = getTitle(self.imdb)
+        title = getMovieTitle(self.imdb)
         title = normalizeTitle(title)
         if title.startswith('"') and title.find('"',1) > 0 and \
            title.find('"',1) == title.rfind('"'):
@@ -328,7 +402,7 @@ class IMDb:
         data = self.getPage()
         IMDbDict ={}
         #Poster
-        IMDbDict['poster'] = getPoster(self.imdb)
+        IMDbDict['poster'] = getMoviePoster(self.imdb)
         if not IMDbDict['poster']:
             IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
         #Title, Year
@@ -373,16 +447,16 @@ class IMDb:
             IMDbDict['tvshow'] = False
         IMDbDict['credits'] = self.getCredits()
         IMDbDict['plot'] = self.parsePlot()
-        IMDbDict['keywords'] = self.parseKeywords()
+        IMDbDict['keywords'] = getMovieKeywords(self.imdb)
 
         IMDbDict['trivia'] = getMovieTrivia(self.imdb)
-        IMDbDict['connections'] = self.parseConnections()
-        IMDbDict['locations'] = self.parseLocations()
+        IMDbDict['connections'] = getMovieConnections(self.imdb)
+        IMDbDict['locations'] = getMovieLocations(self.imdb)
         IMDbDict['release_date'] = self.parseReleaseinfo()
         IMDbDict['business'] = self.parseBusiness()
-        IMDbDict['reviews'] = self.parseExternalreviews()
+        IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
         IMDbDict['stills'] = getMovieStills(self.imdb)
-        #IMDbDict['trailer'] = self.parseTrailer()
+        #IMDbDict['trailer'] = getMovieTrailer(self.imdb)
         self.IMDbDict = IMDbDict
 
         if IMDbDict['episode_of']:
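After this hunk IMDb.parse() is largely a facade over the module-level scrapers, which can also be used without instantiating the class (id and connection labels illustrative):

    print getMovieConnections('0094765').keys()   # e.g. ['Featured in', 'References']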
@@ -452,42 +526,6 @@ class IMDb:
         self.episodes = episodes
         return self.episodes
 
-    def parseLocations(self):
-        data = getUrlUnicode(self.locationUrl)
-        soup = BeautifulSoup(data)
-        locations = []
-        for key in soup('a', {'href': re.compile('^/List')}):
-            locations.append(htmldecode(key.string))
-        self.locations = locations
-        return self.locations
-
-    def parseKeywords(self):
-        data = getUrlUnicode(self.keywordUrl)
-        soup = BeautifulSoup(data)
-        keywords = []
-        for key in soup('a', {'href': re.compile('^/keyword/')}):
-            k = htmldecode(key.string)
-            k = k.replace(u'\xa0', ' ')
-            keywords.append(k)
-        self.keywords = keywords
-        return self.keywords
-
-    def getConnections(self):
-        return getUrlUnicode(self.connectionsUrl)
-
-    def parseConnections(self):
-        connections = {}
-        soup = BeautifulSoup(self.getConnections())
-        content = soup('div', {'id': 'tn15content'})[0]
-        blocks = str(content).split('<h5>')[1:]
-        for c in blocks:
-            connection = c.split('</h5>')[0]
-            cs = BeautifulSoup(c)
-            if connection:
-                #relation -> list of imdb ids
-                connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})]
-        return connections
-
     def getReleaseinfo(self):
         return getUrlUnicode(self.releaseinfoUrl)
 
@@ -530,26 +568,6 @@ class IMDb:
             business['profit'] = business['gross'] - business['budget']
         return business
 
-    def getExternalreviews(self):
-        return getUrlUnicode(self.externalreviewsUrl)
-
-    def parseExternalreviews(self):
-        soup = BeautifulSoup(self.getExternalreviews())
-        ol = soup('ol')
-        if ol:
-            ol = ol[0]
-            ret = {}
-            for li in ol('li'):
-                try:
-                    a = li('a')[0]
-                    href = a.get('href')
-                    txt = a.contents[0]
-                    ret[href] = txt
-                except:
-                    pass
-            return ret
-        return {}
-
 def guess(title, director=''):
     #FIXME: proper file -> title
     title = title.split('-')[0]