add one dailymotion function

This commit is contained in:
j 2008-04-30 15:31:50 +02:00
parent 40185f89ab
commit 7668ceafc1
2 changed files with 114 additions and 81 deletions

15
ox/dailymotion.py Normal file
View file

@ -0,0 +1,15 @@
import re
from urllib import unquote
from oxutils.cache import getUrl
def getVideoUrl(url):
    """Return the absolute dailymotion.com URL of the video found at *url*.

    The page is fetched with getUrl and searched for the first occurrence
    of the video pattern. The captured value is URL-unquoted and cut at the
    first '@@'; the remainder is appended to "http://www.dailymotion.com".
    An empty string is returned when the pattern does not occur in the page.
    """
    page = getUrl(url)
    found = re.search('''video", "(.*?)"''', page)
    if found is None:
        return ''
    # Only the part before the first '@@' separator is kept.
    path = unquote(found.group(1)).split('@@')[0]
    return "http://www.dailymotion.com" + path
if __name__ == '__main__':
    # Manual smoke test: resolve two known pages and print the resulting URLs.
    for page_url in (
        'http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms',
        'http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms',
    ):
        print(getVideoUrl(page_url))

View file

@ -38,14 +38,22 @@ def getUrlBase(imdbId):
def getRawMovieData(imdbId): def getRawMovieData(imdbId):
imdbId = normalizeImdbId(imdbId) imdbId = normalizeImdbId(imdbId)
data = dict() data = getMovieInfo(imdbId)
data['title'] = getTitle(imdbId) data['credits'] = getMovieCredits(imdbId)
data['credits'] = getCredits(imdbId) data['poster'] = getMoviePoster(imdbId)
data['poster'] = getPoster(imdbId) data['connections'] = getMovieConnections(imdbId)
data['trailers'] = getMovieTrailers(imdbId) data['company credits'] = getMovieCompanyCredits(imdbId)
data['companyCredits'] = getMovieCompanyCredits(imdbId) data['filming locations'] = getMovieLocations(imdbId)
data['movie connections'] = getMovieConnections(imdbId)
data['external reviews'] = getMovieExternalReviews(imdbId)
data['trivia'] = getMovieTrivia(imdbId)
data['keywords'] = getMovieKeywords(imdbId)
data['media'] = {}
data['media']['images'] = getMovieImages(imdbId)
data['media']['trailers'] = getMovieTrailers(imdbId)
return data
def parseBase(imdbId): def getMovieInfo(imdbId):
data = getUrl(getUrlBase(imdbId)) data = getUrl(getUrlBase(imdbId))
soup = BeautifulSoup(data) soup = BeautifulSoup(data)
info = dict() info = dict()
@ -76,6 +84,7 @@ def parseBase(imdbId):
#get Title #get Title
title = '' title = ''
year = ''
html_title = soup('div', {'id': 'tn15title'}) html_title = soup('div', {'id': 'tn15title'})
if not html_title: if not html_title:
html_title = soup('title') html_title = soup('title')
@ -84,8 +93,11 @@ def parseBase(imdbId):
html_title = html_title.replace('<br />', ' ').replace(' ', ' ') html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
title = htmldecode(html_title) title = htmldecode(html_title)
title = stripTags(title) title = stripTags(title)
title = re.sub('\(\d\d\d\d\)', '', title) year = findRegexp(title, '\((\d{4})\)')
title = re.sub('\(\d\d\d\d/I*\)', '', title) if not year:
year = findRegexp(title, '\((\d{4})')
title = re.sub('\(\d{4}\)', '', title)
title = re.sub('\(\d{4}/I*\)', '', title)
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'): for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
title = title.replace(t, '') title = title.replace(t, '')
title = title.strip() title = title.strip()
@ -94,14 +106,34 @@ def parseBase(imdbId):
if title.startswith('"') and title.endswith('"'): if title.startswith('"') and title.endswith('"'):
title = title[1:-1] title = title[1:-1]
info['title'] = title info['title'] = title
info['year'] = year
'''
#Rating
rating = findRegexp(data, '<b>(.*?)/10</b>')
if rating:
info['rating'] = int(float(rating) * 1000)
else:
info['rating'] = -1
#Votes
votes = findRegexp(data, '<small>\(<a href="ratings">(.*?) votes</a>\)</small>')
if votes:
info['votes'] = int(votes.replace(',', ''))
else:
info['votes'] = -1
'''
return info return info
def getPoster(imdbId): def getMoviePoster(imdbId):
info = parseBase(imdbId) info = getMovieInfo(imdbId)
return info['poster'] return info['poster']
def getTitle(imdbId): def getMovieYear(imdbId):
info = parseBase(imdbId) info = getMovieInfo(imdbId)
return info['year']
def getMovieTitle(imdbId):
info = getMovieInfo(imdbId)
return info['title'] return info['title']
def creditList(data, section=None): def creditList(data, section=None):
@ -118,7 +150,7 @@ def creditList(data, section=None):
credits.append(c) credits.append(c)
return credits return credits
def getCredits(imdbId): def getMovieCredits(imdbId):
credits = dict() credits = dict()
url = "%s/fullcredits" % getUrlBase(imdbId) url = "%s/fullcredits" % getUrlBase(imdbId)
data = getUrlUnicode(url) data = getUrlUnicode(url)
@ -216,6 +248,52 @@ def getMovieTrivia(imdbId):
trivia.append(t) trivia.append(t)
return trivia return trivia
def getMovieConnections(imdbId):
    """Return the movie connections of a title, grouped by relation.

    Loads the title's ``movieconnections`` page, takes the first div with
    id ``tn15content`` and splits its markup at every ``<h5>``. For each
    piece, the text up to ``</h5>`` is the relation; when it is non-empty it
    is mapped to a list with one findRegexp result (seven-digit pattern) per
    link whose href matches ``/title/tt``.

    Raises IndexError when the page has no ``tn15content`` div.
    """
    page = getUrlUnicode("%s/movieconnections" % getUrlBase(imdbId))
    content = BeautifulSoup(page)('div', {'id': 'tn15content'})[0]
    title_link = re.compile('/title/tt')
    connections = {}
    # The piece before the first <h5> carries no relation heading; drop it.
    for block in str(content).split('<h5>')[1:]:
        relation = block.split('</h5>')[0]
        section = BeautifulSoup(block)
        if not relation:
            continue
        # relation -> list of imdb ids
        ids = []
        for link in section('a', {'href': title_link}):
            ids.append(findRegexp(link.get('href'), "\d{7}"))
        connections[relation] = ids
    return connections
def getMovieKeywords(imdbId):
    """Return the keywords listed on a title's ``keywords`` page.

    Every link whose href starts with ``/keyword/`` contributes one entry:
    its string passed through htmldecode, with non-breaking spaces (U+00A0)
    replaced by plain spaces. Entries keep the order of the page.
    """
    page = getUrlUnicode("%s/keywords" % getUrlBase(imdbId))
    keyword_links = BeautifulSoup(page)('a', {'href': re.compile('^/keyword/')})
    return [
        htmldecode(link.string).replace(u'\xa0', ' ')
        for link in keyword_links
    ]
def getMovieExternalReviews(imdbId):
    """Return the external reviews of a title as a dict of href -> link content.

    Loads the title's ``externalreviews`` page and reads the first ``<ol>``
    on it. For every ``<li>`` in that list, the first ``<a>`` contributes
    one entry: its ``href`` attribute mapped to its first child node.
    List items without a link, or whose link has no content, are skipped.

    Returns an empty dict when the page contains no ``<ol>``.
    """
    page = getUrlUnicode("%s/externalreviews" % getUrlBase(imdbId))
    lists = BeautifulSoup(page)('ol')
    if not lists:
        return {}
    reviews = {}
    for item in lists[0]('li'):
        anchors = item('a')
        # Skip items with no link or an empty link explicitly; a catch-all
        # ``except`` here would also hide unrelated errors and interrupts.
        if not anchors or not anchors[0].contents:
            continue
        link = anchors[0]
        reviews[link.get('href')] = link.contents[0]
    return reviews
'''the old code below''' '''the old code below'''
class IMDb: class IMDb:
@ -224,14 +302,10 @@ class IMDb:
self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
self.businessUrl = "%sbusiness" % self.pageUrl self.businessUrl = "%sbusiness" % self.pageUrl
self.connectionsUrl = "%smovieconnections" % self.pageUrl
self.creditsUrl = "%sfullcredits" % self.pageUrl self.creditsUrl = "%sfullcredits" % self.pageUrl
self.episodesUrl = "%sepisodes" % self.pageUrl self.episodesUrl = "%sepisodes" % self.pageUrl
self.keywordUrl = "%skeywords" % self.pageUrl
self.plotUrl = "%splotsummary" % self.pageUrl self.plotUrl = "%splotsummary" % self.pageUrl
self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
self.locationUrl = "%slocations" % self.pageUrl
self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
def getPage(self): def getPage(self):
return getUrlUnicode(self.pageUrl) return getUrlUnicode(self.pageUrl)
@ -293,7 +367,7 @@ class IMDb:
return parsed_value return parsed_value
def parseTitle(self): def parseTitle(self):
title = getTitle(self.imdb) title = getMovieTitle(self.imdb)
title = normalizeTitle(title) title = normalizeTitle(title)
if title.startswith('"') and title.find('"',1) > 0 and \ if title.startswith('"') and title.find('"',1) > 0 and \
title.find('"',1) == title.rfind('"'): title.find('"',1) == title.rfind('"'):
@ -328,7 +402,7 @@ class IMDb:
data = self.getPage() data = self.getPage()
IMDbDict ={} IMDbDict ={}
#Poster #Poster
IMDbDict['poster'] = getPoster(self.imdb) IMDbDict['poster'] = getMoviePoster(self.imdb)
if not IMDbDict['poster']: if not IMDbDict['poster']:
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif' IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
#Title, Year #Title, Year
@ -373,16 +447,16 @@ class IMDb:
IMDbDict['tvshow'] = False IMDbDict['tvshow'] = False
IMDbDict['credits'] = self.getCredits() IMDbDict['credits'] = self.getCredits()
IMDbDict['plot'] = self.parsePlot() IMDbDict['plot'] = self.parsePlot()
IMDbDict['keywords'] = self.parseKeywords() IMDbDict['keywords'] = getMovieKeywords(self.imdb)
IMDbDict['trivia'] = getMovieTrivia(self.imdb) IMDbDict['trivia'] = getMovieTrivia(self.imdb)
IMDbDict['connections'] = self.parseConnections() IMDbDict['connections'] = getMovieConnections(self.imdb)
IMDbDict['locations'] = self.parseLocations() IMDbDict['locations'] = getMovieLocations(self.imdb)
IMDbDict['release_date'] = self.parseReleaseinfo() IMDbDict['release_date'] = self.parseReleaseinfo()
IMDbDict['business'] = self.parseBusiness() IMDbDict['business'] = self.parseBusiness()
IMDbDict['reviews'] = self.parseExternalreviews() IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
IMDbDict['stills'] = getMovieStills(self.imdb) IMDbDict['stills'] = getMovieStills(self.imdb)
#IMDbDict['trailer'] = self.parseTrailer() #IMDbDict['trailer'] = getMovieTrailer(self.imdb)
self.IMDbDict = IMDbDict self.IMDbDict = IMDbDict
if IMDbDict['episode_of']: if IMDbDict['episode_of']:
@ -452,42 +526,6 @@ class IMDb:
self.episodes = episodes self.episodes = episodes
return self.episodes return self.episodes
def parseLocations(self):
    """Fetch self.locationUrl and return the locations linked on that page.

    Each link whose href starts with ``/List`` contributes its string,
    passed through htmldecode. The list is also stored on self.locations.
    """
    page = getUrlUnicode(self.locationUrl)
    location_links = BeautifulSoup(page)('a', {'href': re.compile('^/List')})
    self.locations = [htmldecode(link.string) for link in location_links]
    return self.locations
def parseKeywords(self):
    """Fetch self.keywordUrl and return the keywords linked on that page.

    Each link whose href starts with ``/keyword/`` contributes its string,
    passed through htmldecode, with non-breaking spaces (U+00A0) replaced
    by plain spaces. The list is also stored on self.keywords.
    """
    page = getUrlUnicode(self.keywordUrl)
    keyword_links = BeautifulSoup(page)('a', {'href': re.compile('^/keyword/')})
    self.keywords = [
        htmldecode(link.string).replace(u'\xa0', ' ')
        for link in keyword_links
    ]
    return self.keywords
def getConnections(self):
    """Return the result of getUrlUnicode for self.connectionsUrl."""
    connections_page = getUrlUnicode(self.connectionsUrl)
    return connections_page
def parseConnections(self):
    """Return the movie connections of this title, grouped by relation.

    Parses the page returned by self.getConnections(), takes the first div
    with id ``tn15content`` and splits its markup at every ``<h5>``. For
    each piece, the text up to ``</h5>`` is the relation; when it is
    non-empty it is mapped to a list holding, for every link whose href
    matches ``/title/tt``, the seven characters that precede the href's
    last character.

    Raises IndexError when the page has no ``tn15content`` div.
    """
    content = BeautifulSoup(self.getConnections())('div', {'id': 'tn15content'})[0]
    title_link = re.compile('/title/tt')
    connections = {}
    # The piece before the first <h5> carries no relation heading; drop it.
    for block in str(content).split('<h5>')[1:]:
        relation = block.split('</h5>')[0]
        section = BeautifulSoup(block)
        if not relation:
            continue
        # relation -> list of imdb ids
        ids = []
        for link in section('a', {'href': title_link}):
            ids.append(link.get('href')[-8:-1])
        connections[relation] = ids
    return connections
def getReleaseinfo(self): def getReleaseinfo(self):
return getUrlUnicode(self.releaseinfoUrl) return getUrlUnicode(self.releaseinfoUrl)
@ -530,26 +568,6 @@ class IMDb:
business['profit'] = business['gross'] - business['budget'] business['profit'] = business['gross'] - business['budget']
return business return business
def getExternalreviews(self):
    """Return the result of getUrlUnicode for self.externalreviewsUrl."""
    reviews_page = getUrlUnicode(self.externalreviewsUrl)
    return reviews_page
def parseExternalreviews(self):
    """Return the external reviews of this title as a dict of href -> link content.

    Parses the page returned by self.getExternalreviews() and reads the
    first ``<ol>`` on it. For every ``<li>`` in that list, the first ``<a>``
    contributes one entry: its ``href`` attribute mapped to its first child
    node. List items without a link, or whose link has no content, are
    skipped.

    Returns an empty dict when the page contains no ``<ol>``.
    """
    lists = BeautifulSoup(self.getExternalreviews())('ol')
    if not lists:
        return {}
    reviews = {}
    for item in lists[0]('li'):
        anchors = item('a')
        # Skip items with no link or an empty link explicitly; a catch-all
        # ``except`` here would also hide unrelated errors and interrupts.
        if not anchors or not anchors[0].contents:
            continue
        link = anchors[0]
        reviews[link.get('href')] = link.contents[0]
    return reviews
def guess(title, director=''): def guess(title, director=''):
#FIXME: proper file -> title #FIXME: proper file -> title
title = title.split('-')[0] title = title.split('-')[0]