add one dailymotion function
This commit is contained in:
parent
40185f89ab
commit
7668ceafc1
2 changed files with 114 additions and 81 deletions
15
ox/dailymotion.py
Normal file
15
ox/dailymotion.py
Normal file
|
@@ -0,0 +1,15 @@
|
|||
import re
|
||||
from urllib import unquote
|
||||
from oxutils.cache import getUrl
|
||||
|
||||
def getVideoUrl(url):
    """Fetch a Dailymotion page and return its first video URL, or '' if none."""
    page = getUrl(url)
    matches = re.findall('''video", "(.*?)"''', page)
    if matches:
        # the value is URL-encoded and carries extra data after '@@'
        path = unquote(matches[0]).split('@@')[0]
        return "http://www.dailymotion.com" + path
    return ''
|
||||
|
||||
if __name__ == '__main__':
|
||||
print getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms')
|
||||
print getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms')
|
180
ox/imdb.py
180
ox/imdb.py
|
@ -38,14 +38,22 @@ def getUrlBase(imdbId):
|
|||
|
||||
def getRawMovieData(imdbId):
|
||||
imdbId = normalizeImdbId(imdbId)
|
||||
data = dict()
|
||||
data['title'] = getTitle(imdbId)
|
||||
data['credits'] = getCredits(imdbId)
|
||||
data['poster'] = getPoster(imdbId)
|
||||
data['trailers'] = getMovieTrailers(imdbId)
|
||||
data['companyCredits'] = getMovieCompanyCredits(imdbId)
|
||||
data = getMovieInfo(imdbId)
|
||||
data['credits'] = getMovieCredits(imdbId)
|
||||
data['poster'] = getMoviePoster(imdbId)
|
||||
data['connections'] = getMovieConnections(imdbId)
|
||||
data['company credits'] = getMovieCompanyCredits(imdbId)
|
||||
data['filming locations'] = getMovieLocations(imdbId)
|
||||
data['movie connections'] = getMovieConnections(imdbId)
|
||||
data['external reviews'] = getMovieExternalReviews(imdbId)
|
||||
data['trivia'] = getMovieTrivia(imdbId)
|
||||
data['keywords'] = getMovieKeywords(imdbId)
|
||||
data['media'] = {}
|
||||
data['media']['images'] = getMovieImages(imdbId)
|
||||
data['media']['trailers'] = getMovieTrailers(imdbId)
|
||||
return data
|
||||
|
||||
def parseBase(imdbId):
|
||||
def getMovieInfo(imdbId):
|
||||
data = getUrl(getUrlBase(imdbId))
|
||||
soup = BeautifulSoup(data)
|
||||
info = dict()
|
||||
|
@ -76,6 +84,7 @@ def parseBase(imdbId):
|
|||
|
||||
#get Title
|
||||
title = ''
|
||||
year = ''
|
||||
html_title = soup('div', {'id': 'tn15title'})
|
||||
if not html_title:
|
||||
html_title = soup('title')
|
||||
|
@ -84,8 +93,11 @@ def parseBase(imdbId):
|
|||
html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
|
||||
title = htmldecode(html_title)
|
||||
title = stripTags(title)
|
||||
title = re.sub('\(\d\d\d\d\)', '', title)
|
||||
title = re.sub('\(\d\d\d\d/I*\)', '', title)
|
||||
year = findRegexp(title, '\((\d{4})\)')
|
||||
if not year:
|
||||
year = findRegexp(title, '\((\d{4})')
|
||||
title = re.sub('\(\d{4}\)', '', title)
|
||||
title = re.sub('\(\d{4}/I*\)', '', title)
|
||||
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
|
||||
title = title.replace(t, '')
|
||||
title = title.strip()
|
||||
|
@ -94,14 +106,34 @@ def parseBase(imdbId):
|
|||
if title.startswith('"') and title.endswith('"'):
|
||||
title = title[1:-1]
|
||||
info['title'] = title
|
||||
info['year'] = year
|
||||
'''
|
||||
#Rating
|
||||
rating = findRegexp(data, '<b>(.*?)/10</b>')
|
||||
if rating:
|
||||
info['rating'] = int(float(rating) * 1000)
|
||||
else:
|
||||
info['rating'] = -1
|
||||
|
||||
#Votes
|
||||
votes = findRegexp(data, '<small>\(<a href="ratings">(.*?) votes</a>\)</small>')
|
||||
if votes:
|
||||
info['votes'] = int(votes.replace(',', ''))
|
||||
else:
|
||||
info['votes'] = -1
|
||||
'''
|
||||
return info
|
||||
|
||||
def getPoster(imdbId):
|
||||
info = parseBase(imdbId)
|
||||
def getMoviePoster(imdbId):
|
||||
info = getMovieInfo(imdbId)
|
||||
return info['poster']
|
||||
|
||||
def getTitle(imdbId):
|
||||
info = parseBase(imdbId)
|
||||
def getMovieYear(imdbId):
|
||||
info = getMovieInfo(imdbId)
|
||||
return info['year']
|
||||
|
||||
def getMovieTitle(imdbId):
|
||||
info = getMovieInfo(imdbId)
|
||||
return info['title']
|
||||
|
||||
def creditList(data, section=None):
|
||||
|
@ -118,7 +150,7 @@ def creditList(data, section=None):
|
|||
credits.append(c)
|
||||
return credits
|
||||
|
||||
def getCredits(imdbId):
|
||||
def getMovieCredits(imdbId):
|
||||
credits = dict()
|
||||
url = "%s/fullcredits" % getUrlBase(imdbId)
|
||||
data = getUrlUnicode(url)
|
||||
|
@@ -216,6 +248,52 @@ def getMovieTrivia(imdbId):
|
|||
trivia.append(t)
|
||||
return trivia
|
||||
|
||||
def getMovieConnections(imdbId):
    """Scrape the movieconnections page; return {relation name: [imdb ids]}."""
    page = getUrlUnicode("%s/movieconnections" % getUrlBase(imdbId))
    main = BeautifulSoup(page)('div', {'id': 'tn15content'})[0]
    result = {}
    # the page groups each relation under an <h5> heading
    for chunk in str(main).split('<h5>')[1:]:
        relation = chunk.split('</h5>')[0]
        if not relation:
            continue
        links = BeautifulSoup(chunk)('a', {'href': re.compile('/title/tt')})
        # each title link's href carries a 7-digit imdb id
        result[relation] = [findRegexp(a.get('href'), "\d{7}") for a in links]
    return result
|
||||
|
||||
def getMovieKeywords(imdbId):
    """Scrape the keywords page for a title; return a list of keyword strings."""
    data = getUrlUnicode("%s/keywords" % getUrlBase(imdbId))
    anchors = BeautifulSoup(data)('a', {'href': re.compile('^/keyword/')})
    # decode HTML entities and normalize non-breaking spaces to plain spaces
    return [htmldecode(a.string).replace(u'\xa0', ' ') for a in anchors]
|
||||
|
||||
|
||||
def getMovieExternalReviews(imdbId):
    """Scrape the externalreviews page; return {review url: review label}.

    Returns an empty dict when the page contains no ordered review list.
    """
    url = "%s/externalreviews" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
    soup = BeautifulSoup(data)
    ol = soup('ol')
    if not ol:
        return {}
    ret = {}
    for li in ol[0]('li'):
        # Best-effort per item: skip entries without a usable link, but
        # catch only the lookup errors this parsing can raise instead of
        # a bare except that would also swallow KeyboardInterrupt etc.
        try:
            a = li('a')[0]
            href = a.get('href')
            txt = a.contents[0]
        except (IndexError, AttributeError):
            continue
        ret[href] = txt
    return ret
|
||||
|
||||
'''the old code below'''
|
||||
|
||||
class IMDb:
|
||||
|
@ -224,14 +302,10 @@ class IMDb:
|
|||
self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
|
||||
|
||||
self.businessUrl = "%sbusiness" % self.pageUrl
|
||||
self.connectionsUrl = "%smovieconnections" % self.pageUrl
|
||||
self.creditsUrl = "%sfullcredits" % self.pageUrl
|
||||
self.episodesUrl = "%sepisodes" % self.pageUrl
|
||||
self.keywordUrl = "%skeywords" % self.pageUrl
|
||||
self.plotUrl = "%splotsummary" % self.pageUrl
|
||||
self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
|
||||
self.locationUrl = "%slocations" % self.pageUrl
|
||||
self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
|
||||
|
||||
def getPage(self):
|
||||
return getUrlUnicode(self.pageUrl)
|
||||
|
@ -293,7 +367,7 @@ class IMDb:
|
|||
return parsed_value
|
||||
|
||||
def parseTitle(self):
|
||||
title = getTitle(self.imdb)
|
||||
title = getMovieTitle(self.imdb)
|
||||
title = normalizeTitle(title)
|
||||
if title.startswith('"') and title.find('"',1) > 0 and \
|
||||
title.find('"',1) == title.rfind('"'):
|
||||
|
@ -328,7 +402,7 @@ class IMDb:
|
|||
data = self.getPage()
|
||||
IMDbDict ={}
|
||||
#Poster
|
||||
IMDbDict['poster'] = getPoster(self.imdb)
|
||||
IMDbDict['poster'] = getMoviePoster(self.imdb)
|
||||
if not IMDbDict['poster']:
|
||||
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
|
||||
#Title, Year
|
||||
|
@ -373,16 +447,16 @@ class IMDb:
|
|||
IMDbDict['tvshow'] = False
|
||||
IMDbDict['credits'] = self.getCredits()
|
||||
IMDbDict['plot'] = self.parsePlot()
|
||||
IMDbDict['keywords'] = self.parseKeywords()
|
||||
IMDbDict['keywords'] = getMovieKeywords(self.imdb)
|
||||
|
||||
IMDbDict['trivia'] = getMovieTrivia(self.imdb)
|
||||
IMDbDict['connections'] = self.parseConnections()
|
||||
IMDbDict['locations'] = self.parseLocations()
|
||||
IMDbDict['connections'] = getMovieConnections(self.imdb)
|
||||
IMDbDict['locations'] = getMovieLocations(self.imdb)
|
||||
IMDbDict['release_date'] = self.parseReleaseinfo()
|
||||
IMDbDict['business'] = self.parseBusiness()
|
||||
IMDbDict['reviews'] = self.parseExternalreviews()
|
||||
IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
|
||||
IMDbDict['stills'] = getMovieStills(self.imdb)
|
||||
#IMDbDict['trailer'] = self.parseTrailer()
|
||||
#IMDbDict['trailer'] = getMovieTrailer(self.imdb)
|
||||
self.IMDbDict = IMDbDict
|
||||
|
||||
if IMDbDict['episode_of']:
|
||||
|
@ -452,42 +526,6 @@ class IMDb:
|
|||
self.episodes = episodes
|
||||
return self.episodes
|
||||
|
||||
def parseLocations(self):
|
||||
data = getUrlUnicode(self.locationUrl)
|
||||
soup = BeautifulSoup(data)
|
||||
locations = []
|
||||
for key in soup('a', {'href': re.compile('^/List')}):
|
||||
locations.append(htmldecode(key.string))
|
||||
self.locations = locations
|
||||
return self.locations
|
||||
|
||||
def parseKeywords(self):
|
||||
data = getUrlUnicode(self.keywordUrl)
|
||||
soup = BeautifulSoup(data)
|
||||
keywords = []
|
||||
for key in soup('a', {'href': re.compile('^/keyword/')}):
|
||||
k = htmldecode(key.string)
|
||||
k = k.replace(u'\xa0', ' ')
|
||||
keywords.append(k)
|
||||
self.keywords = keywords
|
||||
return self.keywords
|
||||
|
||||
def getConnections(self):
|
||||
return getUrlUnicode(self.connectionsUrl)
|
||||
|
||||
def parseConnections(self):
|
||||
connections = {}
|
||||
soup = BeautifulSoup(self.getConnections())
|
||||
content = soup('div', {'id': 'tn15content'})[0]
|
||||
blocks = str(content).split('<h5>')[1:]
|
||||
for c in blocks:
|
||||
connection = c.split('</h5>')[0]
|
||||
cs = BeautifulSoup(c)
|
||||
if connection:
|
||||
#relation -> list of imdb ids
|
||||
connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})]
|
||||
return connections
|
||||
|
||||
def getReleaseinfo(self):
|
||||
return getUrlUnicode(self.releaseinfoUrl)
|
||||
|
||||
|
@ -530,26 +568,6 @@ class IMDb:
|
|||
business['profit'] = business['gross'] - business['budget']
|
||||
return business
|
||||
|
||||
def getExternalreviews(self):
|
||||
return getUrlUnicode(self.externalreviewsUrl)
|
||||
|
||||
def parseExternalreviews(self):
|
||||
soup = BeautifulSoup(self.getExternalreviews())
|
||||
ol = soup('ol')
|
||||
if ol:
|
||||
ol = ol[0]
|
||||
ret = {}
|
||||
for li in ol('li'):
|
||||
try:
|
||||
a = li('a')[0]
|
||||
href = a.get('href')
|
||||
txt = a.contents[0]
|
||||
ret[href] = txt
|
||||
except:
|
||||
pass
|
||||
return ret
|
||||
return {}
|
||||
|
||||
def guess(title, director=''):
|
||||
#FIXME: proper file -> title
|
||||
title = title.split('-')[0]
|
||||
|
|
Loading…
Reference in a new issue