diff --git a/ox/dailymotion.py b/ox/dailymotion.py
new file mode 100644
index 0000000..cb5b576
--- /dev/null
+++ b/ox/dailymotion.py
@@ -0,0 +1,15 @@
+import re
+from urllib import unquote
+from oxutils.cache import getUrl
+
+def getVideoUrl(url):
+ data = getUrl(url)
+ video = re.compile('''video", "(.*?)"''').findall(data)
+ for v in video:
+ v = unquote(v).split('@@')[0]
+ return "http://www.dailymotion.com" + v
+ return ''
+
+if __name__ == '__main__':
+ print getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms')
+ print getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms')
diff --git a/ox/imdb.py b/ox/imdb.py
index 8c8332d..d31afde 100644
--- a/ox/imdb.py
+++ b/ox/imdb.py
@@ -38,14 +38,22 @@ def getUrlBase(imdbId):
def getRawMovieData(imdbId):
imdbId = normalizeImdbId(imdbId)
- data = dict()
- data['title'] = getTitle(imdbId)
- data['credits'] = getCredits(imdbId)
- data['poster'] = getPoster(imdbId)
- data['trailers'] = getMovieTrailers(imdbId)
- data['companyCredits'] = getMovieCompanyCredits(imdbId)
+ data = getMovieInfo(imdbId)
+ data['credits'] = getMovieCredits(imdbId)
+ data['poster'] = getMoviePoster(imdbId)
+ data['connections'] = getMovieConnections(imdbId)
+ data['company credits'] = getMovieCompanyCredits(imdbId)
+ data['filming locations'] = getMovieLocations(imdbId)
+ data['movie connections'] = getMovieConnections(imdbId)
+ data['external reviews'] = getMovieExternalReviews(imdbId)
+ data['trivia'] = getMovieTrivia(imdbId)
+ data['keywords'] = getMovieKeywords(imdbId)
+ data['media'] = {}
+ data['media']['images'] = getMovieImages(imdbId)
+ data['media']['trailers'] = getMovieTrailers(imdbId)
+ return data
-def parseBase(imdbId):
+def getMovieInfo(imdbId):
data = getUrl(getUrlBase(imdbId))
soup = BeautifulSoup(data)
info = dict()
@@ -76,6 +84,7 @@ def parseBase(imdbId):
#get Title
title = ''
+ year = ''
html_title = soup('div', {'id': 'tn15title'})
if not html_title:
html_title = soup('title')
@@ -84,8 +93,11 @@ def parseBase(imdbId):
+ html_title = html_title.replace('<br />', ' ').replace('  ', ' ')
title = htmldecode(html_title)
title = stripTags(title)
- title = re.sub('\(\d\d\d\d\)', '', title)
- title = re.sub('\(\d\d\d\d/I*\)', '', title)
+ year = findRegexp(title, '\((\d{4})\)')
+ if not year:
+ year = findRegexp(title, '\((\d{4})')
+ title = re.sub('\(\d{4}\)', '', title)
+ title = re.sub('\(\d{4}/I*\)', '', title)
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
title = title.replace(t, '')
title = title.strip()
@@ -94,14 +106,34 @@ def parseBase(imdbId):
if title.startswith('"') and title.endswith('"'):
title = title[1:-1]
info['title'] = title
+ info['year'] = year
+ '''
+ #Rating
+ rating = findRegexp(data, '<b>(.*?)</b>/10')
+ if rating:
+ info['rating'] = int(float(rating) * 1000)
+ else:
+ info['rating'] = -1
+
+ #Votes
+ votes = findRegexp(data, '\((.*?) votes\)')
+ if votes:
+ info['votes'] = int(votes.replace(',', ''))
+ else:
+ info['votes'] = -1
+ '''
return info
-def getPoster(imdbId):
- info = parseBase(imdbId)
+def getMoviePoster(imdbId):
+ info = getMovieInfo(imdbId)
return info['poster']
-def getTitle(imdbId):
- info = parseBase(imdbId)
+def getMovieYear(imdbId):
+ info = getMovieInfo(imdbId)
+ return info['year']
+
+def getMovieTitle(imdbId):
+ info = getMovieInfo(imdbId)
return info['title']
def creditList(data, section=None):
@@ -118,7 +150,7 @@ def creditList(data, section=None):
credits.append(c)
return credits
-def getCredits(imdbId):
+def getMovieCredits(imdbId):
credits = dict()
url = "%s/fullcredits" % getUrlBase(imdbId)
data = getUrlUnicode(url)
@@ -216,6 +248,52 @@ def getMovieTrivia(imdbId):
trivia.append(t)
return trivia
+def getMovieConnections(imdbId):
+ url = "%s/movieconnections" % getUrlBase(imdbId)
+ data = getUrlUnicode(url)
+ soup = BeautifulSoup(data)
+ connections = {}
+ content = soup('div', {'id': 'tn15content'})[0]
+ blocks = str(content).split('<h5>')[1:]
+ for c in blocks:
+ connection = c.split('</h5>')[0]
+ cs = BeautifulSoup(c)
+ if connection:
+ #relation -> list of imdb ids
+ connections[connection] = [findRegexp(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})]
+ return connections
+
+def getMovieKeywords(imdbId):
+ url = "%s/keywords" % getUrlBase(imdbId)
+ data = getUrlUnicode(url)
+ soup = BeautifulSoup(data)
+ keywords = []
+ for key in soup('a', {'href': re.compile('^/keyword/')}):
+ k = htmldecode(key.string)
+ k = k.replace(u'\xa0', ' ')
+ keywords.append(k)
+ return keywords
+
+
+def getMovieExternalReviews(imdbId):
+ url = "%s/externalreviews" % getUrlBase(imdbId)
+ data = getUrlUnicode(url)
+ soup = BeautifulSoup(data)
+ ol = soup('ol')
+ if ol:
+ ol = ol[0]
+ ret = {}
+ for li in ol('li'):
+ try:
+ a = li('a')[0]
+ href = a.get('href')
+ txt = a.contents[0]
+ ret[href] = txt
+ except:
+ pass
+ return ret
+ return {}
+
'''the old code below'''
class IMDb:
@@ -224,14 +302,10 @@ class IMDb:
self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
self.businessUrl = "%sbusiness" % self.pageUrl
- self.connectionsUrl = "%smovieconnections" % self.pageUrl
self.creditsUrl = "%sfullcredits" % self.pageUrl
self.episodesUrl = "%sepisodes" % self.pageUrl
- self.keywordUrl = "%skeywords" % self.pageUrl
self.plotUrl = "%splotsummary" % self.pageUrl
self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
- self.locationUrl = "%slocations" % self.pageUrl
- self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
def getPage(self):
return getUrlUnicode(self.pageUrl)
@@ -293,7 +367,7 @@ class IMDb:
return parsed_value
def parseTitle(self):
- title = getTitle(self.imdb)
+ title = getMovieTitle(self.imdb)
title = normalizeTitle(title)
if title.startswith('"') and title.find('"',1) > 0 and \
title.find('"',1) == title.rfind('"'):
@@ -328,7 +402,7 @@ class IMDb:
data = self.getPage()
IMDbDict ={}
#Poster
- IMDbDict['poster'] = getPoster(self.imdb)
+ IMDbDict['poster'] = getMoviePoster(self.imdb)
if not IMDbDict['poster']:
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
#Title, Year
@@ -373,16 +447,16 @@ class IMDb:
IMDbDict['tvshow'] = False
IMDbDict['credits'] = self.getCredits()
IMDbDict['plot'] = self.parsePlot()
- IMDbDict['keywords'] = self.parseKeywords()
+ IMDbDict['keywords'] = getMovieKeywords(self.imdb)
IMDbDict['trivia'] = getMovieTrivia(self.imdb)
- IMDbDict['connections'] = self.parseConnections()
- IMDbDict['locations'] = self.parseLocations()
+ IMDbDict['connections'] = getMovieConnections(self.imdb)
+ IMDbDict['locations'] = getMovieLocations(self.imdb)
IMDbDict['release_date'] = self.parseReleaseinfo()
IMDbDict['business'] = self.parseBusiness()
- IMDbDict['reviews'] = self.parseExternalreviews()
+ IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
IMDbDict['stills'] = getMovieStills(self.imdb)
- #IMDbDict['trailer'] = self.parseTrailer()
+ #IMDbDict['trailer'] = getMovieTrailer(self.imdb)
self.IMDbDict = IMDbDict
if IMDbDict['episode_of']:
@@ -452,42 +526,6 @@ class IMDb:
self.episodes = episodes
return self.episodes
- def parseLocations(self):
- data = getUrlUnicode(self.locationUrl)
- soup = BeautifulSoup(data)
- locations = []
- for key in soup('a', {'href': re.compile('^/List')}):
- locations.append(htmldecode(key.string))
- self.locations = locations
- return self.locations
-
- def parseKeywords(self):
- data = getUrlUnicode(self.keywordUrl)
- soup = BeautifulSoup(data)
- keywords = []
- for key in soup('a', {'href': re.compile('^/keyword/')}):
- k = htmldecode(key.string)
- k = k.replace(u'\xa0', ' ')
- keywords.append(k)
- self.keywords = keywords
- return self.keywords
-
- def getConnections(self):
- return getUrlUnicode(self.connectionsUrl)
-
- def parseConnections(self):
- connections = {}
- soup = BeautifulSoup(self.getConnections())
- content = soup('div', {'id': 'tn15content'})[0]
- blocks = str(content).split('<h5>')[1:]
- for c in blocks:
- connection = c.split('</h5>')[0]
- cs = BeautifulSoup(c)
- if connection:
- #relation -> list of imdb ids
- connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})]
- return connections
-
def getReleaseinfo(self):
return getUrlUnicode(self.releaseinfoUrl)
@@ -530,26 +568,6 @@ class IMDb:
business['profit'] = business['gross'] - business['budget']
return business
- def getExternalreviews(self):
- return getUrlUnicode(self.externalreviewsUrl)
-
- def parseExternalreviews(self):
- soup = BeautifulSoup(self.getExternalreviews())
- ol = soup('ol')
- if ol:
- ol = ol[0]
- ret = {}
- for li in ol('li'):
- try:
- a = li('a')[0]
- href = a.get('href')
- txt = a.contents[0]
- ret[href] = txt
- except:
- pass
- return ret
- return {}
-
def guess(title, director=''):
#FIXME: proper file -> title
title = title.split('-')[0]