diff --git a/ox/dailymotion.py b/ox/dailymotion.py
new file mode 100644
index 0000000..cb5b576
--- /dev/null
+++ b/ox/dailymotion.py
@@ -0,0 +1,15 @@
+import re
+from urllib import unquote
+from oxutils.cache import getUrl
+
+def getVideoUrl(url):
+    data = getUrl(url)
+    video = re.compile('''video", "(.*?)"''').findall(data)
+    for v in video:
+        v = unquote(v).split('@@')[0]
+        return "http://www.dailymotion.com" + v
+    return ''
+
+if __name__ == '__main__':
+    print getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms')
+    print getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms')
diff --git a/ox/imdb.py b/ox/imdb.py
index 8c8332d..d31afde 100644
--- a/ox/imdb.py
+++ b/ox/imdb.py
@@ -38,14 +38,22 @@ def getUrlBase(imdbId):
 
 def getRawMovieData(imdbId):
     imdbId = normalizeImdbId(imdbId)
-    data = dict()
-    data['title'] = getTitle(imdbId)
-    data['credits'] = getCredits(imdbId)
-    data['poster'] = getPoster(imdbId)
-    data['trailers'] = getMovieTrailers(imdbId)
-    data['companyCredits'] = getMovieCompanyCredits(imdbId)
+    data = getMovieInfo(imdbId)
+    data['credits'] = getMovieCredits(imdbId)
+    data['poster'] = getMoviePoster(imdbId)
+    data['connections'] = getMovieConnections(imdbId)
+    data['company credits'] = getMovieCompanyCredits(imdbId)
+    data['filming locations'] = getMovieLocations(imdbId)
+    data['movie connections'] = getMovieConnections(imdbId)
+    data['external reviews'] = getMovieExternalReviews(imdbId)
+    data['trivia'] = getMovieTrivia(imdbId)
+    data['keywords'] = getMovieKeywords(imdbId)
+    data['media'] = {}
+    data['media']['images'] = getMovieImages(imdbId)
+    data['media']['trailers'] = getMovieTrailers(imdbId)
+
     return data
 
-def parseBase(imdbId):
+def getMovieInfo(imdbId):
     data = getUrl(getUrlBase(imdbId))
     soup = BeautifulSoup(data)
     info = dict()
@@ -76,6 +84,7 @@
 
     #get Title
     title = ''
+    year = ''
     html_title = soup('div', {'id': 'tn15title'})
     if not html_title:
         html_title = soup('title')
@@ -84,8 +93,11 @@
         html_title = html_title.replace('<br />', ' ').replace('&nbsp;', ' ')
         title = htmldecode(html_title)
         title = stripTags(title)
-        title = re.sub('\(\d\d\d\d\)', '', title)
-        title = re.sub('\(\d\d\d\d/I*\)', '', title)
+        year = findRegexp(title, '\((\d{4})\)')
+        if not year:
+            year = findRegexp(title, '\((\d{4})')
+        title = re.sub('\(\d{4}\)', '', title)
+        title = re.sub('\(\d{4}/I*\)', '', title)
         for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
             title = title.replace(t, '')
         title = title.strip()
@@ -94,14 +106,34 @@
     if title.startswith('"') and title.endswith('"'):
         title = title[1:-1]
     info['title'] = title
+    info['year'] = year
+    '''
+    #Rating
+    rating = findRegexp(data, '<b>(.*?)/10</b>')
+    if rating:
+        info['rating'] = int(float(rating) * 1000)
+    else:
+        info['rating'] = -1
+
+    #Votes
+    votes = findRegexp(data, '\((.*?) votes\)')
+    if votes:
+        info['votes'] = int(votes.replace(',', ''))
+    else:
+        info['votes'] = -1
+    '''
     return info
 
-def getPoster(imdbId):
-    info = parseBase(imdbId)
+def getMoviePoster(imdbId):
+    info = getMovieInfo(imdbId)
     return info['poster']
 
-def getTitle(imdbId):
-    info = parseBase(imdbId)
+def getMovieYear(imdbId):
+    info = getMovieInfo(imdbId)
+    return info['year']
+
+def getMovieTitle(imdbId):
+    info = getMovieInfo(imdbId)
     return info['title']
 
 def creditList(data, section=None):
@@ -118,7 +150,7 @@
         credits.append(c)
     return credits
 
-def getCredits(imdbId):
+def getMovieCredits(imdbId):
     credits = dict()
     url = "%s/fullcredits" % getUrlBase(imdbId)
     data = getUrlUnicode(url)
@@ -216,6 +248,52 @@
             trivia.append(t)
     return trivia
 
+def getMovieConnections(imdbId):
+    url = "%s/movieconnections" % getUrlBase(imdbId)
+    data = getUrlUnicode(url)
+    soup = BeautifulSoup(data)
+    connections = {}
+    content = soup('div', {'id': 'tn15content'})[0]
+    blocks = str(content).split('<h5>')[1:]
+    for c in blocks:
+        connection = c.split('</h5>')[0]
+        cs = BeautifulSoup(c)
+        if connection:
+            #relation -> list of imdb ids
+            connections[connection] = [findRegexp(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})]
+    return connections
+
+def getMovieKeywords(imdbId):
+    url = "%s/keywords" % getUrlBase(imdbId)
+    data = getUrlUnicode(url)
+    soup = BeautifulSoup(data)
+    keywords = []
+    for key in soup('a', {'href': re.compile('^/keyword/')}):
+        k = htmldecode(key.string)
+        k = k.replace(u'\xa0', ' ')
+        keywords.append(k)
+    return keywords
+
+
+def getMovieExternalReviews(imdbId):
+    url = "%s/externalreviews" % getUrlBase(imdbId)
+    data = getUrlUnicode(url)
+    soup = BeautifulSoup(data)
+    ol = soup('ol')
+    if ol:
+        ol = ol[0]
+        ret = {}
+        for li in ol('li'):
+            try:
+                a = li('a')[0]
+                href = a.get('href')
+                txt = a.contents[0]
+                ret[href] = txt
+            except:
+                pass
+        return ret
+    return {}
+
 '''the old code below'''
 
 class IMDb:
@@ -224,14 +302,10 @@
         self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
 
         self.businessUrl = "%sbusiness" % self.pageUrl
-        self.connectionsUrl = "%smovieconnections" % self.pageUrl
         self.creditsUrl = "%sfullcredits" % self.pageUrl
         self.episodesUrl = "%sepisodes" % self.pageUrl
-        self.keywordUrl = "%skeywords" % self.pageUrl
         self.plotUrl = "%splotsummary" % self.pageUrl
         self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
-        self.locationUrl = "%slocations" % self.pageUrl
-        self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
 
     def getPage(self):
         return getUrlUnicode(self.pageUrl)
@@ -293,7 +367,7 @@
         return parsed_value
 
     def parseTitle(self):
-        title = getTitle(self.imdb)
+        title = getMovieTitle(self.imdb)
         title = normalizeTitle(title)
         if title.startswith('"') and title.find('"',1) > 0 and \
           title.find('"',1) == title.rfind('"'):
@@ -328,7 +402,7 @@
         data = self.getPage()
         IMDbDict ={}
         #Poster
-        IMDbDict['poster'] = getPoster(self.imdb)
+        IMDbDict['poster'] = getMoviePoster(self.imdb)
        if not IMDbDict['poster']:
            IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
        #Title, Year
@@ -373,16 +447,16 @@
         IMDbDict['tvshow'] = False
         IMDbDict['credits'] = self.getCredits()
         IMDbDict['plot'] = self.parsePlot()
-        IMDbDict['keywords'] = self.parseKeywords()
+        IMDbDict['keywords'] = getMovieKeywords(self.imdb)
         IMDbDict['trivia'] = getMovieTrivia(self.imdb)
-        IMDbDict['connections'] = self.parseConnections()
-        IMDbDict['locations'] = self.parseLocations()
+        IMDbDict['connections'] = getMovieConnections(self.imdb)
+        IMDbDict['locations'] = getMovieLocations(self.imdb)
         IMDbDict['release_date'] = self.parseReleaseinfo()
         IMDbDict['business'] = self.parseBusiness()
-        IMDbDict['reviews'] = self.parseExternalreviews()
+        IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
         IMDbDict['stills'] = getMovieStills(self.imdb)
-
-        #IMDbDict['trailer'] = self.parseTrailer()
+        #IMDbDict['trailer'] = getMovieTrailer(self.imdb)
+
         self.IMDbDict = IMDbDict
 
         if IMDbDict['episode_of']:
@@ -452,42 +526,6 @@
         self.episodes = episodes
         return self.episodes
-
-    def parseLocations(self):
-        data = getUrlUnicode(self.locationUrl)
-        soup = BeautifulSoup(data)
-        locations = []
-        for key in soup('a', {'href': re.compile('^/List')}):
-            locations.append(htmldecode(key.string))
-        self.locations = locations
-        return self.locations
-
-    def parseKeywords(self):
-        data = getUrlUnicode(self.keywordUrl)
-        soup = BeautifulSoup(data)
-        keywords = []
-        for key in soup('a', {'href': re.compile('^/keyword/')}):
-            k = htmldecode(key.string)
-            k = k.replace(u'\xa0', ' ')
-            keywords.append(k)
-        self.keywords = keywords
-        return self.keywords
-
-    def getConnections(self):
-        return getUrlUnicode(self.connectionsUrl)
-
-    def parseConnections(self):
-        connections = {}
-        soup = BeautifulSoup(self.getConnections())
-        content = soup('div', {'id': 'tn15content'})[0]
-        blocks = str(content).split('<h5>')[1:]
-        for c in blocks:
-            connection = c.split('</h5>')[0]
-            cs = BeautifulSoup(c)
-            if connection:
-                #relation -> list of imdb ids
-                connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})]
-        return connections
 
     def getReleaseinfo(self):
         return getUrlUnicode(self.releaseinfoUrl)
 
@@ -530,26 +568,6 @@
             business['profit'] = business['gross'] - business['budget']
         return business
-
-    def getExternalreviews(self):
-        return getUrlUnicode(self.externalreviewsUrl)
-
-    def parseExternalreviews(self):
-        soup = BeautifulSoup(self.getExternalreviews())
-        ol = soup('ol')
-        if ol:
-            ol = ol[0]
-            ret = {}
-            for li in ol('li'):
-                try:
-                    a = li('a')[0]
-                    href = a.get('href')
-                    txt = a.contents[0]
-                    ret[href] = txt
-                except:
-                    pass
-            return ret
-        return {}
 
 def guess(title, director=''):
     #FIXME: proper file -> title
     title = title.split('-')[0]