# Methods of class IMDb (scrapeit/imdb.py): trailers page fetch and link extraction.
def getTrailers(self, forcereload = False):
    """Return the source of this title's trailers page.

    The page at self.trailerUrl is fetched with read_url_utf8 and kept in
    self.trailerSource; later calls return the stored copy.

    forcereload -- when true, fetch the page again even if a copy is stored.
    """
    if forcereload or not self.trailerSource:
        self.trailerSource = read_url_utf8(self.trailerUrl)
    return self.trailerSource


def parseTrailers(self):
    """Collect external trailer links from the trailers page.

    For every <p> that contains text and at least one <a>, the first
    anchor is inspected; it is kept when its href starts with 'http'.

    Returns a dict mapping href -> link title with 'www.' removed.
    """
    ret = {}
    soup = BeautifulSoup(self.getTrailers())
    for p in soup('p'):
        links = p('a')
        if not links or not p.firstText():
            continue
        a = links[0]
        # Named anchors (<a name="...">) carry no href; .get avoids a KeyError.
        href = a.get('href')
        if not href or not href.startswith('http'):
            continue
        # a.string is None when the anchor wraps nested markup instead of a
        # single text node; fall back to the URL so the link is not lost.
        title = a.string or href
        ret[href] = title.replace('www.', '')
    return ret
def trailerByTitle(title):
    """Look up the Yahoo! Movies trailer URL for a movie title.

    Searches movies.yahoo.com for *title* and takes the first result that
    links to a movie page. If that link's text contains the title, the
    trailer URL is derived by replacing '/info' with '/video' in the
    result's href, and it is returned only when the movie's info page
    itself mentions that URL.

    title -- movie title; surrounding whitespace is ignored.

    Returns the trailer URL, or '' when the title is blank or no trailer
    could be confirmed.
    """
    title = (title or '').strip()
    if not title:
        # '' is a substring of every string, so a blank title would
        # otherwise "match" whatever the first search hit happens to be.
        return ''

    # Imported lazily: only needed once a real search is performed.
    from BeautifulSoup import BeautifulSoup
    from utils import read_url

    # Python 2's urllib.quote raises KeyError on non-ASCII unicode input,
    # so unicode titles are encoded to UTF-8 bytes before quoting.
    query = title.encode('utf-8') if isinstance(title, type(u'')) else title
    url = "http://movies.yahoo.com/mv/search?p=%s" % quote(query)
    soup = BeautifulSoup(read_url(url))
    movies = soup('a', {'href': re.compile(r'http://movies\.yahoo\.com/movie')})
    if not movies:
        return ''

    text = movies[0].firstText()
    if not text or title not in text:
        return ''

    info = movies[0]['href']
    trailer = info.replace('/info', '/video')
    if trailer in read_url(info):
        return trailer
    return ''