From 7bb7cf8beb3608361b57d12dfc3c03cbbbb56949 Mon Sep 17 00:00:00 2001
From: j <0x006A@0x2620.org>
Date: Mon, 3 Sep 2007 16:56:10 +0000
Subject: [PATCH] parse external reviews

---
Review note: line breaks and indentation were restored from a
whitespace-collapsed copy of this patch, so the exact whitespace of
context and blank lines should be confirmed against the target file.
The external-review parser added below was hardened against list items
without a usable link; the blob ids on the "index" line predate that edit.

 scrapeit/imdb.py | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/scrapeit/imdb.py b/scrapeit/imdb.py
index 9a5a7b0..9f700a4 100644
--- a/scrapeit/imdb.py
+++ b/scrapeit/imdb.py
@@ -107,7 +107,9 @@ class IMDb:
         self.triviaUrl = "%strivia" % self.pageUrl
         self.locationSource = None
         self.locationUrl = "%slocations" % self.pageUrl
-        
+        self.externalreviewsSource = None
+        self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
+
     def getPage(self, forcereload = False):
         if forcereload or not self.pageSource:
             self.pageSource = read_url_utf8(self.pageUrl)
@@ -269,6 +271,7 @@ class IMDb:
         IMDbDict['locations'] = self.parseLocations()
         IMDbDict['release_date'] = self.parseReleaseinfo()
         IMDbDict['business'] = self.parseBusiness()
+        IMDbDict['reviews'] = self.parseExternalreviews()
         self.IMDbDict = IMDbDict
 
         if IMDbDict['episode_of']:
@@ -437,7 +440,7 @@ class IMDb:
         if forcereload or not self.businessSource:
             self.businessSource = read_url_utf8(self.businessUrl)
         return self.businessSource
-        
+
     def parseBusiness(self):
         soup = BeautifulSoup(self.getBusiness())
         business = {'budget': 0, 'gross': 0, 'profit': 0}
@@ -457,7 +460,38 @@ class IMDb:
         if business['budget'] and business['gross']:
             business['profit'] = business['gross'] - business['budget']
         return business
+
+    def getExternalreviews(self, forcereload = False):
+        """Return the external reviews page source, fetching it if needed.
+
+        The page is read from self.externalreviewsUrl on first use, or again
+        when forcereload is true, and cached in self.externalreviewsSource.
+        """
+        if forcereload or not self.externalreviewsSource:
+            self.externalreviewsSource = read_url_utf8(self.externalreviewsUrl)
+        return self.externalreviewsSource
 
+    def parseExternalreviews(self):
+        """Return a dict mapping review link href to the link's first child.
+
+        Only the first <ol> of the external reviews page is scanned; an
+        empty dict is returned when the page has no <ol>.
+        """
+        soup = BeautifulSoup(self.getExternalreviews())
+        lists = soup('ol')
+        if not lists:
+            return {}
+        reviews = {}
+        for li in lists[0]('li'):
+            links = li('a')
+            # Skip items without a usable link instead of raising IndexError.
+            if not links or not links[0].contents:
+                continue
+            href = links[0].get('href')
+            if href:
+                reviews[href] = links[0].contents[0]
+        return reviews
+
 def guess(title, director=''):
     #FIXME: proper file -> title
     title = title.split('-')[0]