parse external reviews

This commit is contained in:
j 2007-09-03 16:56:10 +00:00
parent afe518fc7d
commit 7bb7cf8beb

View file

@ -107,7 +107,9 @@ class IMDb:
self.triviaUrl = "%strivia" % self.pageUrl
self.locationSource = None
self.locationUrl = "%slocations" % self.pageUrl
self.externalreviewsSource = None
self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
def getPage(self, forcereload = False):
if forcereload or not self.pageSource:
self.pageSource = read_url_utf8(self.pageUrl)
@ -269,6 +271,7 @@ class IMDb:
IMDbDict['locations'] = self.parseLocations()
IMDbDict['release_date'] = self.parseReleaseinfo()
IMDbDict['business'] = self.parseBusiness()
IMDbDict['reviews'] = self.parseExternalreviews()
self.IMDbDict = IMDbDict
if IMDbDict['episode_of']:
@ -437,7 +440,7 @@ class IMDb:
if forcereload or not self.businessSource:
self.businessSource = read_url_utf8(self.businessUrl)
return self.businessSource
def parseBusiness(self):
soup = BeautifulSoup(self.getBusiness())
business = {'budget': 0, 'gross': 0, 'profit': 0}
@ -457,7 +460,23 @@ class IMDb:
if business['budget'] and business['gross']:
business['profit'] = business['gross'] - business['budget']
return business
def getExternalreviews(self, forcereload = False):
    """Return the source of the external-reviews page, downloading it on demand.

    The page at ``self.externalreviewsUrl`` is fetched through
    ``read_url_utf8`` only when ``self.externalreviewsSource`` holds nothing
    truthy yet or when ``forcereload`` is true; the fetched text is stored
    back on ``self.externalreviewsSource`` before being returned.
    """
    cached = self.externalreviewsSource
    # Serve the stored copy unless the caller explicitly asked for a refresh.
    if cached and not forcereload:
        return cached
    self.externalreviewsSource = read_url_utf8(self.externalreviewsUrl)
    return self.externalreviewsSource
def parseExternalreviews(self):
    """Parse the external-reviews page into a mapping of link URL to link content.

    Only the first ``<ol>`` element of the page returned by
    ``self.getExternalreviews()`` is examined. For each ``<li>`` inside it,
    the first ``<a>`` supplies one entry: its ``href`` attribute is the key
    and its first child node is the value.

    Returns:
        dict: ``{href: first child of the anchor}``. Empty when the page
        contains no ``<ol>``. List items without an anchor, or whose anchor
        has no ``href``, are skipped; an anchor without children maps to
        an empty string.
    """
    soup = BeautifulSoup(self.getExternalreviews())
    ordered_lists = soup('ol')
    if not ordered_lists:
        return {}
    reviews = {}
    for item in ordered_lists[0]('li'):
        # Look the anchor up once; find() yields None instead of raising
        # IndexError when the list item carries no link at all.
        anchor = item.find('a')
        if anchor is None:
            continue
        href = anchor.get('href')
        if href is None:
            continue
        # An empty <a></a> has no child to use as title; keep the URL anyway.
        reviews[href] = anchor.contents[0] if anchor.contents else ''
    return reviews
def guess(title, director=''):
#FIXME: proper file -> title
title = title.split('-')[0]