From c10645f1142466908294c2e41fbc505b04879517 Mon Sep 17 00:00:00 2001
From: j <0x006A@0x2620.org>
Date: Tue, 17 Jul 2007 10:37:14 +0000
Subject: [PATCH] parse locations

---
 scrapeit/imdb.py | 46 +++++++++++++++++++++-------------------------
 1 file changed, 21 insertions(+), 25 deletions(-)

diff --git a/scrapeit/imdb.py b/scrapeit/imdb.py
index 0dc0c31..0a1aaa4 100644
--- a/scrapeit/imdb.py
+++ b/scrapeit/imdb.py
@@ -19,31 +19,11 @@ import chardet
 cache_base = "/var/cache/scrapeit/cache/"
 
 def read_url_utf8(url):
-    path = os.path.join(cache_base, url.replace('http://',''))
-    if path.endswith('/'):
-        path = "%sindex.html" % path
-    if os.path.isdir(path):
-        path = "%s/index.html" % path
-    if os.path.exists(path):
-        f = open(path)
-        data = f.read()
-        encoding = chardet.detect(data)['encoding']
-        if not encoding: encoding = 'latin-1'
-        f.close()
-        data = unicode(data, encoding)
-        return data
-    else:
-        data = utils.read_url(url)
-        folder = os.path.dirname(path)
-        if not os.path.exists(folder):
-            os.makedirs(folder)
-        f = open(path, 'w')
-        f.write(data)
-        f.close()
-        encoding = chardet.detect(data)['encoding']
-        if not encoding: encoding = 'latin-1'
-        data = unicode(data, encoding)
-        return data
+    data = read_url(url)
+    encoding = chardet.detect(data)['encoding']
+    if not encoding: encoding = 'latin-1'
+    data = unicode(data, encoding)
+    return data
 
 def read_url(url):
     path = os.path.join(cache_base, url.replace('http://',''))
@@ -124,6 +104,8 @@ class IMDb:
         self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
         self.triviaSource = None
         self.triviaUrl = "%strivia" % self.pageUrl
+        self.locationSource = None
+        self.locationUrl = "%slocations" % self.pageUrl
 
     def getPage(self, forcereload = False):
         if forcereload or not self.pageSource:
@@ -264,6 +246,7 @@ class IMDb:
         IMDbDict['trivia'] = self.parseTrivia()
         IMDbDict['connections'] = self.parseConnections()
+        IMDbDict['locations'] = self.parseLocations()
         IMDbDict['release_date'] = self.parseReleaseinfo()
         IMDbDict['business'] = self.parseBusiness()
 
         self.IMDbDict = IMDbDict
@@ -341,6 +324,19 @@ class IMDb:
         self.episodes = episodes
         return self.episodes
 
+    def getLocations(self, forcereload = False):
+        if forcereload or not self.locationSource:
+            self.keywordSource = read_url_utf8(self.locationUrl)
+        return self.keywordSource
+
+    def parseLocations(self):
+        soup = BeautifulSoup(self.getLocations())
+        locations = []
+        for key in soup('a', {'href': re.compile('^/List')}):
+            locations.append(htmldecode(key.string))
+        self.locations = locations
+        return self.locations
+
     def getKeywords(self, forcereload = False):
         if forcereload or not self.keywordSource:
             self.keywordSource = read_url_utf8(self.keywordUrl)