From 8886cfe8d31ec360a096ecb477ac8d8ba143027e Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Tue, 17 Jun 2008 13:07:53 +0200 Subject: [PATCH] reduce parsing time drastically thanks to updated encoding detection and removing BeautifulSoup --- ox/imdb.py | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/ox/imdb.py b/ox/imdb.py index 18d9c19..2c1f325 100644 --- a/ox/imdb.py +++ b/ox/imdb.py @@ -18,6 +18,18 @@ from oxutils.normalize import normalizeTitle, normalizeImdbId import google +_timer = -1 +_timer_last = -1 +def debugTime(message=''): + global _timer, _timer_last + if _timer == -1: + _timer = time.time() + if _timer_last == -1: + _timer_last = time.time() + now = time.time() + print message," since last: %0.2f total time: %0.2f" % (now-_timer_last, now-_timer) + _timer_last = now + def getMovieId(title, director='', year=''): ''' >>> getMovieId('The Matrix') @@ -46,7 +58,6 @@ def getRawMovieData(imdbId): data = getMovieInfo(imdbId) data['credits'] = getMovieCredits(imdbId) data['poster'] = getMoviePoster(imdbId) - data['connections'] = getMovieConnections(imdbId) data['company credits'] = getMovieCompanyCredits(imdbId) data['filming locations'] = getMovieLocations(imdbId) data['movie connections'] = getMovieConnections(imdbId) @@ -272,28 +283,20 @@ def getMovieTrivia(imdbId): def getMovieConnections(imdbId): url = "%s/movieconnections" % getUrlBase(imdbId) - data = getUrlUnicode(url) - soup = BeautifulSoup(data) - connections = {} - content = soup('div', {'id': 'tn15content'})[0] - blocks = unicode(content).split('
')[1:] - for c in blocks: - connection = c.split('
')[0] - cs = BeautifulSoup(c) - if connection: - #relation -> list of imdb ids - connections[connection] = [findRe(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})] + data = getUrl(url) + connections={} + for c in re.compile('''
(.*?)
(.*?)\n\n''', re.DOTALL).findall(data): + connections[unicode(c[0])] = re.compile('''''').findall(c[1]) return connections def getMovieKeywords(imdbId): url = "%s/keywords" % getUrlBase(imdbId) data = getUrlUnicode(url) - soup = BeautifulSoup(data) keywords = [] - for key in soup('a', {'href': re.compile('^/keyword/')}): - k = decodeHtml(key.string) - k = k.replace(u'\xa0', ' ') - keywords.append(k) + for keyword in re.compile('''(.*?)''').findall(data): + keyword = decodeHtml(keyword) + keyword = keyword.replace(u'\xa0', ' ') + keywords.append(keyword) return keywords def getMovieExternalReviews(imdbId): @@ -591,7 +594,6 @@ class IMDb: IMDbDict['credits'] = self.getCredits() IMDbDict['plot'] = getMoviePlot(self.imdb) IMDbDict['keywords'] = getMovieKeywords(self.imdb) - IMDbDict['trivia'] = getMovieTrivia(self.imdb) IMDbDict['connections'] = getMovieConnections(self.imdb) IMDbDict['locations'] = getMovieLocations(self.imdb)