From 8886cfe8d31ec360a096ecb477ac8d8ba143027e Mon Sep 17 00:00:00 2001
From: j <0x006A@0x2620.org>
Date: Tue, 17 Jun 2008 13:07:53 +0200
Subject: [PATCH] reduce parsing time drastically thanks to updated encoding
 detection and removing BeautifulSoup

---
 ox/imdb.py | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)
diff --git a/ox/imdb.py b/ox/imdb.py
index 18d9c19..2c1f325 100644
--- a/ox/imdb.py
+++ b/ox/imdb.py
@@ -18,6 +18,18 @@ from oxutils.normalize import normalizeTitle, normalizeImdbId
 
 import google
 
+_timer = _timer_last = -1
+def debugTime(message=''):
+  '''Print message followed by the seconds elapsed since the previous call and since the first call.'''
+  global _timer, _timer_last
+  if _timer == -1:
+    _timer = time.time()  # first call: start the total-time clock
+  if _timer_last == -1:
+    _timer_last = time.time()
+  current = time.time()
+  print message," since last: %0.2f total time: %0.2f" % (current-_timer_last, current-_timer)
+  _timer_last = current
+
 def getMovieId(title, director='', year=''):
   '''
   >>> getMovieId('The Matrix')
@@ -46,7 +58,6 @@ def getRawMovieData(imdbId):
   data = getMovieInfo(imdbId)
   data['credits'] = getMovieCredits(imdbId)
   data['poster'] = getMoviePoster(imdbId)
-  data['connections'] = getMovieConnections(imdbId)
   data['company credits'] = getMovieCompanyCredits(imdbId)
   data['filming locations'] = getMovieLocations(imdbId)
   data['movie connections'] = getMovieConnections(imdbId)
@@ -272,28 +283,20 @@ def getMovieTrivia(imdbId):
 
 def getMovieConnections(imdbId):
+  '''Return a dict mapping each relation heading of the movieconnections page to a list of imdb ids.'''
   url = "%s/movieconnections" % getUrlBase(imdbId)
-  data = getUrlUnicode(url)
-  soup = BeautifulSoup(data)
-  connections = {}
-  content = soup('div', {'id': 'tn15content'})[0]
-  blocks = unicode(content).split('<h5>')[1:]
-  for c in blocks:
-    connection = c.split('</h5>')[0]
-    cs = BeautifulSoup(c)
-    if connection:
-      #relation -> list of imdb ids
-      connections[connection] = [findRe(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})]
+  findIds = re.compile(r'<a href="/title/tt(\d{7})/">').findall  # compiled once instead of once per block
+  blocks = re.compile(r'<h5>(.*?)</h5>(.*?)\n\n', re.DOTALL).findall(getUrl(url))  # (heading, html up to the next blank line)
+  connections = dict((unicode(heading), findIds(html)) for heading, html in blocks)
   return connections
 
 def getMovieKeywords(imdbId):
+  '''Return the list of keyword strings linked from the movie's keywords page.'''
   url = "%s/keywords" % getUrlBase(imdbId)
   data = getUrlUnicode(url)
-  soup = BeautifulSoup(data)
   keywords = []
-  for key in soup('a', {'href': re.compile('^/keyword/')}):
-    k = decodeHtml(key.string)
-    k = k.replace(u'\xa0', ' ')
-    keywords.append(k)
+  # [^>] keeps the match inside a single <a> tag; "/keyword/" keeps the href filter of the removed ^/keyword/ lookup
+  for keyword in re.compile(r'<a\s[^>]*?href="/keyword/[^>]*>(.*?)</a>').findall(data):
+    keywords.append(decodeHtml(keyword).replace(u'\xa0', ' '))
   return keywords
 
 def getMovieExternalReviews(imdbId):
@@ -591,7 +594,6 @@ class IMDb:
     IMDbDict['credits'] = self.getCredits()
     IMDbDict['plot'] = getMoviePlot(self.imdb)
     IMDbDict['keywords'] = getMovieKeywords(self.imdb)
-
     IMDbDict['trivia'] = getMovieTrivia(self.imdb)
     IMDbDict['connections'] = getMovieConnections(self.imdb)
     IMDbDict['locations'] = getMovieLocations(self.imdb)