From bf3e51df2381d13db3a149c73dd59a76b0c74d7c Mon Sep 17 00:00:00 2001
From: j <0x006A@0x2620.org>
Date: Mon, 1 Jun 2009 15:11:22 +0200
Subject: [PATCH] clean up URLs: getUrlBase() now ends with a trailing slash

---
 oxweb/imdb.py | 45 ++++++++++++++++++++++++++++-----------------
 1 file changed, 28 insertions(+), 17 deletions(-)
diff --git a/oxweb/imdb.py b/oxweb/imdb.py
index a642421..dcc30e0 100644
--- a/oxweb/imdb.py
+++ b/oxweb/imdb.py
@@ -22,6 +22,17 @@ import google
 def getUrlUnicode(url, data=None, headers=oxlib.cache.DEFAULT_HEADERS, timeout=-1):
     return oxlib.cache.getUrlUnicode(url, data, headers, timeout)
 
+'''
+check if result is valid while updating
+def validate(result, header):
+    return header['status'] == u'200'
+
+try:
+    d = oxlib.cache.getUrlUnicode(url, data, headers, timeout=0, valid=validate)
+except oxlib.cache.InvalidResult, e:
+    print e.headers
+
+'''
 def getMovieId(title, director='', year=''):
     '''
     >>> getMovieId('The Matrix')
@@ -43,7 +54,7 @@ def getMovieData(imdbId):
 
 # internal functions below
 def getUrlBase(imdbId):
-    return "http://www.imdb.com/title/tt%s" % imdbId
+    return "http://www.imdb.com/title/tt%s/" % imdbId
 
 def getRawMovieData(imdbId):
     imdbId = normalizeImdbId(imdbId)
@@ -226,7 +237,7 @@ def creditList(data, section=None):
 
 def getMovieCredits(imdbId):
     credits = dict()
-    url = "%s/fullcredits" % getUrlBase(imdbId)
+    url = "%sfullcredits" % getUrlBase(imdbId)
     data = getUrlUnicode(url)
     groups = data.split('<h5>')
     for g in groups:
@@ -236,7 +247,7 @@ def getMovieCredits(imdbId):
     return credits
 
 def getMovieTrailers(imdbId):
-    url = "%s/trailers" % getUrlBase(imdbId)
+    url = "%strailers" % getUrlBase(imdbId)
     data = getUrlUnicode(url)
     soup = BeautifulSoup(data)
     videos = soup('div', {'class':"video-gallery"})
@@ -253,20 +264,20 @@ def getMovieTrailers(imdbId):
     return trailers
 
 def getMovieQuotes(imdbId):
-    url = "%s/quotes" % getUrlBase(imdbId)
+    url = "%squotes" % getUrlBase(imdbId)
     data = getUrlUnicode(url)
     quotes = re.compile('<b>(.*?)</b>:(.*?)<br>', re.DOTALL).findall(findString(data, '<a name="q'))
     quotes = [(q[0].strip(),q[1].strip())  for q in quotes]
     return quotes
 
 def getMoviePlot(imdbId):
-    url = "%s/plotsummary" % getUrlBase(imdbId)
+    url = "%splotsummary" % getUrlBase(imdbId)
     data = getUrlUnicode(url)
     plot = findRe(data, '<p class="plotpar">(.*?)<i>').split('</p>')[0]
     return plot.strip()
 
 def getMovieTechnical(imdbId):
-    url = "%s/technical" % getUrlBase(imdbId)
+    url = "%stechnical" % getUrlBase(imdbId)
     data = getUrlUnicode(url)
     results = {}
     for t in re.compile('<h5>(.*?)</h5>(.*?)<br/>', re.DOTALL).findall(data):
@@ -274,7 +285,7 @@ def getMovieTechnical(imdbId):
     return results
 
 def getMovieCompanyCredits(imdbId):
-    url = "%s/companycredits" % getUrlBase(imdbId)
+    url = "%scompanycredits" % getUrlBase(imdbId)
     data = getUrlUnicode(url)
     results = {}
     for field, c in re.compile('<h2>(.*?)</h2><ul>(.*?)</ul>').findall(data):
@@ -284,7 +295,7 @@ def getMovieCompanyCredits(imdbId):
     return results
 
 def getMovieLocations(imdbId):
-    url = "%s/locations" % getUrlBase(imdbId)
+    url = "%slocations" % getUrlBase(imdbId)
     data = getUrlUnicode(url)
     soup = BeautifulSoup(data)
     locations = []
@@ -295,7 +306,7 @@ def getMovieLocations(imdbId):
 def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
     photos = {}
     for key in keys:
-        url = "%s/mediaindex?refine=%s" % (getUrlBase(imdbId), key)
+        url = "%smediaindex?refine=%s" % (getUrlBase(imdbId), key)
         data = getUrlUnicode(url)
         photos[key] = {}
         for s in  re.compile('''<img alt="(.*?)".*?src="(http://ia.media-imdb.com/.*?.jpg)''').findall(data):
@@ -319,7 +330,7 @@ def getMoviePosters(imdbId):
     return posters
   
 def getMovieTrivia(imdbId):
-    url = "%s/trivia" % getUrlBase(imdbId)
+    url = "%strivia" % getUrlBase(imdbId)
     data = getUrlUnicode(url)
     soup = BeautifulSoup(data)
     trivia = []
@@ -334,7 +345,7 @@ def getMovieTrivia(imdbId):
     return trivia
 
 def getMovieConnections(imdbId):
-    url = "%s/movieconnections" % getUrlBase(imdbId)
+    url = "%smovieconnections" % getUrlBase(imdbId)
     data = getUrlUnicode(url)
     connections={}
     for c in re.compile('''<h5>(.*?)</h5>(.*?)\n\n''', re.DOTALL).findall(data):
@@ -342,7 +353,7 @@ def getMovieConnections(imdbId):
     return connections
 
 def getMovieKeywords(imdbId):
-    url = "%s/keywords" % getUrlBase(imdbId)
+    url = "%skeywords" % getUrlBase(imdbId)
     data = getUrlUnicode(url)
     keywords = []
     for keyword in re.compile('''<a.*?href="/keyword.*?>(.*?)</a>''').findall(data):
@@ -352,7 +363,7 @@ def getMovieKeywords(imdbId):
     return keywords
 
 def getMovieExternalReviews(imdbId):
-    url = "%s/externalreviews" % getUrlBase(imdbId)
+    url = "%sexternalreviews" % getUrlBase(imdbId)
     data = getUrlUnicode(url)
     soup = BeautifulSoup(data)
     ol = soup('ol')
@@ -403,7 +414,7 @@ def _parseDate(d):
     return d
 
 def getMovieReleaseDates(imdbId):
-    url = "%s/releaseinfo" % getUrlBase(imdbId)
+    url = "%sreleaseinfo" % getUrlBase(imdbId)
     data = getUrlUnicode(url)
     releasedates = []
     regexp = '''<tr><td>(.*?)</td>.*?<td align="right">(.*?)</td>.*?<td>(.*?)</td></tr>'''
@@ -441,7 +452,7 @@ def getMovieFlimingDates(imdbId):
     return ''
 
 def getMovieBusiness(imdbId):
-    url = "%s/business" % getUrlBase(imdbId)
+    url = "%sbusiness" % getUrlBase(imdbId)
     data = getUrlUnicode(url)
     business = {}
     for r in re.compile('''<h5>(.*?)</h5>(.*?)<br/>.<br/>''', re.DOTALL).findall(data):
@@ -451,7 +462,7 @@ def getMovieBusiness(imdbId):
     return business
 
 def getMovieEpisodes(imdbId):
-    url = "%s/episodes" % getUrlBase(imdbId)
+    url = "%sepisodes" % getUrlBase(imdbId)
     data = getUrlUnicode(url)
     episodes = {}
     regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
@@ -485,7 +496,7 @@ def getMovieEpisodes(imdbId):
 class IMDb:
     def __init__(self, imdbId):
         self.imdb = imdbId
-        self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
+        self.pageUrl = getUrlBase(imdbId)
 
     def getPage(self):
         return getUrlUnicode(self.pageUrl)