From bf3e51df2381d13db3a149c73dd59a76b0c74d7c Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Mon, 1 Jun 2009 15:11:22 +0200 Subject: [PATCH] cleanup urls --- oxweb/imdb.py | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/oxweb/imdb.py b/oxweb/imdb.py index a642421..dcc30e0 100644 --- a/oxweb/imdb.py +++ b/oxweb/imdb.py @@ -22,6 +22,17 @@ import google def getUrlUnicode(url, data=None, headers=oxlib.cache.DEFAULT_HEADERS, timeout=-1): return oxlib.cache.getUrlUnicode(url, data, headers, timeout) +''' +check if result is valid while updating +def validate(result, header): + return header['status'] == u'200' + +try: + d = oxlib.cache.getUrlUnicode(url, data, headers, timeout=0, valid=validate) +except oxlib.cache.InvalidResult, e: + print e.headers + +''' def getMovieId(title, director='', year=''): ''' >>> getMovieId('The Matrix') @@ -43,7 +54,7 @@ def getMovieData(imdbId): # internal functions below def getUrlBase(imdbId): - return "http://www.imdb.com/title/tt%s" % imdbId + return "http://www.imdb.com/title/tt%s/" % imdbId def getRawMovieData(imdbId): imdbId = normalizeImdbId(imdbId) @@ -226,7 +237,7 @@ def creditList(data, section=None): def getMovieCredits(imdbId): credits = dict() - url = "%s/fullcredits" % getUrlBase(imdbId) + url = "%sfullcredits" % getUrlBase(imdbId) data = getUrlUnicode(url) groups = data.split('
') for g in groups: @@ -236,7 +247,7 @@ def getMovieCredits(imdbId): return credits def getMovieTrailers(imdbId): - url = "%s/trailers" % getUrlBase(imdbId) + url = "%strailers" % getUrlBase(imdbId) data = getUrlUnicode(url) soup = BeautifulSoup(data) videos = soup('div', {'class':"video-gallery"}) @@ -253,20 +264,20 @@ def getMovieTrailers(imdbId): return trailers def getMovieQuotes(imdbId): - url = "%s/quotes" % getUrlBase(imdbId) + url = "%squotes" % getUrlBase(imdbId) data = getUrlUnicode(url) quotes = re.compile('(.*?):(.*?)
', re.DOTALL).findall(findString(data, '(.*?)').split('

')[0] return plot.strip() def getMovieTechnical(imdbId): - url = "%s/technical" % getUrlBase(imdbId) + url = "%stechnical" % getUrlBase(imdbId) data = getUrlUnicode(url) results = {} for t in re.compile('
(.*?)
(.*?)
', re.DOTALL).findall(data): @@ -274,7 +285,7 @@ def getMovieTechnical(imdbId): return results def getMovieCompanyCredits(imdbId): - url = "%s/companycredits" % getUrlBase(imdbId) + url = "%scompanycredits" % getUrlBase(imdbId) data = getUrlUnicode(url) results = {} for field, c in re.compile('

(.*?)

    (.*?)
').findall(data): @@ -284,7 +295,7 @@ def getMovieCompanyCredits(imdbId): return results def getMovieLocations(imdbId): - url = "%s/locations" % getUrlBase(imdbId) + url = "%slocations" % getUrlBase(imdbId) data = getUrlUnicode(url) soup = BeautifulSoup(data) locations = [] @@ -295,7 +306,7 @@ def getMovieLocations(imdbId): def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')): photos = {} for key in keys: - url = "%s/mediaindex?refine=%s" % (getUrlBase(imdbId), key) + url = "%smediaindex?refine=%s" % (getUrlBase(imdbId), key) data = getUrlUnicode(url) photos[key] = {} for s in re.compile('''(.*?)(.*?)
(.*?)\n\n''', re.DOTALL).findall(data): @@ -342,7 +353,7 @@ def getMovieConnections(imdbId): return connections def getMovieKeywords(imdbId): - url = "%s/keywords" % getUrlBase(imdbId) + url = "%skeywords" % getUrlBase(imdbId) data = getUrlUnicode(url) keywords = [] for keyword in re.compile('''(.*?)''').findall(data): @@ -352,7 +363,7 @@ def getMovieKeywords(imdbId): return keywords def getMovieExternalReviews(imdbId): - url = "%s/externalreviews" % getUrlBase(imdbId) + url = "%sexternalreviews" % getUrlBase(imdbId) data = getUrlUnicode(url) soup = BeautifulSoup(data) ol = soup('ol') @@ -403,7 +414,7 @@ def _parseDate(d): return d def getMovieReleaseDates(imdbId): - url = "%s/releaseinfo" % getUrlBase(imdbId) + url = "%sreleaseinfo" % getUrlBase(imdbId) data = getUrlUnicode(url) releasedates = [] regexp = '''(.*?).*?(.*?).*?(.*?)''' @@ -441,7 +452,7 @@ def getMovieFlimingDates(imdbId): return '' def getMovieBusiness(imdbId): - url = "%s/business" % getUrlBase(imdbId) + url = "%sbusiness" % getUrlBase(imdbId) data = getUrlUnicode(url) business = {} for r in re.compile('''
(.*?)
(.*?)
.
''', re.DOTALL).findall(data): @@ -451,7 +462,7 @@ def getMovieBusiness(imdbId): return business def getMovieEpisodes(imdbId): - url = "%s/episodes" % getUrlBase(imdbId) + url = "%sepisodes" % getUrlBase(imdbId) data = getUrlUnicode(url) episodes = {} regexp = r'''

Season (.*?), Episode (.*?): (.*?)

(.*?)
(.*?)
''' @@ -485,7 +496,7 @@ def getMovieEpisodes(imdbId): class IMDb: def __init__(self, imdbId): self.imdb = imdbId - self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb + self.pageUrl = getUrlBase(imdbId) def getPage(self): return getUrlUnicode(self.pageUrl)