cleanup urls

This commit is contained in:
j 2009-06-01 15:11:22 +02:00
parent 10d3783673
commit bf3e51df23

View file

@ -22,6 +22,17 @@ import google
def getUrlUnicode(url, data=None, headers=oxlib.cache.DEFAULT_HEADERS, timeout=-1):
    '''
    Fetch url through oxlib.cache.getUrlUnicode and return its result.

    Thin module-level wrapper: url, data, headers and timeout are forwarded
    positionally and unchanged, so every scraper function in this module goes
    through the same call.
    '''
    return oxlib.cache.getUrlUnicode(url, data, headers, timeout)
'''
Example: check if the result is valid while updating:

    def validate(result, header):
        return header['status'] == u'200'
    try:
        d = oxlib.cache.getUrlUnicode(url, data, headers, timeout=0, valid=validate)
    except oxlib.cache.InvalidResult, e:
        print e.headers
'''
def getMovieId(title, director='', year=''): def getMovieId(title, director='', year=''):
''' '''
>>> getMovieId('The Matrix') >>> getMovieId('The Matrix')
@ -43,7 +54,7 @@ def getMovieData(imdbId):
# internal functions below # internal functions below
def getUrlBase(imdbId):
    '''
    Return the IMDb title page URL for imdbId, including the trailing slash.

    Callers build sub-page URLs by appending the page name directly
    (e.g. "%sfullcredits" % getUrlBase(imdbId)), so the trailing slash
    must be part of the returned value.

    >>> getUrlBase('0133093')
    'http://www.imdb.com/title/tt0133093/'
    '''
    return "http://www.imdb.com/title/tt%s/" % imdbId
def getRawMovieData(imdbId): def getRawMovieData(imdbId):
imdbId = normalizeImdbId(imdbId) imdbId = normalizeImdbId(imdbId)
@ -226,7 +237,7 @@ def creditList(data, section=None):
def getMovieCredits(imdbId): def getMovieCredits(imdbId):
credits = dict() credits = dict()
url = "%s/fullcredits" % getUrlBase(imdbId) url = "%sfullcredits" % getUrlBase(imdbId)
data = getUrlUnicode(url) data = getUrlUnicode(url)
groups = data.split('<h5>') groups = data.split('<h5>')
for g in groups: for g in groups:
@ -236,7 +247,7 @@ def getMovieCredits(imdbId):
return credits return credits
def getMovieTrailers(imdbId): def getMovieTrailers(imdbId):
url = "%s/trailers" % getUrlBase(imdbId) url = "%strailers" % getUrlBase(imdbId)
data = getUrlUnicode(url) data = getUrlUnicode(url)
soup = BeautifulSoup(data) soup = BeautifulSoup(data)
videos = soup('div', {'class':"video-gallery"}) videos = soup('div', {'class':"video-gallery"})
@ -253,20 +264,20 @@ def getMovieTrailers(imdbId):
return trailers return trailers
def getMovieQuotes(imdbId):
    '''
    Return the quotes listed on the title's "quotes" page.

    The page is fetched with getUrlUnicode; the regular expression is then run
    over whatever findString(data, '<a name="q') returns.

    Returns a list of (name, text) tuples, both whitespace-stripped: name is
    the content of a <b>...</b> tag, text is what follows the colon up to the
    next <br>.
    '''
    # getUrlBase() already ends with a slash, so the page name is appended directly.
    url = "%squotes" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
    # DOTALL lets a single quote span several lines of markup.
    quoteRe = re.compile(r'<b>(.*?)</b>:(.*?)<br>', re.DOTALL)
    quotes = quoteRe.findall(findString(data, '<a name="q'))
    return [(name.strip(), text.strip()) for name, text in quotes]
def getMoviePlot(imdbId):
    '''
    Return the first plot summary from the title's "plotsummary" page.

    The text is taken from the findRe match of '<p class="plotpar">(.*?)<i>',
    cut at the first '</p>' and stripped of surrounding whitespace.
    '''
    # getUrlBase() already ends with a slash, so the page name is appended directly.
    url = "%splotsummary" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
    plot = findRe(data, '<p class="plotpar">(.*?)<i>')
    # Keep only the text before the paragraph's closing tag.
    plot = plot.split('</p>')[0]
    return plot.strip()
def getMovieTechnical(imdbId): def getMovieTechnical(imdbId):
url = "%s/technical" % getUrlBase(imdbId) url = "%stechnical" % getUrlBase(imdbId)
data = getUrlUnicode(url) data = getUrlUnicode(url)
results = {} results = {}
for t in re.compile('<h5>(.*?)</h5>(.*?)<br/>', re.DOTALL).findall(data): for t in re.compile('<h5>(.*?)</h5>(.*?)<br/>', re.DOTALL).findall(data):
@ -274,7 +285,7 @@ def getMovieTechnical(imdbId):
return results return results
def getMovieCompanyCredits(imdbId): def getMovieCompanyCredits(imdbId):
url = "%s/companycredits" % getUrlBase(imdbId) url = "%scompanycredits" % getUrlBase(imdbId)
data = getUrlUnicode(url) data = getUrlUnicode(url)
results = {} results = {}
for field, c in re.compile('<h2>(.*?)</h2><ul>(.*?)</ul>').findall(data): for field, c in re.compile('<h2>(.*?)</h2><ul>(.*?)</ul>').findall(data):
@ -284,7 +295,7 @@ def getMovieCompanyCredits(imdbId):
return results return results
def getMovieLocations(imdbId): def getMovieLocations(imdbId):
url = "%s/locations" % getUrlBase(imdbId) url = "%slocations" % getUrlBase(imdbId)
data = getUrlUnicode(url) data = getUrlUnicode(url)
soup = BeautifulSoup(data) soup = BeautifulSoup(data)
locations = [] locations = []
@ -295,7 +306,7 @@ def getMovieLocations(imdbId):
def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')): def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
photos = {} photos = {}
for key in keys: for key in keys:
url = "%s/mediaindex?refine=%s" % (getUrlBase(imdbId), key) url = "%smediaindex?refine=%s" % (getUrlBase(imdbId), key)
data = getUrlUnicode(url) data = getUrlUnicode(url)
photos[key] = {} photos[key] = {}
for s in re.compile('''<img alt="(.*?)".*?src="(http://ia.media-imdb.com/.*?.jpg)''').findall(data): for s in re.compile('''<img alt="(.*?)".*?src="(http://ia.media-imdb.com/.*?.jpg)''').findall(data):
@ -319,7 +330,7 @@ def getMoviePosters(imdbId):
return posters return posters
def getMovieTrivia(imdbId): def getMovieTrivia(imdbId):
url = "%s/trivia" % getUrlBase(imdbId) url = "%strivia" % getUrlBase(imdbId)
data = getUrlUnicode(url) data = getUrlUnicode(url)
soup = BeautifulSoup(data) soup = BeautifulSoup(data)
trivia = [] trivia = []
@ -334,7 +345,7 @@ def getMovieTrivia(imdbId):
return trivia return trivia
def getMovieConnections(imdbId): def getMovieConnections(imdbId):
url = "%s/movieconnections" % getUrlBase(imdbId) url = "%smovieconnections" % getUrlBase(imdbId)
data = getUrlUnicode(url) data = getUrlUnicode(url)
connections={} connections={}
for c in re.compile('''<h5>(.*?)</h5>(.*?)\n\n''', re.DOTALL).findall(data): for c in re.compile('''<h5>(.*?)</h5>(.*?)\n\n''', re.DOTALL).findall(data):
@ -342,7 +353,7 @@ def getMovieConnections(imdbId):
return connections return connections
def getMovieKeywords(imdbId): def getMovieKeywords(imdbId):
url = "%s/keywords" % getUrlBase(imdbId) url = "%skeywords" % getUrlBase(imdbId)
data = getUrlUnicode(url) data = getUrlUnicode(url)
keywords = [] keywords = []
for keyword in re.compile('''<a.*?href="/keyword.*?>(.*?)</a>''').findall(data): for keyword in re.compile('''<a.*?href="/keyword.*?>(.*?)</a>''').findall(data):
@ -352,7 +363,7 @@ def getMovieKeywords(imdbId):
return keywords return keywords
def getMovieExternalReviews(imdbId): def getMovieExternalReviews(imdbId):
url = "%s/externalreviews" % getUrlBase(imdbId) url = "%sexternalreviews" % getUrlBase(imdbId)
data = getUrlUnicode(url) data = getUrlUnicode(url)
soup = BeautifulSoup(data) soup = BeautifulSoup(data)
ol = soup('ol') ol = soup('ol')
@ -403,7 +414,7 @@ def _parseDate(d):
return d return d
def getMovieReleaseDates(imdbId): def getMovieReleaseDates(imdbId):
url = "%s/releaseinfo" % getUrlBase(imdbId) url = "%sreleaseinfo" % getUrlBase(imdbId)
data = getUrlUnicode(url) data = getUrlUnicode(url)
releasedates = [] releasedates = []
regexp = '''<tr><td>(.*?)</td>.*?<td align="right">(.*?)</td>.*?<td>(.*?)</td></tr>''' regexp = '''<tr><td>(.*?)</td>.*?<td align="right">(.*?)</td>.*?<td>(.*?)</td></tr>'''
@ -441,7 +452,7 @@ def getMovieFlimingDates(imdbId):
return '' return ''
def getMovieBusiness(imdbId): def getMovieBusiness(imdbId):
url = "%s/business" % getUrlBase(imdbId) url = "%sbusiness" % getUrlBase(imdbId)
data = getUrlUnicode(url) data = getUrlUnicode(url)
business = {} business = {}
for r in re.compile('''<h5>(.*?)</h5>(.*?)<br/>.<br/>''', re.DOTALL).findall(data): for r in re.compile('''<h5>(.*?)</h5>(.*?)<br/>.<br/>''', re.DOTALL).findall(data):
@ -451,7 +462,7 @@ def getMovieBusiness(imdbId):
return business return business
def getMovieEpisodes(imdbId): def getMovieEpisodes(imdbId):
url = "%s/episodes" % getUrlBase(imdbId) url = "%sepisodes" % getUrlBase(imdbId)
data = getUrlUnicode(url) data = getUrlUnicode(url)
episodes = {} episodes = {}
regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>''' regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
@ -485,7 +496,7 @@ def getMovieEpisodes(imdbId):
class IMDb: class IMDb:
def __init__(self, imdbId): def __init__(self, imdbId):
self.imdb = imdbId self.imdb = imdbId
self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb self.pageUrl = getUrlBase(imdbId)
def getPage(self): def getPage(self):
return getUrlUnicode(self.pageUrl) return getUrlUnicode(self.pageUrl)