Clean up URLs
This commit is contained in:
parent
10d3783673
commit
bf3e51df23
1 changed file with 28 additions and 17 deletions
|
@ -22,6 +22,17 @@ import google
|
||||||
def getUrlUnicode(url, data=None, headers=oxlib.cache.DEFAULT_HEADERS, timeout=-1):
|
def getUrlUnicode(url, data=None, headers=oxlib.cache.DEFAULT_HEADERS, timeout=-1):
|
||||||
return oxlib.cache.getUrlUnicode(url, data, headers, timeout)
|
return oxlib.cache.getUrlUnicode(url, data, headers, timeout)
|
||||||
|
|
||||||
|
'''
|
||||||
|
check if result is valid while updating
|
||||||
|
def validate(result, header):
|
||||||
|
return header['status'] == u'200'
|
||||||
|
|
||||||
|
try:
|
||||||
|
d = oxlib.cache.getUrlUnicode(url, data, headers, timeout=0, valid=validate)
|
||||||
|
except oxlib.cache.InvalidResult, e:
|
||||||
|
print e.headers
|
||||||
|
|
||||||
|
'''
|
||||||
def getMovieId(title, director='', year=''):
|
def getMovieId(title, director='', year=''):
|
||||||
'''
|
'''
|
||||||
>>> getMovieId('The Matrix')
|
>>> getMovieId('The Matrix')
|
||||||
|
@ -43,7 +54,7 @@ def getMovieData(imdbId):
|
||||||
|
|
||||||
# internal functions below
|
# internal functions below
|
||||||
def getUrlBase(imdbId):
|
def getUrlBase(imdbId):
|
||||||
return "http://www.imdb.com/title/tt%s" % imdbId
|
return "http://www.imdb.com/title/tt%s/" % imdbId
|
||||||
|
|
||||||
def getRawMovieData(imdbId):
|
def getRawMovieData(imdbId):
|
||||||
imdbId = normalizeImdbId(imdbId)
|
imdbId = normalizeImdbId(imdbId)
|
||||||
|
@ -226,7 +237,7 @@ def creditList(data, section=None):
|
||||||
|
|
||||||
def getMovieCredits(imdbId):
|
def getMovieCredits(imdbId):
|
||||||
credits = dict()
|
credits = dict()
|
||||||
url = "%s/fullcredits" % getUrlBase(imdbId)
|
url = "%sfullcredits" % getUrlBase(imdbId)
|
||||||
data = getUrlUnicode(url)
|
data = getUrlUnicode(url)
|
||||||
groups = data.split('<h5>')
|
groups = data.split('<h5>')
|
||||||
for g in groups:
|
for g in groups:
|
||||||
|
@ -236,7 +247,7 @@ def getMovieCredits(imdbId):
|
||||||
return credits
|
return credits
|
||||||
|
|
||||||
def getMovieTrailers(imdbId):
|
def getMovieTrailers(imdbId):
|
||||||
url = "%s/trailers" % getUrlBase(imdbId)
|
url = "%strailers" % getUrlBase(imdbId)
|
||||||
data = getUrlUnicode(url)
|
data = getUrlUnicode(url)
|
||||||
soup = BeautifulSoup(data)
|
soup = BeautifulSoup(data)
|
||||||
videos = soup('div', {'class':"video-gallery"})
|
videos = soup('div', {'class':"video-gallery"})
|
||||||
|
@ -253,20 +264,20 @@ def getMovieTrailers(imdbId):
|
||||||
return trailers
|
return trailers
|
||||||
|
|
||||||
def getMovieQuotes(imdbId):
|
def getMovieQuotes(imdbId):
|
||||||
url = "%s/quotes" % getUrlBase(imdbId)
|
url = "%squotes" % getUrlBase(imdbId)
|
||||||
data = getUrlUnicode(url)
|
data = getUrlUnicode(url)
|
||||||
quotes = re.compile('<b>(.*?)</b>:(.*?)<br>', re.DOTALL).findall(findString(data, '<a name="q'))
|
quotes = re.compile('<b>(.*?)</b>:(.*?)<br>', re.DOTALL).findall(findString(data, '<a name="q'))
|
||||||
quotes = [(q[0].strip(),q[1].strip()) for q in quotes]
|
quotes = [(q[0].strip(),q[1].strip()) for q in quotes]
|
||||||
return quotes
|
return quotes
|
||||||
|
|
||||||
def getMoviePlot(imdbId):
|
def getMoviePlot(imdbId):
|
||||||
url = "%s/plotsummary" % getUrlBase(imdbId)
|
url = "%splotsummary" % getUrlBase(imdbId)
|
||||||
data = getUrlUnicode(url)
|
data = getUrlUnicode(url)
|
||||||
plot = findRe(data, '<p class="plotpar">(.*?)<i>').split('</p>')[0]
|
plot = findRe(data, '<p class="plotpar">(.*?)<i>').split('</p>')[0]
|
||||||
return plot.strip()
|
return plot.strip()
|
||||||
|
|
||||||
def getMovieTechnical(imdbId):
|
def getMovieTechnical(imdbId):
|
||||||
url = "%s/technical" % getUrlBase(imdbId)
|
url = "%stechnical" % getUrlBase(imdbId)
|
||||||
data = getUrlUnicode(url)
|
data = getUrlUnicode(url)
|
||||||
results = {}
|
results = {}
|
||||||
for t in re.compile('<h5>(.*?)</h5>(.*?)<br/>', re.DOTALL).findall(data):
|
for t in re.compile('<h5>(.*?)</h5>(.*?)<br/>', re.DOTALL).findall(data):
|
||||||
|
@ -274,7 +285,7 @@ def getMovieTechnical(imdbId):
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def getMovieCompanyCredits(imdbId):
|
def getMovieCompanyCredits(imdbId):
|
||||||
url = "%s/companycredits" % getUrlBase(imdbId)
|
url = "%scompanycredits" % getUrlBase(imdbId)
|
||||||
data = getUrlUnicode(url)
|
data = getUrlUnicode(url)
|
||||||
results = {}
|
results = {}
|
||||||
for field, c in re.compile('<h2>(.*?)</h2><ul>(.*?)</ul>').findall(data):
|
for field, c in re.compile('<h2>(.*?)</h2><ul>(.*?)</ul>').findall(data):
|
||||||
|
@ -284,7 +295,7 @@ def getMovieCompanyCredits(imdbId):
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def getMovieLocations(imdbId):
|
def getMovieLocations(imdbId):
|
||||||
url = "%s/locations" % getUrlBase(imdbId)
|
url = "%slocations" % getUrlBase(imdbId)
|
||||||
data = getUrlUnicode(url)
|
data = getUrlUnicode(url)
|
||||||
soup = BeautifulSoup(data)
|
soup = BeautifulSoup(data)
|
||||||
locations = []
|
locations = []
|
||||||
|
@ -295,7 +306,7 @@ def getMovieLocations(imdbId):
|
||||||
def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
|
def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
|
||||||
photos = {}
|
photos = {}
|
||||||
for key in keys:
|
for key in keys:
|
||||||
url = "%s/mediaindex?refine=%s" % (getUrlBase(imdbId), key)
|
url = "%smediaindex?refine=%s" % (getUrlBase(imdbId), key)
|
||||||
data = getUrlUnicode(url)
|
data = getUrlUnicode(url)
|
||||||
photos[key] = {}
|
photos[key] = {}
|
||||||
for s in re.compile('''<img alt="(.*?)".*?src="(http://ia.media-imdb.com/.*?.jpg)''').findall(data):
|
for s in re.compile('''<img alt="(.*?)".*?src="(http://ia.media-imdb.com/.*?.jpg)''').findall(data):
|
||||||
|
@ -319,7 +330,7 @@ def getMoviePosters(imdbId):
|
||||||
return posters
|
return posters
|
||||||
|
|
||||||
def getMovieTrivia(imdbId):
|
def getMovieTrivia(imdbId):
|
||||||
url = "%s/trivia" % getUrlBase(imdbId)
|
url = "%strivia" % getUrlBase(imdbId)
|
||||||
data = getUrlUnicode(url)
|
data = getUrlUnicode(url)
|
||||||
soup = BeautifulSoup(data)
|
soup = BeautifulSoup(data)
|
||||||
trivia = []
|
trivia = []
|
||||||
|
@ -334,7 +345,7 @@ def getMovieTrivia(imdbId):
|
||||||
return trivia
|
return trivia
|
||||||
|
|
||||||
def getMovieConnections(imdbId):
|
def getMovieConnections(imdbId):
|
||||||
url = "%s/movieconnections" % getUrlBase(imdbId)
|
url = "%smovieconnections" % getUrlBase(imdbId)
|
||||||
data = getUrlUnicode(url)
|
data = getUrlUnicode(url)
|
||||||
connections={}
|
connections={}
|
||||||
for c in re.compile('''<h5>(.*?)</h5>(.*?)\n\n''', re.DOTALL).findall(data):
|
for c in re.compile('''<h5>(.*?)</h5>(.*?)\n\n''', re.DOTALL).findall(data):
|
||||||
|
@ -342,7 +353,7 @@ def getMovieConnections(imdbId):
|
||||||
return connections
|
return connections
|
||||||
|
|
||||||
def getMovieKeywords(imdbId):
|
def getMovieKeywords(imdbId):
|
||||||
url = "%s/keywords" % getUrlBase(imdbId)
|
url = "%skeywords" % getUrlBase(imdbId)
|
||||||
data = getUrlUnicode(url)
|
data = getUrlUnicode(url)
|
||||||
keywords = []
|
keywords = []
|
||||||
for keyword in re.compile('''<a.*?href="/keyword.*?>(.*?)</a>''').findall(data):
|
for keyword in re.compile('''<a.*?href="/keyword.*?>(.*?)</a>''').findall(data):
|
||||||
|
@ -352,7 +363,7 @@ def getMovieKeywords(imdbId):
|
||||||
return keywords
|
return keywords
|
||||||
|
|
||||||
def getMovieExternalReviews(imdbId):
|
def getMovieExternalReviews(imdbId):
|
||||||
url = "%s/externalreviews" % getUrlBase(imdbId)
|
url = "%sexternalreviews" % getUrlBase(imdbId)
|
||||||
data = getUrlUnicode(url)
|
data = getUrlUnicode(url)
|
||||||
soup = BeautifulSoup(data)
|
soup = BeautifulSoup(data)
|
||||||
ol = soup('ol')
|
ol = soup('ol')
|
||||||
|
@ -403,7 +414,7 @@ def _parseDate(d):
|
||||||
return d
|
return d
|
||||||
|
|
||||||
def getMovieReleaseDates(imdbId):
|
def getMovieReleaseDates(imdbId):
|
||||||
url = "%s/releaseinfo" % getUrlBase(imdbId)
|
url = "%sreleaseinfo" % getUrlBase(imdbId)
|
||||||
data = getUrlUnicode(url)
|
data = getUrlUnicode(url)
|
||||||
releasedates = []
|
releasedates = []
|
||||||
regexp = '''<tr><td>(.*?)</td>.*?<td align="right">(.*?)</td>.*?<td>(.*?)</td></tr>'''
|
regexp = '''<tr><td>(.*?)</td>.*?<td align="right">(.*?)</td>.*?<td>(.*?)</td></tr>'''
|
||||||
|
@ -441,7 +452,7 @@ def getMovieFlimingDates(imdbId):
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
def getMovieBusiness(imdbId):
|
def getMovieBusiness(imdbId):
|
||||||
url = "%s/business" % getUrlBase(imdbId)
|
url = "%sbusiness" % getUrlBase(imdbId)
|
||||||
data = getUrlUnicode(url)
|
data = getUrlUnicode(url)
|
||||||
business = {}
|
business = {}
|
||||||
for r in re.compile('''<h5>(.*?)</h5>(.*?)<br/>.<br/>''', re.DOTALL).findall(data):
|
for r in re.compile('''<h5>(.*?)</h5>(.*?)<br/>.<br/>''', re.DOTALL).findall(data):
|
||||||
|
@ -451,7 +462,7 @@ def getMovieBusiness(imdbId):
|
||||||
return business
|
return business
|
||||||
|
|
||||||
def getMovieEpisodes(imdbId):
|
def getMovieEpisodes(imdbId):
|
||||||
url = "%s/episodes" % getUrlBase(imdbId)
|
url = "%sepisodes" % getUrlBase(imdbId)
|
||||||
data = getUrlUnicode(url)
|
data = getUrlUnicode(url)
|
||||||
episodes = {}
|
episodes = {}
|
||||||
regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
|
regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
|
||||||
|
@ -485,7 +496,7 @@ def getMovieEpisodes(imdbId):
|
||||||
class IMDb:
|
class IMDb:
|
||||||
def __init__(self, imdbId):
|
def __init__(self, imdbId):
|
||||||
self.imdb = imdbId
|
self.imdb = imdbId
|
||||||
self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
|
self.pageUrl = getUrlBase(imdbId)
|
||||||
|
|
||||||
def getPage(self):
|
def getPage(self):
|
||||||
return getUrlUnicode(self.pageUrl)
|
return getUrlUnicode(self.pageUrl)
|
||||||
|
|
Loading…
Reference in a new issue