less BeautifulSoup

This commit is contained in:
j 2009-08-06 12:10:57 +02:00
parent 93bd2e72e9
commit 54492f33a6
1 changed file with 26 additions and 38 deletions

View File

@@ -187,6 +187,7 @@ def getMovieInfo(imdbId):
def getMovieRuntimeSeconds(imdbId): def getMovieRuntimeSeconds(imdbId):
info = getMovieInfo(imdbId) info = getMovieInfo(imdbId)
if 'runtime' in info:
value = info['runtime'][0] value = info['runtime'][0]
parsed_value = findRe(value, '(.*?) min') parsed_value = findRe(value, '(.*?) min')
parsed_value = findRe(parsed_value, '([0-9]+)') parsed_value = findRe(parsed_value, '([0-9]+)')
@@ -199,6 +200,8 @@ def getMovieRuntimeSeconds(imdbId):
parsed_value = int(parsed_value) parsed_value = int(parsed_value)
else: else:
parsed_value = int(parsed_value) * 60 parsed_value = int(parsed_value) * 60
else:
parsed_value = -1
return parsed_value return parsed_value
def getMoviePoster(imdbId): def getMoviePoster(imdbId):
@@ -325,10 +328,7 @@ def getMovieCompanyCredits(imdbId):
def getMovieLocations(imdbId): def getMovieLocations(imdbId):
url = "%slocations" % getUrlBase(imdbId) url = "%slocations" % getUrlBase(imdbId)
data = getUrlUnicode(url) data = getUrlUnicode(url)
soup = BeautifulSoup(data) locations = re.compile('<dt><a href="/List.*?>(.*?)</a></dt>').findall(data)
locations = []
for key in soup('a', {'href': re.compile('^/List')}):
locations.append(decodeHtml(key.string))
return locations return locations
def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')): def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
@@ -360,16 +360,14 @@ def getMoviePosters(imdbId):
def getMovieTrivia(imdbId): def getMovieTrivia(imdbId):
url = "%strivia" % getUrlBase(imdbId) url = "%strivia" % getUrlBase(imdbId)
data = getUrlUnicode(url) data = getUrlUnicode(url)
soup = BeautifulSoup(data) trivia = re.compile('<li>(.*?)</li>').findall(data)
trivia = [] def clean(t):
triviaList = []
for i in soup('ul', {'class': "trivia"}):
for t in i('li'):
t = unicode(t).replace('<br />', '').strip()
if t.startswith('<li>') and t.endswith('</li>'):
t = t[4:-5].strip()
t = decodeHtml(t) t = decodeHtml(t)
trivia.append(t) t = t.replace(u'”', '"')
if t.endswith('<br><br>'):
t = t[:-8]
return t.strip()
trivia = [clean(t) for t in trivia]
return trivia return trivia
def getMovieConnections(imdbId): def getMovieConnections(imdbId):
@@ -393,21 +391,11 @@ def getMovieKeywords(imdbId):
def getMovieExternalReviews(imdbId): def getMovieExternalReviews(imdbId):
url = "%sexternalreviews" % getUrlBase(imdbId) url = "%sexternalreviews" % getUrlBase(imdbId)
data = getUrlUnicode(url) data = getUrlUnicode(url)
soup = BeautifulSoup(data) _reviews = re.compile('<li><a href="(.*?)">(.*?)</a></li>').findall(data)
ol = soup('ol') reviews = {}
if ol: for r in _reviews:
ol = ol[0] reviews[r[0]] = r[1]
ret = {} return reviews
for li in ol('li'):
try:
a = li('a')[0]
href = a.get('href')
txt = a.contents[0]
ret[href] = txt
except:
pass
return ret
return {}
def getMovieReleaseDate(imdbId): def getMovieReleaseDate(imdbId):
releasedates = getMovieReleaseDates(imdbId) releasedates = getMovieReleaseDates(imdbId)