less BeautifulSoup

2009-08-06 12:10:57 +02:00 · 2009-08-06 12:10:57 +02:00 · 54492f33a6
commit 54492f33a6
parent 93bd2e72e9
1 changed file with 26 additions and 38 deletions
--- a/oxweb/imdb.py
+++ b/oxweb/imdb.py
@@ -187,6 +187,7 @@ def getMovieInfo(imdbId):
 def getMovieRuntimeSeconds(imdbId):
    info = getMovieInfo(imdbId)
    if 'runtime' in info:
        value = info['runtime'][0]
        parsed_value = findRe(value, '(.*?) min')
        parsed_value = findRe(parsed_value, '([0-9]+)')
@@ -199,6 +200,8 @@ def getMovieRuntimeSeconds(imdbId):
                parsed_value = int(parsed_value)
        else:
            parsed_value = int(parsed_value) * 60
    else:
        parsed_value = -1
    return parsed_value
 def getMoviePoster(imdbId):
@@ -325,10 +328,7 @@ def getMovieCompanyCredits(imdbId):
 def getMovieLocations(imdbId):
    url = "%slocations" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
-    soup = BeautifulSoup(data)
+    locations = re.compile('<dt><a href="/List.*?>(.*?)</a></dt>').findall(data)
    locations = []
    for key in soup('a', {'href': re.compile('^/List')}):
        locations.append(decodeHtml(key.string))
    return locations
 def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
@@ -360,16 +360,14 @@ def getMoviePosters(imdbId):
 def getMovieTrivia(imdbId):
    url = "%strivia" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
-    soup = BeautifulSoup(data)
+    trivia = re.compile('<li>(.*?)</li>').findall(data)
-    trivia = []
+    def clean(t):
    triviaList = []
    for i in  soup('ul', {'class': "trivia"}):
        for t in i('li'):
            t = unicode(t).replace('<br />', '').strip()
            if t.startswith('<li>') and t.endswith('</li>'):
                t = t[4:-5].strip()
        t = decodeHtml(t)
-            trivia.append(t)
+        t = t.replace(u'', '"')
        if t.endswith('<br><br>'):
            t = t[:-8]
        return t.strip()
    trivia = [clean(t) for t in trivia]
    return trivia 
 def getMovieConnections(imdbId):
@@ -393,21 +391,11 @@ def getMovieKeywords(imdbId):
 def getMovieExternalReviews(imdbId):
    url = "%sexternalreviews" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
-    soup = BeautifulSoup(data)
+    _reviews = re.compile('<li><a href="(.*?)">(.*?)</a></li>').findall(data)
-    ol = soup('ol')
+    reviews = {}
-    if ol:
+    for r in _reviews:
-        ol = ol[0]
+        reviews[r[0]] = r[1]
-        ret = {}
+    return reviews
        for li in ol('li'):
            try:
                a = li('a')[0]
                href = a.get('href')
                txt = a.contents[0]
                ret[href] = txt
            except:
                pass
        return ret
    return {}
 def getMovieReleaseDate(imdbId):
    releasedates = getMovieReleaseDates(imdbId)