less BeautifulSoup

2009-08-06 12:10:57 +02:00 · 2009-08-06 12:10:57 +02:00 · 54492f33a6
commit 54492f33a6
parent 93bd2e72e9
1 changed file with 26 additions and 38 deletions
--- a/oxweb/imdb.py
+++ b/oxweb/imdb.py
@@ -187,18 +187,21 @@ def getMovieInfo(imdbId):
 def getMovieRuntimeSeconds(imdbId):
    info = getMovieInfo(imdbId)
-    value = info['runtime'][0]
+    if 'runtime' in info:
-    parsed_value = findRe(value, '(.*?) min')
+        value = info['runtime'][0]
-    parsed_value = findRe(parsed_value, '([0-9]+)')
+        parsed_value = findRe(value, '(.*?) min')
    if not parsed_value:
        parsed_value = findRe(value, '(.*?) sec')
        parsed_value = findRe(parsed_value, '([0-9]+)')
        if not parsed_value:
-            parsed_value = 0
+            parsed_value = findRe(value, '(.*?) sec')
            parsed_value = findRe(parsed_value, '([0-9]+)')
            if not parsed_value:
                parsed_value = 0
            else:
                parsed_value = int(parsed_value)
        else:
-            parsed_value = int(parsed_value)
+            parsed_value = int(parsed_value) * 60
    else:
-        parsed_value = int(parsed_value) * 60
+        parsed_value = -1
    return parsed_value
 def getMoviePoster(imdbId):
@@ -325,10 +328,7 @@ def getMovieCompanyCredits(imdbId):
 def getMovieLocations(imdbId):
    url = "%slocations" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
-    soup = BeautifulSoup(data)
+    locations = re.compile('<dt><a href="/List.*?>(.*?)</a></dt>').findall(data)
    locations = []
    for key in soup('a', {'href': re.compile('^/List')}):
        locations.append(decodeHtml(key.string))
    return locations
 def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
@@ -360,17 +360,15 @@ def getMoviePosters(imdbId):
 def getMovieTrivia(imdbId):
    url = "%strivia" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
-    soup = BeautifulSoup(data)
+    trivia = re.compile('<li>(.*?)</li>').findall(data)
-    trivia = []
+    def clean(t):
-    triviaList = []
+        t = decodeHtml(t)
-    for i in  soup('ul', {'class': "trivia"}):
+        t = t.replace(u'', '"')
-        for t in i('li'):
+        if t.endswith('<br><br>'):
-            t = unicode(t).replace('<br />', '').strip()
+            t = t[:-8]
-            if t.startswith('<li>') and t.endswith('</li>'):
+        return t.strip()
-                t = t[4:-5].strip()
+    trivia = [clean(t) for t in trivia]
-            t=decodeHtml(t)
+    return trivia 
            trivia.append(t)
    return trivia
 def getMovieConnections(imdbId):
    url = "%smovieconnections" % getUrlBase(imdbId)
@@ -393,21 +391,11 @@ def getMovieKeywords(imdbId):
 def getMovieExternalReviews(imdbId):
    url = "%sexternalreviews" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
-    soup = BeautifulSoup(data)
+    _reviews = re.compile('<li><a href="(.*?)">(.*?)</a></li>').findall(data)
-    ol = soup('ol')
+    reviews = {}
-    if ol:
+    for r in _reviews:
-        ol = ol[0]
+        reviews[r[0]] = r[1]
-        ret = {}
+    return reviews
        for li in ol('li'):
            try:
                a = li('a')[0]
                href = a.get('href')
                txt = a.contents[0]
                ret[href] = txt
            except:
                pass
        return ret
    return {}
 def getMovieReleaseDate(imdbId):
    releasedates = getMovieReleaseDates(imdbId)