less BeautifulSoup

2009-08-06 12:10:57 +02:00 · 2009-08-06 12:10:57 +02:00 · 54492f33a6
commit 54492f33a6
parent 93bd2e72e9
1 changed file with 26 additions and 38 deletions
--- a/oxweb/imdb.py
+++ b/oxweb/imdb.py
@@ -187,18 +187,21 @@ def getMovieInfo(imdbId):

 def getMovieRuntimeSeconds(imdbId):
    info = getMovieInfo(imdbId)
-    value = info['runtime'][0]
-    parsed_value = findRe(value, '(.*?) min')
-    parsed_value = findRe(parsed_value, '([0-9]+)')
-    if not parsed_value:
-        parsed_value = findRe(value, '(.*?) sec')
+    if 'runtime' in info:
+        value = info['runtime'][0]
+        parsed_value = findRe(value, '(.*?) min')
        parsed_value = findRe(parsed_value, '([0-9]+)')
        if not parsed_value:
-            parsed_value = 0
+            parsed_value = findRe(value, '(.*?) sec')
+            parsed_value = findRe(parsed_value, '([0-9]+)')
+            if not parsed_value:
+                parsed_value = 0
+            else:
+                parsed_value = int(parsed_value)
        else:
-            parsed_value = int(parsed_value)
+            parsed_value = int(parsed_value) * 60
    else:
-        parsed_value = int(parsed_value) * 60
+        parsed_value = -1
    return parsed_value

 def getMoviePoster(imdbId):
@@ -325,10 +328,7 @@ def getMovieCompanyCredits(imdbId):
 def getMovieLocations(imdbId):
    url = "%slocations" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
-    soup = BeautifulSoup(data)
-    locations = []
-    for key in soup('a', {'href': re.compile('^/List')}):
-        locations.append(decodeHtml(key.string))
+    locations = re.compile('<dt><a href="/List.*?>(.*?)</a></dt>').findall(data)
    return locations

 def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
@@ -360,17 +360,15 @@ def getMoviePosters(imdbId):
 def getMovieTrivia(imdbId):
    url = "%strivia" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
-    soup = BeautifulSoup(data)
-    trivia = []
-    triviaList = []
-    for i in  soup('ul', {'class': "trivia"}):
-        for t in i('li'):
-            t = unicode(t).replace('<br />', '').strip()
-            if t.startswith('<li>') and t.endswith('</li>'):
-                t = t[4:-5].strip()
-            t=decodeHtml(t)
-            trivia.append(t)
-    return trivia
+    trivia = re.compile('<li>(.*?)</li>').findall(data)
+    def clean(t):
+        t = decodeHtml(t)
+        t = t.replace(u'', '"')
+        if t.endswith('<br><br>'):
+            t = t[:-8]
+        return t.strip()
+    trivia = [clean(t) for t in trivia]
+    return trivia 

 def getMovieConnections(imdbId):
    url = "%smovieconnections" % getUrlBase(imdbId)
@@ -393,21 +391,11 @@ def getMovieKeywords(imdbId):
 def getMovieExternalReviews(imdbId):
    url = "%sexternalreviews" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
-    soup = BeautifulSoup(data)
-    ol = soup('ol')
-    if ol:
-        ol = ol[0]
-        ret = {}
-        for li in ol('li'):
-            try:
-                a = li('a')[0]
-                href = a.get('href')
-                txt = a.contents[0]
-                ret[href] = txt
-            except:
-                pass
-        return ret
-    return {}
+    _reviews = re.compile('<li><a href="(.*?)">(.*?)</a></li>').findall(data)
+    reviews = {}
+    for r in _reviews:
+        reviews[r[0]] = r[1]
+    return reviews

 def getMovieReleaseDate(imdbId):
    releasedates = getMovieReleaseDates(imdbId)