getWikiData should work for any wikipedia.org language edition, not only en.wikipedia.org; the newline in infobox rows can come before or after the "|" delimiter; use elif instead of "else if"; parse the google video id

2009-07-11 18:30:16 +02:00 · 2009-07-11 18:30:16 +02:00 · 928e6a4769
commit 928e6a4769
parent af15fd01a0
1 changed file with 10 additions and 4 deletions
--- a/oxweb/wikipedia.py
+++ b/oxweb/wikipedia.py
@ -37,8 +37,9 @@ def getUrlByAllmovieId(allmovieId):
    return ''

 def getWikiData(wikipediaUrl):
-    title = wikipediaUrl.replace('http://en.wikipedia.org/wiki/', '')
-    url =   "http://en.wikipedia.org/w/index.php?title=%s&action=raw" % title
+    url = wikipediaUrl.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
+    url = "%s&action=raw" % url
+    print url
    data = getUrlUnicode(url)
    return data

@ -46,7 +47,10 @@ def getMovieData(wikipediaUrl):
    data = getWikiData(wikipediaUrl)
    filmbox_data = findRe(data, '''\{\{Infobox.Film(.*?)\n\}\}''')
    filmbox = {}
-    for row in filmbox_data.strip().split('\n|'): 
+    _box = filmbox_data.strip().split('\n|')
+    if len(_box) == 1:
+        _box = _box[0].split('|\n')
+    for row in _box:
        d = row.split('=')
        if len(d) == 2:
            key = d[0].strip()
@ -56,7 +60,7 @@ def getMovieData(wikipediaUrl):
            filmbox[key] = value
    if 'imdb title' in data:
        filmbox['imdb_id'] = findRe(data, 'imdb title\|.*?(\d*?)\|')
-    else if 'imdb episode' in data:
+    elif 'imdb episode' in data:
        filmbox['imdb_id'] = findRe(data, 'imdb episode\|.*?(\d*?)\|')
    if 'Amg movie' in data:
        filmbox['amg_id'] = findRe(data, 'Amg movie\|.*?(\d*?)\|')
@ -64,6 +68,8 @@ def getMovieData(wikipediaUrl):
        filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|id\=(.*?)\|')
        if not filmbox['rottentomatoes_id']:
            filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|(.*?)\|')
+    if 'google video' in data:
+        filmbox['google_video_id'] = findRe(data, 'google video\|.*?(\d*?)\|')
    if 'DEFAULTSORT' in data:
        filmbox['title_sort'] = findRe(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
    return filmbox