From 928e6a4769386a603c6cf8a3f02d4af589e59eb3 Mon Sep 17 00:00:00 2001
From: j <0x006A@0x2620.org>
Date: Sat, 11 Jul 2009 18:30:16 +0200
Subject: [PATCH] getWikiData should not only work for en.wikipedia.org, newline can be before or after delimiter, use elif, parse google video

---
 oxweb/wikipedia.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/oxweb/wikipedia.py b/oxweb/wikipedia.py
index 905abed..8c74efe 100644
--- a/oxweb/wikipedia.py
+++ b/oxweb/wikipedia.py
@@ -37,8 +37,9 @@ def getUrlByAllmovieId(allmovieId):
     return ''
 
 def getWikiData(wikipediaUrl):
-    title = wikipediaUrl.replace('http://en.wikipedia.org/wiki/', '')
-    url = "http://en.wikipedia.org/w/index.php?title=%s&action=raw" % title
+    url = wikipediaUrl.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
+    url = "%s&action=raw" % url
+    print url
     data = getUrlUnicode(url)
     return data
 
@@ -46,7 +47,10 @@ def getMovieData(wikipediaUrl):
     data = getWikiData(wikipediaUrl)
     filmbox_data = findRe(data, '''\{\{Infobox.Film(.*?)\n\}\}''')
     filmbox = {}
-    for row in filmbox_data.strip().split('\n|'):
+    _box = filmbox_data.strip().split('\n|')
+    if len(_box) == 1:
+        _box = _box[0].split('|\n')
+    for row in _box:
         d = row.split('=')
         if len(d) == 2:
             key = d[0].strip()
@@ -56,7 +60,7 @@ def getMovieData(wikipediaUrl):
             filmbox[key] = value
     if 'imdb title' in data:
         filmbox['imdb_id'] = findRe(data, 'imdb title\|.*?(\d*?)\|')
-    else if 'imdb episode' in data:
+    elif 'imdb episode' in data:
         filmbox['imdb_id'] = findRe(data, 'imdb episode\|.*?(\d*?)\|')
     if 'Amg movie' in data:
         filmbox['amg_id'] = findRe(data, 'Amg movie\|.*?(\d*?)\|')
@@ -64,6 +68,8 @@ def getMovieData(wikipediaUrl):
         filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|id\=(.*?)\|')
         if not filmbox['rottentomatoes_id']:
             filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|(.*?)\|')
+    if 'google video' in data:
+        filmbox['google_video_id'] = findRe(data, 'google video\|.*?(\d*?)\|')
     if 'DEFAULTSORT' in data:
         filmbox['title_sort'] = findRe(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
     return filmbox