getWikiData should work for any wikipedia.org language domain, not only en.wikipedia.org; accept a newline either before or after the infobox '|' delimiter; use elif instead of 'else if'; parse the google video id

This commit is contained in:
j 2009-07-11 18:30:16 +02:00
parent af15fd01a0
commit 928e6a4769

View file

@ -37,8 +37,9 @@ def getUrlByAllmovieId(allmovieId):
return ''
def getWikiData(wikipediaUrl):
    """Fetch the raw wiki markup behind a Wikipedia article URL.

    The article URL is rewritten from its ``wikipedia.org/wiki/<title>`` form
    to ``wikipedia.org/w/index.php?title=<title>&action=raw``. Only the part
    after the host prefix is rewritten, so any language subdomain (not just
    ``en.``) is preserved.

    wikipediaUrl -- article URL; if it does not contain
                    ``wikipedia.org/wiki/`` it is passed through unchanged
                    apart from the appended ``&action=raw``.

    Returns whatever ``getUrlUnicode`` returns for the rewritten URL
    (presumably the page's wikitext as unicode -- confirm against the helper).
    """
    # Rewrite in place instead of extracting the title and hard-coding the
    # en.wikipedia.org host, so the language subdomain survives.
    url = wikipediaUrl.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
    url = "%s&action=raw" % url
    # Trace output of the requested URL; the parenthesised single-argument
    # form prints the same thing on Python 2 and Python 3.
    print(url)
    data = getUrlUnicode(url)
    return data
@ -46,7 +47,10 @@ def getMovieData(wikipediaUrl):
data = getWikiData(wikipediaUrl)
filmbox_data = findRe(data, '''\{\{Infobox.Film(.*?)\n\}\}''')
filmbox = {}
for row in filmbox_data.strip().split('\n|'):
_box = filmbox_data.strip().split('\n|')
if len(_box) == 1:
_box = _box[0].split('|\n')
for row in _box:
d = row.split('=')
if len(d) == 2:
key = d[0].strip()
@ -56,7 +60,7 @@ def getMovieData(wikipediaUrl):
filmbox[key] = value
if 'imdb title' in data:
filmbox['imdb_id'] = findRe(data, 'imdb title\|.*?(\d*?)\|')
else if 'imdb episode' in data:
elif 'imdb episode' in data:
filmbox['imdb_id'] = findRe(data, 'imdb episode\|.*?(\d*?)\|')
if 'Amg movie' in data:
filmbox['amg_id'] = findRe(data, 'Amg movie\|.*?(\d*?)\|')
@ -64,6 +68,8 @@ def getMovieData(wikipediaUrl):
filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|id\=(.*?)\|')
if not filmbox['rottentomatoes_id']:
filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|(.*?)\|')
if 'google video' in data:
filmbox['google_video_id'] = findRe(data, 'google video\|.*?(\d*?)\|')
if 'DEFAULTSORT' in data:
filmbox['title_sort'] = findRe(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
return filmbox