getWikiData: work for any wikipedia.org language subdomain, not only en.wikipedia.org; accept a newline either before or after the infobox "|" delimiter; replace the invalid "else if" with "elif"; parse the Google Video ID

This commit is contained in:
j 2009-07-11 18:30:16 +02:00
parent af15fd01a0
commit 928e6a4769

View file

@ -37,8 +37,9 @@ def getUrlByAllmovieId(allmovieId):
return '' return ''
def getWikiData(wikipediaUrl): def getWikiData(wikipediaUrl):
title = wikipediaUrl.replace('http://en.wikipedia.org/wiki/', '') url = wikipediaUrl.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
url = "http://en.wikipedia.org/w/index.php?title=%s&action=raw" % title url = "%s&action=raw" % url
print url
data = getUrlUnicode(url) data = getUrlUnicode(url)
return data return data
@ -46,7 +47,10 @@ def getMovieData(wikipediaUrl):
data = getWikiData(wikipediaUrl) data = getWikiData(wikipediaUrl)
filmbox_data = findRe(data, '''\{\{Infobox.Film(.*?)\n\}\}''') filmbox_data = findRe(data, '''\{\{Infobox.Film(.*?)\n\}\}''')
filmbox = {} filmbox = {}
for row in filmbox_data.strip().split('\n|'): _box = filmbox_data.strip().split('\n|')
if len(_box) == 1:
_box = _box[0].split('|\n')
for row in _box:
d = row.split('=') d = row.split('=')
if len(d) == 2: if len(d) == 2:
key = d[0].strip() key = d[0].strip()
@ -56,7 +60,7 @@ def getMovieData(wikipediaUrl):
filmbox[key] = value filmbox[key] = value
if 'imdb title' in data: if 'imdb title' in data:
filmbox['imdb_id'] = findRe(data, 'imdb title\|.*?(\d*?)\|') filmbox['imdb_id'] = findRe(data, 'imdb title\|.*?(\d*?)\|')
else if 'imdb episode' in data: elif 'imdb episode' in data:
filmbox['imdb_id'] = findRe(data, 'imdb episode\|.*?(\d*?)\|') filmbox['imdb_id'] = findRe(data, 'imdb episode\|.*?(\d*?)\|')
if 'Amg movie' in data: if 'Amg movie' in data:
filmbox['amg_id'] = findRe(data, 'Amg movie\|.*?(\d*?)\|') filmbox['amg_id'] = findRe(data, 'Amg movie\|.*?(\d*?)\|')
@ -64,6 +68,8 @@ def getMovieData(wikipediaUrl):
filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|id\=(.*?)\|') filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|id\=(.*?)\|')
if not filmbox['rottentomatoes_id']: if not filmbox['rottentomatoes_id']:
filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|(.*?)\|') filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|(.*?)\|')
if 'google video' in data:
filmbox['google_video_id'] = findRe(data, 'google video\|.*?(\d*?)\|')
if 'DEFAULTSORT' in data: if 'DEFAULTSORT' in data:
filmbox['title_sort'] = findRe(data, '''\{\{DEFAULTSORT:(.*?)\}\}''') filmbox['title_sort'] = findRe(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
return filmbox return filmbox