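getWikiData should work for any wikipedia.org language subdomain, not only en.wikipedia.org; the newline can come before or after the "|" delimiter between infobox rows; use elif instead of the invalid "else if"; parse the Google Video ID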
This commit is contained in:
parent
af15fd01a0
commit
928e6a4769
1 changed files with 10 additions and 4 deletions
|
@ -37,8 +37,9 @@ def getUrlByAllmovieId(allmovieId):
|
|||
return ''
|
||||
|
||||
def getWikiData(wikipediaUrl):
|
||||
title = wikipediaUrl.replace('http://en.wikipedia.org/wiki/', '')
|
||||
url = "http://en.wikipedia.org/w/index.php?title=%s&action=raw" % title
|
||||
url = wikipediaUrl.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
|
||||
url = "%s&action=raw" % url
|
||||
print url
|
||||
data = getUrlUnicode(url)
|
||||
return data
|
||||
|
||||
|
@ -46,7 +47,10 @@ def getMovieData(wikipediaUrl):
|
|||
data = getWikiData(wikipediaUrl)
|
||||
filmbox_data = findRe(data, '''\{\{Infobox.Film(.*?)\n\}\}''')
|
||||
filmbox = {}
|
||||
for row in filmbox_data.strip().split('\n|'):
|
||||
_box = filmbox_data.strip().split('\n|')
|
||||
if len(_box) == 1:
|
||||
_box = _box[0].split('|\n')
|
||||
for row in _box:
|
||||
d = row.split('=')
|
||||
if len(d) == 2:
|
||||
key = d[0].strip()
|
||||
|
@ -56,7 +60,7 @@ def getMovieData(wikipediaUrl):
|
|||
filmbox[key] = value
|
||||
if 'imdb title' in data:
|
||||
filmbox['imdb_id'] = findRe(data, 'imdb title\|.*?(\d*?)\|')
|
||||
else if 'imdb episode' in data:
|
||||
elif 'imdb episode' in data:
|
||||
filmbox['imdb_id'] = findRe(data, 'imdb episode\|.*?(\d*?)\|')
|
||||
if 'Amg movie' in data:
|
||||
filmbox['amg_id'] = findRe(data, 'Amg movie\|.*?(\d*?)\|')
|
||||
|
@ -64,6 +68,8 @@ def getMovieData(wikipediaUrl):
|
|||
filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|id\=(.*?)\|')
|
||||
if not filmbox['rottentomatoes_id']:
|
||||
filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|(.*?)\|')
|
||||
if 'google video' in data:
|
||||
filmbox['google_video_id'] = findRe(data, 'google video\|.*?(\d*?)\|')
|
||||
if 'DEFAULTSORT' in data:
|
||||
filmbox['title_sort'] = findRe(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
|
||||
return filmbox
|
||||
|
|
Loading…
Reference in a new issue