getWikiData should work for any Wikipedia language subdomain, not only en.wikipedia.org; the infobox newline can come before or after the "|" delimiter; use elif instead of else if; parse the Google Video ID
This commit is contained in:
parent
af15fd01a0
commit
928e6a4769
1 changed file with 10 additions and 4 deletions
|
@ -37,8 +37,9 @@ def getUrlByAllmovieId(allmovieId):
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
def getWikiData(wikipediaUrl):
|
def getWikiData(wikipediaUrl):
|
||||||
title = wikipediaUrl.replace('http://en.wikipedia.org/wiki/', '')
|
url = wikipediaUrl.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
|
||||||
url = "http://en.wikipedia.org/w/index.php?title=%s&action=raw" % title
|
url = "%s&action=raw" % url
|
||||||
|
print url
|
||||||
data = getUrlUnicode(url)
|
data = getUrlUnicode(url)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
@ -46,7 +47,10 @@ def getMovieData(wikipediaUrl):
|
||||||
data = getWikiData(wikipediaUrl)
|
data = getWikiData(wikipediaUrl)
|
||||||
filmbox_data = findRe(data, '''\{\{Infobox.Film(.*?)\n\}\}''')
|
filmbox_data = findRe(data, '''\{\{Infobox.Film(.*?)\n\}\}''')
|
||||||
filmbox = {}
|
filmbox = {}
|
||||||
for row in filmbox_data.strip().split('\n|'):
|
_box = filmbox_data.strip().split('\n|')
|
||||||
|
if len(_box) == 1:
|
||||||
|
_box = _box[0].split('|\n')
|
||||||
|
for row in _box:
|
||||||
d = row.split('=')
|
d = row.split('=')
|
||||||
if len(d) == 2:
|
if len(d) == 2:
|
||||||
key = d[0].strip()
|
key = d[0].strip()
|
||||||
|
@ -56,7 +60,7 @@ def getMovieData(wikipediaUrl):
|
||||||
filmbox[key] = value
|
filmbox[key] = value
|
||||||
if 'imdb title' in data:
|
if 'imdb title' in data:
|
||||||
filmbox['imdb_id'] = findRe(data, 'imdb title\|.*?(\d*?)\|')
|
filmbox['imdb_id'] = findRe(data, 'imdb title\|.*?(\d*?)\|')
|
||||||
else if 'imdb episode' in data:
|
elif 'imdb episode' in data:
|
||||||
filmbox['imdb_id'] = findRe(data, 'imdb episode\|.*?(\d*?)\|')
|
filmbox['imdb_id'] = findRe(data, 'imdb episode\|.*?(\d*?)\|')
|
||||||
if 'Amg movie' in data:
|
if 'Amg movie' in data:
|
||||||
filmbox['amg_id'] = findRe(data, 'Amg movie\|.*?(\d*?)\|')
|
filmbox['amg_id'] = findRe(data, 'Amg movie\|.*?(\d*?)\|')
|
||||||
|
@ -64,6 +68,8 @@ def getMovieData(wikipediaUrl):
|
||||||
filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|id\=(.*?)\|')
|
filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|id\=(.*?)\|')
|
||||||
if not filmbox['rottentomatoes_id']:
|
if not filmbox['rottentomatoes_id']:
|
||||||
filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|(.*?)\|')
|
filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|(.*?)\|')
|
||||||
|
if 'google video' in data:
|
||||||
|
filmbox['google_video_id'] = findRe(data, 'google video\|.*?(\d*?)\|')
|
||||||
if 'DEFAULTSORT' in data:
|
if 'DEFAULTSORT' in data:
|
||||||
filmbox['title_sort'] = findRe(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
|
filmbox['title_sort'] = findRe(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
|
||||||
return filmbox
|
return filmbox
|
||||||
|
|
Loading…
Reference in a new issue