From 146ab0c0b5063edf1116368fb822a7b824fa8ecb Mon Sep 17 00:00:00 2001 From: j Date: Sat, 10 May 2008 12:55:26 +0200 Subject: [PATCH] some wikipedia parsing --- ox/wikipedia.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/ox/wikipedia.py b/ox/wikipedia.py index 933c0d3..74ea475 100644 --- a/ox/wikipedia.py +++ b/ox/wikipedia.py @@ -4,8 +4,8 @@ from urllib import urlencode import simplejson -from oxutils.cache import getUrl - +from oxutils.cache import getUrl, getUrlUnicode +from oxutils import findRe, decodeHtml def getMovieId(title, director='', year=''): query = '"%s" film %s %s' % (title, director, year) @@ -32,6 +32,29 @@ def getUrlByAmbId(amg_id): return url return '' +def getWikiData(wikipediaUrl): + title = wikipediaUrl.replace('http://en.wikipedia.org/wiki/', '') + url = "http://en.wikipedia.org/w/index.php?title=%s&action=edit" % title + html = getUrlUnicode(url) + data = decodeHtml(findRe(html, "(.*?)")) + return data + +def getMovieData(wikipediaUrl): + data = getWikiData(wikipediaUrl) + filmbox_data = findRe(data, '''\{\{Infobox Film(.*?)\}\}''') + filmbox = {} + for row in filmbox_data.strip().split('|'): + d = row.split('=') + if len(d) == 2: + key = d[0].strip() + value = d[1].strip() + filmbox[key] = value + return filmbox + +def getAmbId(wikipediaUrl): + data = getMovieData(wikipediaUrl) + return data.get('amg_id', '') + def find(query, max_results=10): query = {'action': 'query', 'list':'search', 'format': 'json', 'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}