some wikipedia parsing

This commit is contained in:
j 2008-05-10 12:55:26 +02:00
parent dc0cea8262
commit 146ab0c0b5

View file

@ -4,8 +4,8 @@
from urllib import urlencode from urllib import urlencode
import simplejson import simplejson
from oxutils.cache import getUrl from oxutils.cache import getUrl, getUrlUnicode
from oxutils import findRe, decodeHtml
def getMovieId(title, director='', year=''): def getMovieId(title, director='', year=''):
query = '"%s" film %s %s' % (title, director, year) query = '"%s" film %s %s' % (title, director, year)
@ -32,6 +32,29 @@ def getUrlByAmbId(amg_id):
return url return url
return '' return ''
def getWikiData(wikipediaUrl):
    """Return the wiki markup shown in the edit form of a Wikipedia article.

    The article title is obtained by removing the English Wikipedia
    ``/wiki/`` prefix from ``wikipediaUrl``; the article's ``action=edit``
    page is then fetched and the HTML-decoded contents of its
    ``<textarea>`` element are returned.
    """
    # The edit page embeds the article's source markup in a <textarea>.
    page_title = wikipediaUrl.replace('http://en.wikipedia.org/wiki/', '')
    edit_url = "http://en.wikipedia.org/w/index.php?title=%s&action=edit" % page_title
    markup = findRe(getUrlUnicode(edit_url), "<textarea.*?>(.*?)</textarea>")
    return decodeHtml(markup)
def getMovieData(wikipediaUrl):
    """Return the fields of an article's ``{{Infobox Film ...}}`` as a dict.

    The article markup is obtained via ``getWikiData``; the text between
    ``{{Infobox Film`` and the next ``}}`` is split on ``|`` into rows, and
    every row of the form ``key = value`` becomes one dict entry (both
    sides stripped of surrounding whitespace). Rows without ``=`` are
    ignored. An empty dict is returned when no infobox is found.

    NOTE: the non-greedy match stops at the first ``}}``, and rows are
    split on every ``|``, so nested templates and piped wiki links
    (``[[target|label]]``) inside the infobox are not parsed intact.
    """
    data = getWikiData(wikipediaUrl)
    filmbox_data = findRe(data, r'\{\{Infobox Film(.*?)\}\}')
    filmbox = {}
    # presumably findRe yields an empty string when nothing matches --
    # TODO confirm against oxutils; either way there is nothing to parse.
    if not filmbox_data:
        return filmbox
    for row in filmbox_data.strip().split('|'):
        # Split on the first '=' only, so values that themselves contain
        # '=' (e.g. URLs with query strings) are kept instead of dropped.
        d = row.split('=', 1)
        if len(d) == 2:
            key = d[0].strip()
            value = d[1].strip()
            filmbox[key] = value
    return filmbox
def getAmbId(wikipediaUrl):
    """Return the ``amg_id`` infobox field of the article, or '' if absent."""
    return getMovieData(wikipediaUrl).get('amg_id', '')
def find(query, max_results=10): def find(query, max_results=10):
query = {'action': 'query', 'list':'search', 'format': 'json', query = {'action': 'query', 'list':'search', 'format': 'json',
'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')} 'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}