some wikipedia parsing
This commit is contained in:
parent
dc0cea8262
commit
146ab0c0b5
1 changed files with 25 additions and 2 deletions
|
@ -4,8 +4,8 @@
|
||||||
from urllib import urlencode
|
from urllib import urlencode
|
||||||
|
|
||||||
import simplejson
|
import simplejson
|
||||||
from oxutils.cache import getUrl
|
from oxutils.cache import getUrl, getUrlUnicode
|
||||||
|
from oxutils import findRe, decodeHtml
|
||||||
|
|
||||||
def getMovieId(title, director='', year=''):
|
def getMovieId(title, director='', year=''):
|
||||||
query = '"%s" film %s %s' % (title, director, year)
|
query = '"%s" film %s %s' % (title, director, year)
|
||||||
|
@ -32,6 +32,29 @@ def getUrlByAmbId(amg_id):
|
||||||
return url
|
return url
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
|
def getWikiData(wikipediaUrl):
    """Return the contents of the edit-page textarea for a Wikipedia article.

    The article title is taken from wikipediaUrl by removing the
    'http://en.wikipedia.org/wiki/' prefix; the matching
    ``action=edit`` page is then fetched and the text found inside its
    <textarea> element is passed through decodeHtml and returned.
    """
    # Only the English-Wikipedia http:// prefix is stripped; any other
    # URL form is passed through unchanged as the title.
    page_title = wikipediaUrl.replace('http://en.wikipedia.org/wiki/', '')
    edit_url = "http://en.wikipedia.org/w/index.php?title=%s&action=edit" % page_title
    edit_page = getUrlUnicode(edit_url)
    # The edit form's textarea holds the article source (HTML-escaped).
    wikitext = findRe(edit_page, "<textarea.*?>(.*?)</textarea>")
    return decodeHtml(wikitext)
|
||||||
|
|
||||||
|
def getMovieData(wikipediaUrl):
    """Parse the 'Infobox Film' template of a Wikipedia article into a dict.

    The article source is obtained with getWikiData, the text between
    '{{Infobox Film' and the first following '}}' is extracted, and each
    '|'-separated 'key = value' row becomes one dict entry with
    surrounding whitespace stripped from both key and value.

    Returns an empty dict when no infobox text is found.
    """
    data = getWikiData(wikipediaUrl)
    # Raw string: same pattern as before, without relying on Python
    # leaving the unknown escapes '\{' and '\}' untouched.
    filmbox_data = findRe(data, r'''\{\{Infobox Film(.*?)\}\}''')
    filmbox = {}
    # NOTE(review): findRe's no-match return value is not visible here;
    # any falsy result is treated as "no infobox".
    if not filmbox_data:
        return filmbox
    # NOTE: splitting on '|' also splits piped wiki links such as
    # [[Target|Label]]; those rows are kept only up to the pipe.
    for row in filmbox_data.strip().split('|'):
        # Split on the first '=' only, so values that themselves contain
        # '=' (e.g. URLs with query strings) are kept instead of dropped.
        d = row.split('=', 1)
        if len(d) == 2:
            key = d[0].strip()
            value = d[1].strip()
            filmbox[key] = value
    return filmbox
|
||||||
|
|
||||||
|
def getAmbId(wikipediaUrl):
    """Return the 'amg_id' infobox field of a Wikipedia film article.

    The infobox is read with getMovieData; an empty string is returned
    when it has no 'amg_id' entry.
    """
    filmbox = getMovieData(wikipediaUrl)
    if 'amg_id' in filmbox:
        return filmbox['amg_id']
    return ''
|
||||||
|
|
||||||
def find(query, max_results=10):
|
def find(query, max_results=10):
|
||||||
query = {'action': 'query', 'list':'search', 'format': 'json',
|
query = {'action': 'query', 'list':'search', 'format': 'json',
|
||||||
'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
|
'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
|
||||||
|
|
Loading…
Reference in a new issue