some wikipedia parsing
parent dc0cea8262
commit 146ab0c0b5
1 changed file with 25 additions and 2 deletions
@@ -4,8 +4,8 @@
 from urllib import urlencode
 
 import simplejson
-from oxutils.cache import getUrl
+from oxutils.cache import getUrl, getUrlUnicode
 from oxutils import findRe, decodeHtml
 
 def getMovieId(title, director='', year=''):
     query = '"%s" film %s %s' % (title, director, year)
@@ -32,6 +32,29 @@ def getUrlByAmbId(amg_id):
         return url
     return ''
 
+def getWikiData(wikipediaUrl):
+    title = wikipediaUrl.replace('http://en.wikipedia.org/wiki/', '')
+    url = "http://en.wikipedia.org/w/index.php?title=%s&action=edit" % title
+    html = getUrlUnicode(url)
+    data = decodeHtml(findRe(html, "<textarea.*?>(.*?)</textarea>"))
+    return data
+
+def getMovieData(wikipediaUrl):
+    data = getWikiData(wikipediaUrl)
+    filmbox_data = findRe(data, '''\{\{Infobox Film(.*?)\}\}''')
+    filmbox = {}
+    for row in filmbox_data.strip().split('|'):
+        d = row.split('=')
+        if len(d) == 2:
+            key = d[0].strip()
+            value = d[1].strip()
+            filmbox[key] = value
+    return filmbox
+
+def getAmbId(wikipediaUrl):
+    data = getMovieData(wikipediaUrl)
+    return data.get('amg_id', '')
+
 def find(query, max_results=10):
     query = {'action': 'query', 'list':'search', 'format': 'json',
         'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
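For reference (not part of the commit), a minimal usage sketch of the helpers added above. It assumes the calls run inside the same module, that the example article URL points to an English Wikipedia film page, and that the page's wikitext carries an {{Infobox Film}} template; the URL and the 'director' field below are illustrative only.

# Usage sketch, Python 2 like the module above; the article URL is only an example.
wikipediaUrl = 'http://en.wikipedia.org/wiki/Brazil_(film)'

wikitext = getWikiData(wikipediaUrl)   # raw wikitext, fetched via the article's edit page
filmbox = getMovieData(wikipediaUrl)   # dict built from the infobox's "key = value" rows
print filmbox.get('director', '')      # any infobox field; '' if it is absent
print getAmbId(wikipediaUrl)           # the infobox's amg_id field; '' if it is absent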