some wikipedia parsing
parent dc0cea8262
commit 146ab0c0b5
1 changed file with 25 additions and 2 deletions
@@ -4,8 +4,8 @@
 from urllib import urlencode
 
 import simplejson
-from oxutils.cache import getUrl
+from oxutils.cache import getUrl, getUrlUnicode
 from oxutils import findRe, decodeHtml
 
 def getMovieId(title, director='', year=''):
     query = '"%s" film %s %s' % (title, director, year)
@@ -32,6 +32,29 @@ def getUrlByAmbId(amg_id):
         return url
     return ''
 
+def getWikiData(wikipediaUrl):
+    title = wikipediaUrl.replace('http://en.wikipedia.org/wiki/', '')
+    url = "http://en.wikipedia.org/w/index.php?title=%s&action=edit" % title
+    html = getUrlUnicode(url)
+    data = decodeHtml(findRe(html, "<textarea.*?>(.*?)</textarea>"))
+    return data
+
+def getMovieData(wikipediaUrl):
+    data = getWikiData(wikipediaUrl)
+    filmbox_data = findRe(data, '''\{\{Infobox Film(.*?)\}\}''')
+    filmbox = {}
+    for row in filmbox_data.strip().split('|'):
+        d = row.split('=')
+        if len(d) == 2:
+            key = d[0].strip()
+            value = d[1].strip()
+            filmbox[key] = value
+    return filmbox
+
+def getAmbId(wikipediaUrl):
+    data = getMovieData(wikipediaUrl)
+    return data.get('amg_id', '')
+
 def find(query, max_results=10):
     query = {'action': 'query', 'list':'search', 'format': 'json',
         'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
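For reference (not part of the commit), a minimal usage sketch of the helpers added above. It assumes the calls run inside the same module, that the example article URL points to an English Wikipedia film page, and that the page's wikitext carries an {{Infobox Film}} template; the URL and the 'director' field below are illustrative only.

# Usage sketch, Python 2 like the module above; the article URL is only an example.
wikipediaUrl = 'http://en.wikipedia.org/wiki/Brazil_(film)'

wikitext = getWikiData(wikipediaUrl)   # raw wikitext, fetched via the article's edit page
filmbox = getMovieData(wikipediaUrl)   # dict built from the infobox's "key = value" rows
print filmbox.get('director', '')      # any infobox field; '' if it is absent
print getAmbId(wikipediaUrl)           # the infobox's amg_id field; '' if it is absent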