From 146ab0c0b5063edf1116368fb822a7b824fa8ecb Mon Sep 17 00:00:00 2001
From: j <j@0xdb.org>
Date: Sat, 10 May 2008 12:55:26 +0200
Subject: [PATCH] some wikipedia parsing

---
 ox/wikipedia.py | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)
diff --git a/ox/wikipedia.py b/ox/wikipedia.py
index 933c0d3..74ea475 100644
--- a/ox/wikipedia.py
+++ b/ox/wikipedia.py
@@ -4,8 +4,8 @@
 from urllib import urlencode
 
 import simplejson
-from oxutils.cache import getUrl
-
+from oxutils.cache import getUrl, getUrlUnicode
+from oxutils import findRe, decodeHtml
 
 def getMovieId(title, director='', year=''):
   query = '"%s" film %s %s' % (title, director, year)
@@ -32,6 +32,29 @@ def getUrlByAmbId(amg_id):
     return url
   return ''
 
+def getWikiData(wikipediaUrl):
+  """Return the HTML-decoded contents of the <textarea> on the article's edit page."""
+  pageTitle = wikipediaUrl.replace('http://en.wikipedia.org/wiki/', '')
+  editUrl = "http://en.wikipedia.org/w/index.php?title=%s&action=edit" % pageTitle
+  textarea = findRe(getUrlUnicode(editUrl), "<textarea.*?>(.*?)</textarea>")
+  return decodeHtml(textarea)
+
+def getMovieData(wikipediaUrl):
+  """Return the ``key = value`` rows of the article's ``{{Infobox Film}}`` template as a dict."""
+  data = getWikiData(wikipediaUrl)
+  filmbox_data = findRe(data, r'''\{\{Infobox Film(.*?)\}\}''')
+  filmbox = {}
+  for row in filmbox_data.strip().split('|'):
+    # Split on the first '=' only, so values that themselves contain '=' are kept.
+    d = row.split('=', 1)
+    if len(d) == 2:
+      filmbox[d[0].strip()] = d[1].strip()
+  return filmbox
+
+def getAmbId(wikipediaUrl):
+  """Return the ``amg_id`` entry of the article's film infobox, or '' when it has none."""
+  return getMovieData(wikipediaUrl).get('amg_id', '')
+
 def find(query, max_results=10):
   query = {'action': 'query', 'list':'search', 'format': 'json',
            'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}