From b506c13bb27a4a4d45999929346a5bfb15e10500 Mon Sep 17 00:00:00 2001
From: j <0x006A@0x2620.org>
Date: Tue, 10 Jan 2012 14:30:29 +0530
Subject: [PATCH] add oembed discovery, fix wikipedia parser

---
 ox/__init__.py      |  2 ++
 ox/oembed.py        | 31 +++++++++++++++++++++++++++++++
 ox/utils.py         |  4 ++++
 ox/web/wikipedia.py | 11 ++++++++++-
 4 files changed, 47 insertions(+), 1 deletion(-)
 create mode 100644 ox/oembed.py

diff --git a/ox/__init__.py b/ox/__init__.py
index f8cf598..54d7c45 100644
--- a/ox/__init__.py
+++ b/ox/__init__.py
@@ -8,6 +8,7 @@ import js
 import jsonc
 import net
 import srt
+import utils
 
 from api import *
 from file import *
@@ -18,5 +19,6 @@ from image import *
 from location import *
 from movie import *
 from normalize import *
+from oembed import *
 from text import *
 from torrent import *
diff --git a/ox/oembed.py b/ox/oembed.py
new file mode 100644
index 0000000..822ff14
--- /dev/null
+++ b/ox/oembed.py
@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+# ci:si:et:sw=4:sts=4:ts=4
+import re
+from text import findRe
+import cache
+from utils import json, ET
+
+def get_embed_code(url, maxwidth=None, maxheight=None):
+    '''Return the oEmbed data linked from the html page at url.
+
+    json+oembed links are preferred over xml+oembed, {} if none is found.
+    '''
+    embed = {}
+    header = cache.getHeaders(url)
+    if header.get('content-type', '').startswith('text/html'):
+        links = re.compile('<link.*?>').findall(cache.readUrl(url))
+        json_oembed = [l for l in links if 'json+oembed' in l]
+        xml_oembed = [l for l in links if 'xml+oembed' in l]
+        if json_oembed or xml_oembed:
+            # take the href from whichever list actually matched
+            oembed_url = findRe((json_oembed or xml_oembed)[0], 'href="(.*?)"')
+            if maxwidth:
+                oembed_url += '&maxwidth=%d' % maxwidth
+            if maxheight:
+                oembed_url += '&maxheight=%d' % maxheight
+            data = cache.readUrl(oembed_url)
+            if json_oembed:
+                embed = json.loads(data)
+            else:
+                embed = dict((e.tag, e.text) for e in ET.fromstring(data))
+    return embed
diff --git a/ox/utils.py b/ox/utils.py
index 047ade2..11c6300 100644
--- a/ox/utils.py
+++ b/ox/utils.py
@@ -13,3 +13,7 @@ except ImportError:
     except ImportError:
         from django.utils import simplejson as json
 
+try:
+    import xml.etree.ElementTree as ET
+except ImportError:  # no xml.etree available, use standalone elementtree
+    import elementtree.ElementTree as ET
diff --git a/ox/web/wikipedia.py b/ox/web/wikipedia.py
index 27c57b1..e339e17 100644
--- a/ox/web/wikipedia.py
+++ b/ox/web/wikipedia.py
@@ -72,9 +72,18 @@ def getMovieData(wikipediaUrl):
     elif 'amg_id' in filmbox and filmbox['amg_id'].startswith('1:'):
         filmbox['amg_id'] = filmbox['amg_id'][2:]
 
-    r = re.compile('{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data)
+    r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
     if r:
         filmbox['imdb_id'] = r[0]
+    else:
+        r = re.compile('{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data)
+        if r:
+            filmbox['imdb_id'] = r[0]
+
+    r = re.compile('{{Internet Archive.*?\|id=(.*?)\|', re.IGNORECASE).findall(data)
+    if r:
+        filmbox['archiveorg_id'] = r[0]
+
     r = re.compile('{{mojo title\|(.*?)\|', re.IGNORECASE).findall(data)
     if r:
         filmbox['mojo_id'] = r[0]