From b506c13bb27a4a4d45999929346a5bfb15e10500 Mon Sep 17 00:00:00 2001
From: j <0x006A@0x2620.org>
Date: Tue, 10 Jan 2012 14:30:29 +0530
Subject: [PATCH] add oembed discovery, fix wikipedia parser

---
Review notes (text in this area is ignored by git am):
- ox/oembed.py: discovery now scans <link ...> tags (the empty pattern never
  matched a tag, so the function always returned {}); the XML branch reads
  its URL from the XML link instead of indexing the empty json_oembed list;
  the duplicated maxwidth/maxheight handling is merged; docstring added.
- ox/utils.py: the ElementTree fallback catches ImportError only.
- The blob ids on the "index" lines predate these amendments.

 ox/__init__.py      |  2 ++
 ox/oembed.py        | 37 +++++++++++++++++++++++++++++++++++++
 ox/utils.py         |  4 ++++
 ox/web/wikipedia.py | 11 ++++++++++-
 4 files changed, 53 insertions(+), 1 deletion(-)
 create mode 100644 ox/oembed.py

diff --git a/ox/__init__.py b/ox/__init__.py
index f8cf598..54d7c45 100644
--- a/ox/__init__.py
+++ b/ox/__init__.py
@@ -8,6 +8,7 @@ import js
 import jsonc
 import net
 import srt
+import utils
 
 from api import *
 from file import *
@@ -18,5 +19,6 @@ from image import *
 from location import *
 from movie import *
 from normalize import *
+from oembed import *
 from text import *
 from torrent import *
diff --git a/ox/oembed.py b/ox/oembed.py
new file mode 100644
index 0000000..822ff14
--- /dev/null
+++ b/ox/oembed.py
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+# ci:si:et:sw=4:sts=4:ts=4
+import re
+from text import findRe
+import cache
+from utils import json, ET
+
+def get_embed_code(url, maxwidth=None, maxheight=None):
+    """
+    Discover and fetch the oEmbed description of url.
+
+    Returns the oEmbed response as a dict, or an empty dict if url is
+    not served as text/html or advertises no oEmbed endpoint. maxwidth
+    and maxheight, if given, are appended to the endpoint URL.
+    """
+    embed = {}
+    header = cache.getHeaders(url)
+    if header.get('content-type', '').startswith('text/html'):
+        html = cache.readUrl(url)
+        # endpoints are advertised in <link type="application/json+oembed">
+        links = re.compile('<link.*?>').findall(html)
+        json_oembed = [l for l in links if 'json+oembed' in l]
+        xml_oembed = [l for l in links if 'xml+oembed' in l]
+        if json_oembed or xml_oembed:
+            link = (json_oembed or xml_oembed)[0]
+            oembed_url = findRe(link, 'href="(.*?)"')
+            if maxwidth:
+                oembed_url += '&maxwidth=%d' % maxwidth
+            if maxheight:
+                oembed_url += '&maxheight=%d' % maxheight
+            data = cache.readUrl(oembed_url)
+            if json_oembed:
+                embed = json.loads(data)
+            else:
+                for e in ET.fromstring(data):
+                    embed[e.tag] = e.text
+    return embed
diff --git a/ox/utils.py b/ox/utils.py
index 047ade2..11c6300 100644
--- a/ox/utils.py
+++ b/ox/utils.py
@@ -13,3 +13,7 @@ except ImportError:
     except ImportError:
         from django.utils import simplejson as json
 
+try:
+    import xml.etree.ElementTree as ET
+except ImportError:
+    import elementtree.ElementTree as ET
diff --git a/ox/web/wikipedia.py b/ox/web/wikipedia.py
index 27c57b1..e339e17 100644
--- a/ox/web/wikipedia.py
+++ b/ox/web/wikipedia.py
@@ -72,9 +72,18 @@ def getMovieData(wikipediaUrl):
     elif 'amg_id' in filmbox and filmbox['amg_id'].startswith('1:'):
         filmbox['amg_id'] = filmbox['amg_id'][2:]
 
-    r = re.compile('{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data)
+    r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
     if r:
         filmbox['imdb_id'] = r[0]
+    else:
+        r = re.compile('{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data)
+        if r:
+            filmbox['imdb_id'] = r[0]
+
+    r = re.compile('{{Internet Archive.*?\|id=(.*?)\|', re.IGNORECASE).findall(data)
+    if r:
+        filmbox['archiveorg_id'] = r[0]
+
     r = re.compile('{{mojo title\|(.*?)\|', re.IGNORECASE).findall(data)
     if r:
         filmbox['mojo_id'] = r[0]