add oembed discovery, fix wikipedia parser
This commit is contained in:
parent
5c2e5e6a3a
commit
b506c13bb2
4 changed files with 47 additions and 1 deletions
|
@ -8,6 +8,7 @@ import js
|
||||||
import jsonc
|
import jsonc
|
||||||
import net
|
import net
|
||||||
import srt
|
import srt
|
||||||
|
import utils
|
||||||
|
|
||||||
from api import *
|
from api import *
|
||||||
from file import *
|
from file import *
|
||||||
|
@ -18,5 +19,6 @@ from image import *
|
||||||
from location import *
|
from location import *
|
||||||
from movie import *
|
from movie import *
|
||||||
from normalize import *
|
from normalize import *
|
||||||
|
from oembed import *
|
||||||
from text import *
|
from text import *
|
||||||
from torrent import *
|
from torrent import *
|
||||||
|
|
31
ox/oembed.py
Normal file
31
ox/oembed.py
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# ci:si:et:sw=4:sts=4:ts=4
|
||||||
|
import re
|
||||||
|
from text import findRe
|
||||||
|
import cache
|
||||||
|
from utils import json, ET
|
||||||
|
|
||||||
|
def get_embed_code(url, maxwidth=None, maxheight=None):
    """Discover an oEmbed endpoint for `url` and return the embed data.

    Fetches `url`; if the response is an HTML page, scans it for oEmbed
    discovery tags (`<link ... json+oembed ...>` or `<link ... xml+oembed ...>`),
    requests the advertised endpoint — optionally constrained by
    `maxwidth`/`maxheight` — and returns the parsed response as a dict.

    Parameters:
        url       -- page URL to probe for oEmbed discovery links
        maxwidth  -- optional int, appended to the endpoint URL as &maxwidth=
        maxheight -- optional int, appended to the endpoint URL as &maxheight=

    Returns an empty dict when the page is not HTML or advertises no
    oEmbed endpoint.
    """
    embed = {}
    header = cache.getHeaders(url)
    if header.get('content-type', '').startswith('text/html'):
        html = cache.readUrl(url)
        # Scan the page once for <link> tags, then filter by oEmbed type.
        links = re.compile('<link.*?>').findall(html)
        json_oembed = [link for link in links if 'json+oembed' in link]
        xml_oembed = [link for link in links if 'xml+oembed' in link]

        def _endpoint_url(link):
            # Extract the endpoint href and append size constraints.
            # NOTE(review): assumes the discovered href already carries a
            # query string, since parameters are appended with '&'.
            oembed_url = findRe(link, 'href="(.*?)"')
            if maxwidth:
                oembed_url += '&maxwidth=%d' % maxwidth
            if maxheight:
                oembed_url += '&maxheight=%d' % maxheight
            return oembed_url

        if json_oembed:
            embed = json.loads(cache.readUrl(_endpoint_url(json_oembed[0])))
        elif xml_oembed:
            # FIX: original read json_oembed[0] here, which is empty in this
            # branch — pages advertising only an XML endpoint crashed with
            # IndexError. Use the XML discovery link instead.
            data = cache.readUrl(_endpoint_url(xml_oembed[0]))
            # Flatten the XML oEmbed response into a tag -> text dict.
            for e in ET.fromstring(data):
                embed[e.tag] = e.text
    return embed
|
|
@ -13,3 +13,7 @@ except ImportError:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from django.utils import simplejson as json
|
from django.utils import simplejson as json
|
||||||
|
|
||||||
|
try:
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
except:
|
||||||
|
import elementtree.ElementTree as ET
|
||||||
|
|
|
@ -72,9 +72,18 @@ def getMovieData(wikipediaUrl):
|
||||||
elif 'amg_id' in filmbox and filmbox['amg_id'].startswith('1:'):
|
elif 'amg_id' in filmbox and filmbox['amg_id'].startswith('1:'):
|
||||||
filmbox['amg_id'] = filmbox['amg_id'][2:]
|
filmbox['amg_id'] = filmbox['amg_id'][2:]
|
||||||
|
|
||||||
|
r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
|
||||||
|
if r:
|
||||||
|
filmbox['imdb_id'] = r[0]
|
||||||
|
else:
|
||||||
r = re.compile('{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data)
|
r = re.compile('{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data)
|
||||||
if r:
|
if r:
|
||||||
filmbox['imdb_id'] = r[0]
|
filmbox['imdb_id'] = r[0]
|
||||||
|
|
||||||
|
r = re.compile('{{Internet Archive.*?\|id=(.*?)\|', re.IGNORECASE).findall(data)
|
||||||
|
if r:
|
||||||
|
filmbox['archiveorg_id'] = r[0]
|
||||||
|
|
||||||
r = re.compile('{{mojo title\|(.*?)\|', re.IGNORECASE).findall(data)
|
r = re.compile('{{mojo title\|(.*?)\|', re.IGNORECASE).findall(data)
|
||||||
if r:
|
if r:
|
||||||
filmbox['mojo_id'] = r[0]
|
filmbox['mojo_id'] = r[0]
|
||||||
|
|
Loading…
Reference in a new issue