add oembed discovery, fix wikipedia parser

This commit is contained in:
j 2012-01-10 14:30:29 +05:30
parent 5c2e5e6a3a
commit b506c13bb2
4 changed files with 47 additions and 1 deletions

View file

@ -8,6 +8,7 @@ import js
import jsonc import jsonc
import net import net
import srt import srt
import utils
from api import * from api import *
from file import * from file import *
@ -18,5 +19,6 @@ from image import *
from location import * from location import *
from movie import * from movie import *
from normalize import * from normalize import *
from oembed import *
from text import * from text import *
from torrent import * from torrent import *

31
ox/oembed.py Normal file
View file

@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from text import findRe
import cache
from utils import json, ET
def get_embed_code(url, maxwidth=None, maxheight=None):
    """Discover and fetch the oEmbed data advertised by an HTML page.

    The headers of ``url`` are checked first; only ``text/html`` responses
    are scanned. The page's ``<link ...>`` tags are searched for one that
    mentions ``json+oembed`` or ``xml+oembed`` and the ``href`` of the first
    match is fetched. The JSON endpoint is preferred when both exist.

    url       -- address of the page to inspect
    maxwidth  -- optional integer, sent as the ``maxwidth`` query parameter
    maxheight -- optional integer, sent as the ``maxheight`` query parameter

    Returns the decoded JSON response, or for an XML endpoint a dict mapping
    each child tag of the root element to its text. Returns an empty dict
    when the page is not HTML or advertises no oEmbed endpoint.
    """
    embed = {}
    header = cache.getHeaders(url)
    if not header.get('content-type', '').startswith('text/html'):
        return embed

    html = cache.readUrl(url)
    # Scan the page once and classify the link tags afterwards.
    links = re.compile('<link.*?>').findall(html)
    json_oembed = [link for link in links if 'json+oembed' in link]
    xml_oembed = [link for link in links if 'xml+oembed' in link]
    if not json_oembed and not xml_oembed:
        return embed

    # Use the link that belongs to the chosen format. Reading the JSON list
    # in the XML case would index an empty list.
    if json_oembed:
        link = json_oembed[0]
    else:
        link = xml_oembed[0]
    oembed_url = findRe(link, 'href="(.*?)"')

    for name, value in (('maxwidth', maxwidth), ('maxheight', maxheight)):
        if value:
            # Start a query string if the endpoint URL does not have one yet.
            if '?' in oembed_url:
                oembed_url += '&'
            else:
                oembed_url += '?'
            oembed_url += '%s=%d' % (name, value)

    data = cache.readUrl(oembed_url)
    if json_oembed:
        embed = json.loads(data)
    else:
        for e in ET.fromstring(data):
            embed[e.tag] = e.text
    return embed

View file

@ -13,3 +13,7 @@ except ImportError:
except ImportError: except ImportError:
from django.utils import simplejson as json from django.utils import simplejson as json
# ElementTree is part of the standard library since Python 2.5; older
# interpreters need the standalone ``elementtree`` package instead.
try:
    import xml.etree.ElementTree as ET
except ImportError:
    # Only a missing module should trigger the fallback; a bare except
    # would also hide unrelated errors such as KeyboardInterrupt.
    import elementtree.ElementTree as ET

View file

@ -72,9 +72,18 @@ def getMovieData(wikipediaUrl):
elif 'amg_id' in filmbox and filmbox['amg_id'].startswith('1:'): elif 'amg_id' in filmbox and filmbox['amg_id'].startswith('1:'):
filmbox['amg_id'] = filmbox['amg_id'][2:] filmbox['amg_id'] = filmbox['amg_id'][2:]
r = re.compile('{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data) r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
if r: if r:
filmbox['imdb_id'] = r[0] filmbox['imdb_id'] = r[0]
else:
r = re.compile('{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data)
if r:
filmbox['imdb_id'] = r[0]
r = re.compile('{{Internet Archive.*?\|id=(.*?)\|', re.IGNORECASE).findall(data)
if r:
filmbox['archiveorg_id'] = r[0]
r = re.compile('{{mojo title\|(.*?)\|', re.IGNORECASE).findall(data) r = re.compile('{{mojo title\|(.*?)\|', re.IGNORECASE).findall(data)
if r: if r:
filmbox['mojo_id'] = r[0] filmbox['mojo_id'] = r[0]