add oembed discovery, fix wikipedia parser
This commit is contained in:
parent
5c2e5e6a3a
commit
b506c13bb2
4 changed files with 47 additions and 1 deletions
|
@ -8,6 +8,7 @@ import js
|
||||||
import jsonc
|
import jsonc
|
||||||
import net
|
import net
|
||||||
import srt
|
import srt
|
||||||
|
import utils
|
||||||
|
|
||||||
from api import *
|
from api import *
|
||||||
from file import *
|
from file import *
|
||||||
|
@ -18,5 +19,6 @@ from image import *
|
||||||
from location import *
|
from location import *
|
||||||
from movie import *
|
from movie import *
|
||||||
from normalize import *
|
from normalize import *
|
||||||
|
from oembed import *
|
||||||
from text import *
|
from text import *
|
||||||
from torrent import *
|
from torrent import *
|
||||||
|
|
31
ox/oembed.py
Normal file
31
ox/oembed.py
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# ci:si:et:sw=4:sts=4:ts=4
|
||||||
|
import re
|
||||||
|
from text import findRe
|
||||||
|
import cache
|
||||||
|
from utils import json, ET
|
||||||
|
|
||||||
|
def get_embed_code(url, maxwidth=None, maxheight=None):
    """Discover an oEmbed endpoint for `url` and return the embed data.

    Fetches `url`; if the response is an HTML page, scans it for oEmbed
    discovery tags (`<link ... json+oembed ...>` or `<link ... xml+oembed ...>`),
    requests the advertised endpoint — optionally constrained by
    `maxwidth`/`maxheight` — and returns the parsed response as a dict.

    Parameters:
        url       -- page URL to probe for oEmbed discovery links
        maxwidth  -- optional int, appended to the endpoint URL as &maxwidth=
        maxheight -- optional int, appended to the endpoint URL as &maxheight=

    Returns an empty dict when the page is not HTML or advertises no
    oEmbed endpoint.
    """
    embed = {}
    header = cache.getHeaders(url)
    if header.get('content-type', '').startswith('text/html'):
        html = cache.readUrl(url)
        # Scan the page once for <link> tags, then filter by oEmbed type.
        links = re.compile('<link.*?>').findall(html)
        json_oembed = [link for link in links if 'json+oembed' in link]
        xml_oembed = [link for link in links if 'xml+oembed' in link]

        def _endpoint_url(link):
            # Extract the endpoint href and append size constraints.
            # NOTE(review): assumes the discovered href already carries a
            # query string, since parameters are appended with '&'.
            oembed_url = findRe(link, 'href="(.*?)"')
            if maxwidth:
                oembed_url += '&maxwidth=%d' % maxwidth
            if maxheight:
                oembed_url += '&maxheight=%d' % maxheight
            return oembed_url

        if json_oembed:
            embed = json.loads(cache.readUrl(_endpoint_url(json_oembed[0])))
        elif xml_oembed:
            # FIX: original read json_oembed[0] here, which is empty in this
            # branch — pages advertising only an XML endpoint crashed with
            # IndexError. Use the XML discovery link instead.
            data = cache.readUrl(_endpoint_url(xml_oembed[0]))
            # Flatten the XML oEmbed response into a tag -> text dict.
            for e in ET.fromstring(data):
                embed[e.tag] = e.text
    return embed
|
|
@ -13,3 +13,7 @@ except ImportError:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from django.utils import simplejson as json
|
from django.utils import simplejson as json
|
||||||
|
|
||||||
|
try:
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
except:
|
||||||
|
import elementtree.ElementTree as ET
|
||||||
|
|
|
@ -72,9 +72,18 @@ def getMovieData(wikipediaUrl):
|
||||||
elif 'amg_id' in filmbox and filmbox['amg_id'].startswith('1:'):
|
elif 'amg_id' in filmbox and filmbox['amg_id'].startswith('1:'):
|
||||||
filmbox['amg_id'] = filmbox['amg_id'][2:]
|
filmbox['amg_id'] = filmbox['amg_id'][2:]
|
||||||
|
|
||||||
|
r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
|
||||||
|
if r:
|
||||||
|
filmbox['imdb_id'] = r[0]
|
||||||
|
else:
|
||||||
r = re.compile('{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data)
|
r = re.compile('{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data)
|
||||||
if r:
|
if r:
|
||||||
filmbox['imdb_id'] = r[0]
|
filmbox['imdb_id'] = r[0]
|
||||||
|
|
||||||
|
r = re.compile('{{Internet Archive.*?\|id=(.*?)\|', re.IGNORECASE).findall(data)
|
||||||
|
if r:
|
||||||
|
filmbox['archiveorg_id'] = r[0]
|
||||||
|
|
||||||
r = re.compile('{{mojo title\|(.*?)\|', re.IGNORECASE).findall(data)
|
r = re.compile('{{mojo title\|(.*?)\|', re.IGNORECASE).findall(data)
|
||||||
if r:
|
if r:
|
||||||
filmbox['mojo_id'] = r[0]
|
filmbox['mojo_id'] = r[0]
|
||||||
|
|
Loading…
Reference in a new issue