allow iframes in sanitize_html

This commit is contained in:
j 2013-10-24 16:40:04 +00:00
parent 38853b1f4b
commit 5dcd8b3552

View file

@ -234,22 +234,47 @@ def sanitize_html(html, tags=None, wikilinks=False):
'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr',
# other # other
'a', 'br', 'img', 'figure', 'figcaption', 'a', 'br', 'img', 'figure', 'figcaption',
# iframe
'iframe',
# special # special
'rtl', '[]' 'rtl', '[]'
] ]
parse = { parse = {
'a': { 'a': [
'<a [^<>]*?href="((https?:\/\/|\/|mailto:).+?)".*?>': '<a href="{1}">', [
'<\/a>': '</a>' '<a [^<>]*?href="((https?:\/\/|\/|mailto:).+?)".*?>',
}, '<a href="{1}">'
'img': { ],
'<img [^<>]*?src="((https?:\/\/|\/).+?)".*?>': '<img src="{1}">' ['<\/a>', '</a>']
}, ],
'rtl': { 'img': [
'<rtl>': '<div style="direction: rtl">', [
'<\/rtl>': '</div>' '<img [^<>]*?src="((https?:\/\/|\/)[^"]+?)".*?>',
}, '<img src="{1}">'
'*': lambda tag: {'<(/?' + tag + ') ?/?>':'<{1}>'} ]
],
'iframe': [
[
'<iframe [^<>]*?width="(\d+)" height="(\d+)"[^<>]*?src="((\/|https?:\/\/)[^"]+?)".*?>',
'<iframe width="{1}" height="{2}" src="{3}">'
],
[
'<iframe [^<>]*?src="((\/|https?:\/\/)[^"]+?)".*?>',
'<iframe src="{1}">'
],
[
'<\/iframe>',
'</iframe>'
]
],
'rtl': [
[
'<rtl>',
'<div style="direction: rtl">'
],
['<\/rtl>', '</div>']
],
'*': lambda tag: [['<(/?' + tag + ') ?/?>', '<{1}>']]
} }
matches = [] matches = []
@ -262,7 +287,7 @@ def sanitize_html(html, tags=None, wikilinks=False):
'<a href="\\1">\\3</a>', html); '<a href="\\1">\\3</a>', html);
tags = filter(lambda tag: tag != '[]', tags) tags = filter(lambda tag: tag != '[]', tags)
def replace_match(match, value, replace): def replace_match(match, value, regexp):
i = 1 i = 1
for m in match.groups(): for m in match.groups():
value = value.replace('{%d}'%i, m) value = value.replace('{%d}'%i, m)
@ -272,10 +297,10 @@ def sanitize_html(html, tags=None, wikilinks=False):
for tag in tags: for tag in tags:
p = parse.get(tag, parse['*'](tag)) p = parse.get(tag, parse['*'](tag))
for replace in p: for regexp, value in p:
html = re.sub( html = re.sub(
re.compile(replace, re.IGNORECASE), re.compile(regexp, re.IGNORECASE),
lambda match: replace_match(match, p[replace][:], replace), lambda match: replace_match(match, value[:], regexp),
html html
) )
html = escape(html) html = escape(html)
@ -286,6 +311,12 @@ def sanitize_html(html, tags=None, wikilinks=False):
return sanitize_fragment(html) return sanitize_fragment(html)
def sanitize_fragment(html): def sanitize_fragment(html):
'''
#html5lib reorders attributes, so not usable
import html5lib import html5lib
return html5lib.parseFragment(html).toxml().decode('utf-8') return html5lib.parseFragment(html).toxml().decode('utf-8')
'''
import lxml.html
body = lxml.html.document_fromstring(html).find('body')
return lxml.html.tostring(body, encoding='utf-8')[6:-7].decode('utf-8')