allow iframes in sanitize_html
This commit is contained in:
parent
38853b1f4b
commit
5dcd8b3552
1 changed file with 48 additions and 17 deletions
63
ox/html.py
63
ox/html.py
|
@ -234,22 +234,47 @@ def sanitize_html(html, tags=None, wikilinks=False):
|
||||||
'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr',
|
'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr',
|
||||||
# other
|
# other
|
||||||
'a', 'br', 'img', 'figure', 'figcaption',
|
'a', 'br', 'img', 'figure', 'figcaption',
|
||||||
|
# iframe
|
||||||
|
'iframe',
|
||||||
# special
|
# special
|
||||||
'rtl', '[]'
|
'rtl', '[]'
|
||||||
]
|
]
|
||||||
parse = {
|
parse = {
|
||||||
'a': {
|
'a': [
|
||||||
'<a [^<>]*?href="((https?:\/\/|\/|mailto:).+?)".*?>': '<a href="{1}">',
|
[
|
||||||
'<\/a>': '</a>'
|
'<a [^<>]*?href="((https?:\/\/|\/|mailto:).+?)".*?>',
|
||||||
},
|
'<a href="{1}">'
|
||||||
'img': {
|
],
|
||||||
'<img [^<>]*?src="((https?:\/\/|\/).+?)".*?>': '<img src="{1}">'
|
['<\/a>', '</a>']
|
||||||
},
|
],
|
||||||
'rtl': {
|
'img': [
|
||||||
'<rtl>': '<div style="direction: rtl">',
|
[
|
||||||
'<\/rtl>': '</div>'
|
'<img [^<>]*?src="((https?:\/\/|\/)[^"]+?)".*?>',
|
||||||
},
|
'<img src="{1}">'
|
||||||
'*': lambda tag: {'<(/?' + tag + ') ?/?>':'<{1}>'}
|
]
|
||||||
|
],
|
||||||
|
'iframe': [
|
||||||
|
[
|
||||||
|
'<iframe [^<>]*?width="(\d+)" height="(\d+)"[^<>]*?src="((\/|https?:\/\/)[^"]+?)".*?>',
|
||||||
|
'<iframe width="{1}" height="{2}" src="{3}">'
|
||||||
|
],
|
||||||
|
[
|
||||||
|
'<iframe [^<>]*?src="((\/|https?:\/\/)[^"]+?)".*?>',
|
||||||
|
'<iframe src="{1}">'
|
||||||
|
],
|
||||||
|
[
|
||||||
|
'<\/iframe>',
|
||||||
|
'</iframe>'
|
||||||
|
]
|
||||||
|
],
|
||||||
|
'rtl': [
|
||||||
|
[
|
||||||
|
'<rtl>',
|
||||||
|
'<div style="direction: rtl">'
|
||||||
|
],
|
||||||
|
['<\/rtl>', '</div>']
|
||||||
|
],
|
||||||
|
'*': lambda tag: [['<(/?' + tag + ') ?/?>', '<{1}>']]
|
||||||
}
|
}
|
||||||
matches = []
|
matches = []
|
||||||
|
|
||||||
|
@ -262,7 +287,7 @@ def sanitize_html(html, tags=None, wikilinks=False):
|
||||||
'<a href="\\1">\\3</a>', html);
|
'<a href="\\1">\\3</a>', html);
|
||||||
tags = filter(lambda tag: tag != '[]', tags)
|
tags = filter(lambda tag: tag != '[]', tags)
|
||||||
|
|
||||||
def replace_match(match, value, replace):
|
def replace_match(match, value, regexp):
|
||||||
i = 1
|
i = 1
|
||||||
for m in match.groups():
|
for m in match.groups():
|
||||||
value = value.replace('{%d}'%i, m)
|
value = value.replace('{%d}'%i, m)
|
||||||
|
@ -272,10 +297,10 @@ def sanitize_html(html, tags=None, wikilinks=False):
|
||||||
|
|
||||||
for tag in tags:
|
for tag in tags:
|
||||||
p = parse.get(tag, parse['*'](tag))
|
p = parse.get(tag, parse['*'](tag))
|
||||||
for replace in p:
|
for regexp, value in p:
|
||||||
html = re.sub(
|
html = re.sub(
|
||||||
re.compile(replace, re.IGNORECASE),
|
re.compile(regexp, re.IGNORECASE),
|
||||||
lambda match: replace_match(match, p[replace][:], replace),
|
lambda match: replace_match(match, value[:], regexp),
|
||||||
html
|
html
|
||||||
)
|
)
|
||||||
html = escape(html)
|
html = escape(html)
|
||||||
|
@ -286,6 +311,12 @@ def sanitize_html(html, tags=None, wikilinks=False):
|
||||||
return sanitize_fragment(html)
|
return sanitize_fragment(html)
|
||||||
|
|
||||||
def sanitize_fragment(html):
|
def sanitize_fragment(html):
|
||||||
|
'''
|
||||||
|
# html5lib reorders attributes, so not usable
|
||||||
import html5lib
|
import html5lib
|
||||||
return html5lib.parseFragment(html).toxml().decode('utf-8')
|
return html5lib.parseFragment(html).toxml().decode('utf-8')
|
||||||
|
'''
|
||||||
|
import lxml.html
|
||||||
|
body = lxml.html.document_fromstring(html).find('body')
|
||||||
|
return lxml.html.tostring(body, encoding='utf-8')[6:-7].decode('utf-8')
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue