rewrite sanitize_html to support global attributes
This commit is contained in:
parent
d8bb547e25
commit
d664d99f89
1 changed file with 153 additions and 74 deletions
225
ox/html.py
225
ox/html.py
|
@ -191,7 +191,7 @@ def escape_html(value):
|
||||||
'''
|
'''
|
||||||
return escape(decode_html(value))
|
return escape(decode_html(value))
|
||||||
|
|
||||||
def sanitize_html(html, tags=None, wikilinks=False):
|
def sanitize_html(html, tags=None, global_attributes=[]):
|
||||||
'''
|
'''
|
||||||
>>> sanitize_html('http://foo.com, bar')
|
>>> sanitize_html('http://foo.com, bar')
|
||||||
u'<a href="http://foo.com">http://foo.com</a>, bar'
|
u'<a href="http://foo.com">http://foo.com</a>, bar'
|
||||||
|
@ -209,7 +209,7 @@ def sanitize_html(html, tags=None, wikilinks=False):
|
||||||
u'<a href="javascript:alert()">foo'
|
u'<a href="javascript:alert()">foo'
|
||||||
>>> sanitize_html('[http://foo.com foo]')
|
>>> sanitize_html('[http://foo.com foo]')
|
||||||
u'<a href="http://foo.com">foo</a>'
|
u'<a href="http://foo.com">foo</a>'
|
||||||
>>> sanitize_html('<rtl>foo</rtl>')
|
>>> sanitize_html('<div style="direction: rtl">foo</div>')
|
||||||
u'<div style="direction: rtl">foo</div>'
|
u'<div style="direction: rtl">foo</div>'
|
||||||
>>> sanitize_html('<script>alert()</script>')
|
>>> sanitize_html('<script>alert()</script>')
|
||||||
u'<script>alert()</script>'
|
u'<script>alert()</script>'
|
||||||
|
@ -223,93 +223,172 @@ def sanitize_html(html, tags=None, wikilinks=False):
|
||||||
u"Anniversary of Daoud's Republic"
|
u"Anniversary of Daoud's Republic"
|
||||||
'''
|
'''
|
||||||
if not tags:
|
if not tags:
|
||||||
|
valid_url = '^((https?:\/\/|\/|mailto:).*?)'
|
||||||
tags = [
|
tags = [
|
||||||
# inline formatting
|
# inline formatting
|
||||||
'b', 'bdi', 'code', 'em', 'i', 'q', 's', 'span', 'strong', 'sub', 'sup', 'u',
|
{'name': 'b'},
|
||||||
|
{'name': 'bdi'},
|
||||||
|
{'name': 'code'},
|
||||||
|
{'name': 'em'},
|
||||||
|
{'name': 'i'},
|
||||||
|
{'name': 'q'},
|
||||||
|
{'name': 's'},
|
||||||
|
{'name': 'span'},
|
||||||
|
{'name': 'strong'},
|
||||||
|
{'name': 'sub'},
|
||||||
|
{'name': 'sup'},
|
||||||
|
{'name': 'u'},
|
||||||
# block formatting
|
# block formatting
|
||||||
'blockquote', 'cite', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'pre',
|
{'name': 'blockquote'},
|
||||||
# lists
|
{'name': 'cite'},
|
||||||
'li', 'ol', 'ul',
|
{
|
||||||
# tables
|
'name': 'div',
|
||||||
'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr',
|
'optional': ['style'],
|
||||||
# other
|
'validation': {
|
||||||
'a', 'br', 'img', 'figure', 'figcaption',
|
'style': '^direction: rtl$'
|
||||||
# iframe
|
|
||||||
'iframe',
|
|
||||||
# special
|
|
||||||
'rtl', '[]'
|
|
||||||
]
|
|
||||||
parse = {
|
|
||||||
'a': [
|
|
||||||
[
|
|
||||||
'<a [^<>]*?href="((https?:\/\/|\/|mailto:).+?)".*?>',
|
|
||||||
'<a href="{1}">'
|
|
||||||
],
|
|
||||||
['<\/a>', '</a>']
|
|
||||||
],
|
|
||||||
'img': [
|
|
||||||
[
|
|
||||||
'<img [^<>]*?src="((https?:\/\/|\/)[^"]+?)".*?>',
|
|
||||||
'<img src="{1}">'
|
|
||||||
]
|
|
||||||
],
|
|
||||||
'iframe': [
|
|
||||||
[
|
|
||||||
'<iframe [^<>]*?width="(\d+)" height="(\d+)"[^<>]*?src="((\/|https?:\/\/)[^"]+?)".*?>',
|
|
||||||
'<iframe width="{1}" height="{2}" src="{3}">'
|
|
||||||
],
|
|
||||||
[
|
|
||||||
'<iframe [^<>]*?src="((\/|https?:\/\/)[^"]+?)".*?>',
|
|
||||||
'<iframe src="{1}">'
|
|
||||||
],
|
|
||||||
[
|
|
||||||
'<\/iframe>',
|
|
||||||
'</iframe>'
|
|
||||||
]
|
|
||||||
],
|
|
||||||
'rtl': [
|
|
||||||
[
|
|
||||||
'<rtl>',
|
|
||||||
'<div style="direction: rtl">'
|
|
||||||
],
|
|
||||||
['<\/rtl>', '</div>']
|
|
||||||
],
|
|
||||||
'*': lambda tag: [['<(/?' + tag + ') ?/?>', '<{1}>']]
|
|
||||||
}
|
}
|
||||||
matches = []
|
},
|
||||||
|
{'name': 'h1'},
|
||||||
|
{'name': 'h2'},
|
||||||
|
{'name': 'h3'},
|
||||||
|
{'name': 'h4'},
|
||||||
|
{'name': 'h5'},
|
||||||
|
{'name': 'h6'},
|
||||||
|
{'name': 'p'},
|
||||||
|
{'name': 'pre'},
|
||||||
|
# lists
|
||||||
|
{'name': 'li'},
|
||||||
|
{'name': 'ol'},
|
||||||
|
{'name': 'ul'},
|
||||||
|
# tables
|
||||||
|
{'name': 'table'},
|
||||||
|
{'name': 'tbody'},
|
||||||
|
{'name': 'td'},
|
||||||
|
{'name': 'tfoot'},
|
||||||
|
{'name': 'th'},
|
||||||
|
{'name': 'thead'},
|
||||||
|
{'name': 'tr'},
|
||||||
|
# other
|
||||||
|
{'name': '[]'},
|
||||||
|
{
|
||||||
|
'name': 'a',
|
||||||
|
'required': ['href'],
|
||||||
|
'validation': {
|
||||||
|
'href': valid_url
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{'name': 'br'},
|
||||||
|
{
|
||||||
|
'name': 'iframe',
|
||||||
|
'optional': ['width', 'height'],
|
||||||
|
'required': ['src'],
|
||||||
|
'validation': {
|
||||||
|
'width': '^\d+$',
|
||||||
|
'height': '^\d+$',
|
||||||
|
'src': valid_url
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
'name': 'img',
|
||||||
|
'optional': ['width', 'height'],
|
||||||
|
'required': ['src'],
|
||||||
|
'validation': {
|
||||||
|
'width': '^\d+$',
|
||||||
|
'height': '^\d+$',
|
||||||
|
'src': valid_url
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{'name': 'figure'},
|
||||||
|
{'name': 'figcaption'}
|
||||||
|
]
|
||||||
|
|
||||||
#makes parse_html output the same value if run twice
|
tag_re = re.compile('<(/)?([^\ /]+)(.*?)(/)?>')
|
||||||
html = decode_html(html)
|
attr_re = re.compile('([^=\ ]+)="([^"]+)"')
|
||||||
|
|
||||||
if '[]' in tags:
|
escaped = {}
|
||||||
|
level = 0
|
||||||
|
non_closing_tags = ['img', 'br']
|
||||||
|
required_attributes = {}
|
||||||
|
validation = {}
|
||||||
|
valid_attributes = {}
|
||||||
|
valid_tags = set([tag['name'] for tag in tags if tag['name'] != '[]'])
|
||||||
|
|
||||||
|
for tag in tags:
|
||||||
|
valid_attributes[tag['name']] = tag.get('required', []) \
|
||||||
|
+ tag.get('optional', []) \
|
||||||
|
+ global_attributes
|
||||||
|
required_attributes[tag['name']] = tag.get('required', [])
|
||||||
|
validation[tag['name']] = tag.get('validation', {})
|
||||||
|
|
||||||
|
if '[]' in validation:
|
||||||
html = re.sub(
|
html = re.sub(
|
||||||
re.compile('\[((https?:\/\/|\/).+?) (.+?)\]', re.IGNORECASE),
|
re.compile('\[((https?:\/\/|\/).+?) (.+?)\]', re.IGNORECASE),
|
||||||
'<a href="\\1">\\3</a>', html);
|
'<a href="\\1">\\3</a>', html);
|
||||||
tags = filter(lambda tag: tag != '[]', tags)
|
|
||||||
|
|
||||||
def replace_match(match, value, regexp):
|
parts = split_tags(html)
|
||||||
i = 1
|
for i, part in enumerate(parts):
|
||||||
for m in match.groups():
|
is_tag = i % 2
|
||||||
value = value.replace('{%d}'%i, m)
|
if is_tag:
|
||||||
i += 1
|
t = tag_re.findall(part)
|
||||||
matches.append(value)
|
if not t:
|
||||||
return '\t%d\t' % len(matches)
|
parts[i] = escape_html(decode_html(part))
|
||||||
|
continue
|
||||||
|
closing, name, attributes, end = t[0]
|
||||||
|
closing = closing != ''
|
||||||
|
a = attr_re.findall(attributes)
|
||||||
|
attrs = dict(a)
|
||||||
|
|
||||||
for tag in tags:
|
if not closing and not name in non_closing_tags:
|
||||||
p = parse.get(tag, parse['*'](tag))
|
level += 1
|
||||||
for regexp, value in p:
|
|
||||||
html = re.sub(
|
if not attrs and attributes or name not in valid_tags:
|
||||||
re.compile(regexp, re.IGNORECASE),
|
valid = False
|
||||||
lambda match: replace_match(match, value[:], regexp),
|
else:
|
||||||
html
|
valid = True
|
||||||
|
for key in set(attrs) - set(valid_attributes[name]):
|
||||||
|
del attrs[key]
|
||||||
|
for key in required_attributes[tag['name']]:
|
||||||
|
if not key in attrs:
|
||||||
|
valid = False
|
||||||
|
|
||||||
|
if valid:
|
||||||
|
for attr in attrs:
|
||||||
|
if attr in validation[name]:
|
||||||
|
if not re.compile(validation[name][attr]).findall(attrs[attr]):
|
||||||
|
valid = False
|
||||||
|
break
|
||||||
|
|
||||||
|
if valid and closing:
|
||||||
|
valid = not escaped.get(level)
|
||||||
|
else:
|
||||||
|
escaped[level] = not valid
|
||||||
|
if closing:
|
||||||
|
level -= 1
|
||||||
|
if valid:
|
||||||
|
parts[i] = '<%s%s%s>' % (
|
||||||
|
('/' if closing else ''),
|
||||||
|
name,
|
||||||
|
(' ' + ' '.join(['%s="%s"' % (key, attrs[key]) for key, value in a if key in attrs])
|
||||||
|
if not closing and attrs else '')
|
||||||
)
|
)
|
||||||
html = escape(html)
|
else:
|
||||||
for i in range(0, len(matches)):
|
parts[i] = escape_html(decode_html(part))
|
||||||
html = html.replace('\t%d\t'%(i+1), matches[i])
|
else:
|
||||||
|
parts[i] = escape_html(decode_html(part))
|
||||||
|
html = ''.join(parts)
|
||||||
html = html.replace('\n\n', '<br/><br/>')
|
html = html.replace('\n\n', '<br/><br/>')
|
||||||
html = add_links(html)
|
html = add_links(html)
|
||||||
return sanitize_fragment(html)
|
return sanitize_fragment(html)
|
||||||
|
|
||||||
|
def split_tags(string):
    '''
    Split a string into alternating text and tag parts.

    Even indices of the returned list hold the text between tags (possibly
    empty strings), odd indices hold the tags themselves, i.e. every
    substring matching the pattern <[^<>]+>. The list always starts and
    ends with a text part, so ''.join(split_tags(string)) == string.

    >>> split_tags('a<b>c</b>')
    ['a', '<b>', 'c', '</b>', '']
    >>> split_tags('no tags')
    ['no tags']
    '''
    # re.split with a capturing group keeps the delimiters (the tags) in the
    # result, so no placeholder character is needed. Substituting each tag
    # with a NUL character and splitting on it misaligns text and tags (and
    # silently drops content) when the input itself contains a NUL.
    return re.split('(<[^<>]+>)', string)
|
||||||
|
|
||||||
def sanitize_fragment(html):
|
def sanitize_fragment(html):
|
||||||
'''
|
'''
|
||||||
#html5lib reorders arguments, so not usable
|
#html5lib reorders arguments, so not usable
|
||||||
|
|
Loading…
Reference in a new issue