From d664d99f898814c9f533d19ba1cbd558c1635921 Mon Sep 17 00:00:00 2001
From: j <0x006A@0x2620.org>
Date: Sun, 10 Nov 2013 22:00:24 +0000
Subject: [PATCH] rewrite sanitize_html to support global attributes
---
ox/html.py | 227 ++++++++++++++++++++++++++++++++++++-----------------
1 file changed, 153 insertions(+), 74 deletions(-)
diff --git a/ox/html.py b/ox/html.py
index 8f30609..6b17212 100644
--- a/ox/html.py
+++ b/ox/html.py
@@ -191,7 +191,7 @@ def escape_html(value):
'''
return escape(decode_html(value))
-def sanitize_html(html, tags=None, wikilinks=False):
+def sanitize_html(html, tags=None, global_attributes=[]):
'''
>>> sanitize_html('http://foo.com, bar')
u'http://foo.com, bar'
@@ -209,7 +209,7 @@ def sanitize_html(html, tags=None, wikilinks=False):
u'<a href="javascript:alert()">foo'
>>> sanitize_html('[http://foo.com foo]')
u'foo'
- >>> sanitize_html('foo')
+ >>> sanitize_html('
foo
')
u'foo
'
>>> sanitize_html('')
u'<script>alert()</script>'
@@ -223,93 +223,172 @@ def sanitize_html(html, tags=None, wikilinks=False):
u"Anniversary of Daoud's Republic"
'''
if not tags:
+ valid_url = '^((https?:\/\/|\/|mailto:).*?)'
tags = [
# inline formatting
- 'b', 'bdi', 'code', 'em', 'i', 'q', 's', 'span', 'strong', 'sub', 'sup', 'u',
+ {'name': 'b'},
+ {'name': 'bdi'},
+ {'name': 'code'},
+ {'name': 'em'},
+ {'name': 'i'},
+ {'name': 'q'},
+ {'name': 's'},
+ {'name': 'span'},
+ {'name': 'strong'},
+ {'name': 'sub'},
+ {'name': 'sup'},
+ {'name': 'u'},
# block formatting
- 'blockquote', 'cite', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'pre',
+ {'name': 'blockquote'},
+ {'name': 'cite'},
+ {
+ 'name': 'div',
+ 'optional': ['style'],
+ 'validation': {
+ 'style': '^direction: rtl$'
+ }
+ },
+ {'name': 'h1'},
+ {'name': 'h2'},
+ {'name': 'h3'},
+ {'name': 'h4'},
+ {'name': 'h5'},
+ {'name': 'h6'},
+ {'name': 'p'},
+ {'name': 'pre'},
# lists
- 'li', 'ol', 'ul',
+ {'name': 'li'},
+ {'name': 'ol'},
+ {'name': 'ul'},
# tables
- 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr',
+ {'name': 'table'},
+ {'name': 'tbody'},
+ {'name': 'td'},
+ {'name': 'tfoot'},
+ {'name': 'th'},
+ {'name': 'thead'},
+ {'name': 'tr'},
# other
- 'a', 'br', 'img', 'figure', 'figcaption',
- # iframe
- 'iframe',
- # special
- 'rtl', '[]'
+ {'name': '[]'},
+ {
+ 'name': 'a',
+ 'required': ['href'],
+ 'validation': {
+ 'href': valid_url
+ }
+ },
+ {'name': 'br'},
+ {
+ 'name': 'iframe',
+ 'optional': ['width', 'height'],
+ 'required': ['src'],
+ 'validation': {
+ 'width': '^\d+$',
+ 'height': '^\d+$',
+ 'src': valid_url
+ }
+ },
+ {
+ 'name': 'img',
+ 'optional': ['width', 'height'],
+ 'required': ['src'],
+ 'validation': {
+ 'width': '^\d+$',
+ 'height': '^\d+$',
+ 'src': valid_url
+ },
+ },
+ {'name': 'figure'},
+ {'name': 'figcaption'}
]
- parse = {
- 'a': [
- [
- ']*?href="((https?:\/\/|\/|mailto:).+?)".*?>',
- ''
- ],
- ['<\/a>', '']
- ],
- 'img': [
- [
- ']*?src="((https?:\/\/|\/)[^"]+?)".*?>',
- ''
- ]
- ],
- 'iframe': [
- [
- '