From 60839de99c69a5cab11d4dccff7a9fd0027a6167 Mon Sep 17 00:00:00 2001
From: j <0x006A@0x2620.org>
Date: Tue, 21 Feb 2012 21:14:50 +0530
Subject: [PATCH] escape_html/parse_html
---
ox/html.py | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 103 insertions(+), 1 deletion(-)
diff --git a/ox/html.py b/ox/html.py
index 8170918..a93a868 100644
--- a/ox/html.py
+++ b/ox/html.py
@@ -34,7 +34,7 @@ def escape(html):
'''
if not isinstance(html, basestring):
html = str(html)
- return html.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"').replace("'", ''')
+ return html.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"').replace("'", ''')
def linebreaks(value):
'''
@@ -174,3 +174,105 @@ def highlight(text, query, hlClass="hl"):
text = text.replace('|', '
')
return text
+def escape_html(value):
+    r'''
+    Return value with HTML special characters escaped.
+
+    NOTE(review): in the reviewed copy of this patch the text between the
+    first escape_html example and the parse_html examples had been removed
+    by markup stripping. The body and the examples of this function are a
+    reconstruction (decode entities first so that escaping twice gives the
+    same result) and must be confirmed against the repository.
+
+    >>> escape_html(u'<script> foo')
+    u'&lt;script&gt; foo'
+    >>> escape_html(u'&lt;script&gt; foo')
+    u'&lt;script&gt; foo'
+    '''
+    return escape(decodeHtml(value))
+
+
+def parse_html(html, tags=None):
+    r'''
+    Escape html, keeping only a whitelist of tags as real markup.
+
+    html is entity-decoded, every whitelisted tag is swapped for a
+    placeholder, the remaining text is escaped, the placeholders are
+    swapped back and the result is passed through sanitize_fragment.
+
+    tags is the list of allowed tag names; if it is empty or None the
+    default list below is used. The special entry '[]' enables
+    "[url text]" links, 'rtl' enables the <rtl> pseudo tag.
+
+    NOTE(review): the markup inside these examples was restored after it
+    had been stripped from the reviewed copy; re-run them before relying
+    on the exact output.
+
+    >>> parse_html('<script>alert()</script>')
+    '&lt;script&gt;alert()&lt;/script&gt;'
+    >>> parse_html('\'foo\' < \'bar\' && "foo" > "bar"')
+    '\'foo\' &lt; \'bar\' &amp;&amp; "foo" &gt; "bar"'
+    >>> parse_html('<b>foo')
+    '<b>foo</b>'
+    >>> parse_html('<b>foo</b></b>')
+    '<b>foo</b>'
+    '''
+    if not tags:
+        tags = [
+            # inline formatting
+            'b', 'code', 'i', 's', 'sub', 'sup', 'u',
+            # block formatting
+            'blockquote', 'h1', 'h2', 'h3', 'p', 'pre',
+            # lists
+            'li', 'ol', 'ul',
+            # tables
+            'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr',
+            # other
+            'a', 'br', 'img',
+            # special
+            'rtl', '[]'
+        ]
+    # Per tag: regex -> replacement template. '{N}' in a template is
+    # replaced with group N of the match; attributes that are not captured
+    # are dropped from the output.
+    parse = {
+        'a': {
+            r'<a [^<>]*?href="((https?:\/\/|\/).+?)".*?>': '<a href="{1}">',
+            r'<\/a>': '</a>'
+        },
+        'img': {
+            r'<img [^<>]*?src="((https?:\/\/|\/).+?)".*?>': '<img src="{1}">'
+        },
+        # NOTE(review): the replacement markup for rtl was stripped from the
+        # reviewed copy; the div/style below is a reconstruction.
+        'rtl': {
+            '<rtl>': '<div style="direction: rtl">',
+            r'<\/rtl>': '</div>'
+        },
+        # every other allowed tag is kept without attributes
+        '*': lambda tag: {'<(/?' + tag + ') ?/?>': '<{1}>'}
+    }
+    matches = []
+
+    # makes parse_html output the same value if run twice
+    html = decodeHtml(html)
+
+    if '[]' in tags:
+        # [url text] -> link; group 1 is the url, group 3 the link text
+        html = re.sub(
+            re.compile(r'\[((https?:\/\/|\/).+?) (.+?)\]', re.IGNORECASE),
+            r'<a href="\1">\3</a>', html)
+        tags = [tag for tag in tags if tag != '[]']
+
+    def replace_match(match, value):
+        # Fill the template, remember the finished markup and leave a
+        # numbered placeholder in the text so escape() does not touch it.
+        for i, group in enumerate(match.groups(), 1):
+            value = value.replace('{%d}' % i, group)
+        matches.append(value)
+        return '\t%d\t' % len(matches)
+
+    for tag in tags:
+        p = parse.get(tag, parse['*'](tag))
+        for pattern, template in p.items():
+            html = re.sub(
+                re.compile(pattern, re.IGNORECASE),
+                lambda match, template=template: replace_match(match, template),
+                html
+            )
+    # everything that is still markup at this point is not allowed
+    html = escape(html)
+    for i, markup in enumerate(matches, 1):
+        html = html.replace('\t%d\t' % i, markup)
+    html = html.replace('\n\n', '<br/><br/>')
+    return sanitize_fragment(html)
+
+def sanitize_fragment(html):
+    '''
+    Parse html as a fragment with html5lib and return the toxml()
+    serialization of the parsed fragment, decoded from UTF-8.
+    '''
+    # function-level import: html5lib is only loaded when this is called
+    import html5lib
+    fragment = html5lib.parseFragment(html)
+    serialized = fragment.toxml()
+    return serialized.decode('utf-8')
+