diff --git a/ox/html.py b/ox/html.py index 8170918..a93a868 100644 --- a/ox/html.py +++ b/ox/html.py @@ -34,7 +34,7 @@ def escape(html): ''' if not isinstance(html, basestring): html = str(html) - return html.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"').replace("'", ''') + return html.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"').replace("'", ''') def linebreaks(value): ''' @@ -174,3 +174,105 @@ def highlight(text, query, hlClass="hl"): text = text.replace('|', '
') return text +def escape_html(value): + ''' + >>> escape_html(u'') + '<script>alert()</script>' + >>> parse_html('\'foo\' < \'bar\' && "foo" > "bar"') + '\'foo\' < \'bar\' && "foo" > "bar"' + >>> parse_html('foo') + 'foo' + >>> parse_html('foo') + 'foo' + ''' + if not tags: + tags = [ + # inline formatting + 'b', 'code', 'i', 's', 'sub', 'sup', 'u', + # block formatting + 'blockquote', 'h1', 'h2', 'h3', 'p', 'pre', + # lists + 'li', 'ol', 'ul', + # tables + 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', + # other + 'a', 'br', 'img', + # special + 'rtl', '[]' + ] + parse = { + 'a': { + ']*?href="((https?:\/\/|\/).+?)".*?>': '', + '<\/a>': '' + }, + 'img': { + ']*?src="((https?:\/\/|\/).+?)".*?>': '' + }, + 'rtl': { + '': '
', + '<\/rtl>': '
' + }, + '*': lambda tag: {'<(/?' + tag + ') ?/?>':'<{1}>'} + } + matches = [] + + #makes parse_html output the same value if run twice + html = decodeHtml(html) + + if '[]' in tags: + html = re.sub( + re.compile('\[((https?:\/\/|\/).+?) (.+?)\]', re.IGNORECASE), + '\\3', html); + tags = filter(lambda tag: tag != '[]', tags) + + def replace_match(match, value, replace): + i = 1 + for m in match.groups(): + value = value.replace('{%d}'%i, m) + i += 1 + matches.append(value) + return '\t%d\t' % len(matches) + + for tag in tags: + p = parse.get(tag, parse['*'](tag)) + for replace in p: + html = re.sub( + re.compile(replace, re.IGNORECASE), + lambda match: replace_match(match, p[replace][:], replace), + html + ) + html = escape(html) + for i in range(0, len(matches)): + html = html.replace('\t%d\t'%(i+1), matches[i]) + html = html.replace('\n\n', '

') + return sanitize_fragment(html) + +def sanitize_fragment(html): + import html5lib + return html5lib.parseFragment(html).toxml().decode('utf-8') +