Rename parse_html to sanitize_html, auto-link URLs via urlize, and allow mailto: links

This commit is contained in:
j 2012-05-27 13:38:58 +02:00
parent e556447d1b
commit f6b5d6bde8

View file

@ -164,7 +164,7 @@ def decodeHtml(html):
return match.group(0) return match.group(0)
return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ') return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ')
decode_hmtl = decodeHtml decode_html = decodeHtml
def highlight(text, query, hlClass="hl"): def highlight(text, query, hlClass="hl"):
""" """
@ -187,35 +187,37 @@ def escape_html(value):
>>> escape_html(u'<script> foo') >>> escape_html(u'<script> foo')
u'&lt;script&gt; foo' u'&lt;script&gt; foo'
''' '''
return escape(decodeHtml(value)) return escape(decode_html(value))
def parse_html(html, tags=None, wikilinks=False): def sanitize_html(html, tags=None, wikilinks=False):
''' '''
>>> parse_html('http://foo.com, bar') >>> sanitize_html('http://foo.com, bar')
'<a href="http://foo.com">http://foo.com</a>, bar' '<a href="http://foo.com">http://foo.com</a>, bar'
>>> parse_html('http://foo.com/foobar?foo, bar') >>> sanitize_html('http://foo.com/foobar?foo, bar')
'<a href="http://foo.com/foobar?foo">http://foo.com/foobar?foo</a>, bar' '<a href="http://foo.com/foobar?foo">http://foo.com/foobar?foo</a>, bar'
>>> parse_html('(see: www.foo.com)') >>> sanitize_html('(see: www.foo.com)')
'(see: <a href="http://www.foo.com">www.foo.com</a>)' '(see: <a href="http://www.foo.com">www.foo.com</a>)'
>>> parse_html('foo@bar.com') >>> sanitize_html('foo@bar.com')
'<a href="mailto:foo@bar.com">foo@bar.com</a>' '<a href="mailto:foo@bar.com">foo@bar.com</a>'
>>> parse_html('<a href="http://foo.com" onmouseover="alert()">foo</a>') >>> sanitize_html(sanitize_html('foo@bar.com'))
'<a href="mailto:foo@bar.com">foo@bar.com</a>'
>>> sanitize_html('<a href="http://foo.com" onmouseover="alert()">foo</a>')
'<a href="http://foo.com">foo</a>' '<a href="http://foo.com">foo</a>'
>>> parse_html('<a href="javascript:alert()">foo</a>') >>> sanitize_html('<a href="javascript:alert()">foo</a>')
'&lt;a href="javascript:alert()"&gt;foo' '&lt;a href="javascript:alert()"&gt;foo'
>>> parse_html('[http://foo.com foo]') >>> sanitize_html('[http://foo.com foo]')
'<a href="http://foo.com">foo</a>' '<a href="http://foo.com">foo</a>'
>>> parse_html('<rtl>foo</rtl>') >>> sanitize_html('<rtl>foo</rtl>')
'<div style="direction: rtl">foo</div>' '<div style="direction: rtl">foo</div>'
>>> parse_html('<script>alert()</script>') >>> sanitize_html('<script>alert()</script>')
'&lt;script&gt;alert()&lt;/script&gt;' '&lt;script&gt;alert()&lt;/script&gt;'
>>> parse_html('\'foo\' < \'bar\' && "foo" > "bar"') >>> sanitize_html('\'foo\' < \'bar\' && "foo" > "bar"')
'\'foo\' &lt; \'bar\' &amp;&amp; "foo" &gt; "bar"' '\'foo\' &lt; \'bar\' &amp;&amp; "foo" &gt; "bar"'
>>> parse_html('<b>foo') >>> sanitize_html('<b>foo')
'<b>foo</b>' '<b>foo</b>'
>>> parse_html('<b>foo</b></b>') >>> sanitize_html('<b>foo</b></b>')
'<b>foo</b>' '<b>foo</b>'
>>> parse_html('Anniversary of Daoud&apos;s Republic') >>> sanitize_html('Anniversary of Daoud&apos;s Republic')
'Anniversary of Daoud&apos;s Republic' 'Anniversary of Daoud&apos;s Republic'
''' '''
if not tags: if not tags:
@ -235,7 +237,7 @@ def parse_html(html, tags=None, wikilinks=False):
] ]
parse = { parse = {
'a': { 'a': {
'<a [^<>]*?href="((https?:\/\/|\/).+?)".*?>': '<a href="{1}">', '<a [^<>]*?href="((https?:\/\/|\/|mailto:).+?)".*?>': '<a href="{1}">',
'<\/a>': '</a>' '<\/a>': '</a>'
}, },
'img': { 'img': {
@ -250,7 +252,7 @@ def parse_html(html, tags=None, wikilinks=False):
matches = [] matches = []
#makes parse_html output the same value if run twice #makes parse_html output the same value if run twice
html = decodeHtml(html) html = decode_html(html)
if '[]' in tags: if '[]' in tags:
html = re.sub( html = re.sub(
@ -278,6 +280,7 @@ def parse_html(html, tags=None, wikilinks=False):
for i in range(0, len(matches)): for i in range(0, len(matches)):
html = html.replace('\t%d\t'%(i+1), matches[i]) html = html.replace('\t%d\t'%(i+1), matches[i])
html = html.replace('\n\n', '<br/><br/>') html = html.replace('\n\n', '<br/><br/>')
html = urlize(html)
return sanitize_fragment(html) return sanitize_fragment(html)
def sanitize_fragment(html): def sanitize_fragment(html):