parse_html->sanitize_html, add links, allow mailto
This commit is contained in:
parent
e556447d1b
commit
f6b5d6bde8
1 changed file with 22 additions and 19 deletions
39
ox/html.py
39
ox/html.py
|
@ -164,7 +164,7 @@ def decodeHtml(html):
|
|||
return match.group(0)
|
||||
return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ')
|
||||
|
||||
decode_hmtl = decodeHtml
|
||||
decode_html = decodeHtml
|
||||
|
||||
def highlight(text, query, hlClass="hl"):
|
||||
"""
|
||||
|
@ -187,35 +187,37 @@ def escape_html(value):
|
|||
>>> escape_html(u'<script> foo')
|
||||
u'&lt;script&gt; foo'
|
||||
'''
|
||||
return escape(decodeHtml(value))
|
||||
return escape(decode_html(value))
|
||||
|
||||
def parse_html(html, tags=None, wikilinks=False):
|
||||
def sanitize_html(html, tags=None, wikilinks=False):
|
||||
'''
|
||||
>>> parse_html('http://foo.com, bar')
|
||||
>>> sanitize_html('http://foo.com, bar')
|
||||
'<a href="http://foo.com">http://foo.com</a>, bar'
|
||||
>>> parse_html('http://foo.com/foobar?foo, bar')
|
||||
>>> sanitize_html('http://foo.com/foobar?foo, bar')
|
||||
'<a href="http://foo.com/foobar?foo">http://foo.com/foobar?foo</a>, bar'
|
||||
>>> parse_html('(see: www.foo.com)')
|
||||
>>> sanitize_html('(see: www.foo.com)')
|
||||
'(see: <a href="http://www.foo.com">www.foo.com</a>)'
|
||||
>>> parse_html('foo@bar.com')
|
||||
>>> sanitize_html('foo@bar.com')
|
||||
'<a href="mailto:foo@bar.com">foo@bar.com</a>'
|
||||
>>> parse_html('<a href="http://foo.com" onmouseover="alert()">foo</a>')
|
||||
>>> sanitize_html(sanitize_html('foo@bar.com'))
|
||||
'<a href="mailto:foo@bar.com">foo@bar.com</a>'
|
||||
>>> sanitize_html('<a href="http://foo.com" onmouseover="alert()">foo</a>')
|
||||
'<a href="http://foo.com">foo</a>'
|
||||
>>> parse_html('<a href="javascript:alert()">foo</a>')
|
||||
>>> sanitize_html('<a href="javascript:alert()">foo</a>')
|
||||
'&lt;a href="javascript:alert()"&gt;foo'
|
||||
>>> parse_html('[http://foo.com foo]')
|
||||
>>> sanitize_html('[http://foo.com foo]')
|
||||
'<a href="http://foo.com">foo</a>'
|
||||
>>> parse_html('<rtl>foo</rtl>')
|
||||
>>> sanitize_html('<rtl>foo</rtl>')
|
||||
'<div style="direction: rtl">foo</div>'
|
||||
>>> parse_html('<script>alert()</script>')
|
||||
>>> sanitize_html('<script>alert()</script>')
|
||||
'&lt;script&gt;alert()&lt;/script&gt;'
|
||||
>>> parse_html('\'foo\' < \'bar\' && "foo" > "bar"')
|
||||
>>> sanitize_html('\'foo\' < \'bar\' && "foo" > "bar"')
|
||||
'\'foo\' &lt; \'bar\' &amp;&amp; "foo" &gt; "bar"'
|
||||
>>> parse_html('<b>foo')
|
||||
>>> sanitize_html('<b>foo')
|
||||
'<b>foo</b>'
|
||||
>>> parse_html('<b>foo</b></b>')
|
||||
>>> sanitize_html('<b>foo</b></b>')
|
||||
'<b>foo</b>'
|
||||
>>> parse_html('Anniversary of Daoud&apos;s Republic')
|
||||
>>> sanitize_html('Anniversary of Daoud&apos;s Republic')
|
||||
'Anniversary of Daoud's Republic'
|
||||
'''
|
||||
if not tags:
|
||||
|
@ -235,7 +237,7 @@ def parse_html(html, tags=None, wikilinks=False):
|
|||
]
|
||||
parse = {
|
||||
'a': {
|
||||
'<a [^<>]*?href="((https?:\/\/|\/).+?)".*?>': '<a href="{1}">',
|
||||
'<a [^<>]*?href="((https?:\/\/|\/|mailto:).+?)".*?>': '<a href="{1}">',
|
||||
'<\/a>': '</a>'
|
||||
},
|
||||
'img': {
|
||||
|
@ -250,7 +252,7 @@ def parse_html(html, tags=None, wikilinks=False):
|
|||
matches = []
|
||||
|
||||
#makes sanitize_html output the same value if run twice
|
||||
html = decodeHtml(html)
|
||||
html = decode_html(html)
|
||||
|
||||
if '[]' in tags:
|
||||
html = re.sub(
|
||||
|
@ -278,6 +280,7 @@ def parse_html(html, tags=None, wikilinks=False):
|
|||
for i in range(0, len(matches)):
|
||||
html = html.replace('\t%d\t'%(i+1), matches[i])
|
||||
html = html.replace('\n\n', '<br/><br/>')
|
||||
html = urlize(html)
|
||||
return sanitize_fragment(html)
|
||||
|
||||
def sanitize_fragment(html):
|
||||
|
|
Loading…
Reference in a new issue