parse_html->sanitize_html, add links, allow mailto
commit f6b5d6bde8
parent e556447d1b
1 changed file with 22 additions and 19 deletions
ox/html.py: 41 changed lines (+22, −19)
@@ -164,7 +164,7 @@ def decodeHtml(html):
             return match.group(0)
     return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ')
 
-decode_hmtl = decodeHtml
+decode_html = decodeHtml
 
 def highlight(text, query, hlClass="hl"):
     """
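The misspelled module-level alias decode_hmtl is corrected to decode_html. A minimal sketch of what the fixed alias gives callers (assuming ox is importable; based only on the assignment shown above):

    # decode_html is simply another name bound to decodeHtml,
    # so either spelling decodes character references.
    from ox.html import decode_html, decodeHtml

    assert decode_html is decodeHtml
    print(decode_html('a &amp; b &#38; c'))   # expected: 'a & b & c'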
@@ -187,35 +187,37 @@ def escape_html(value):
     >>> escape_html(u'<script> foo')
     u'&lt;script&gt; foo'
     '''
-    return escape(decodeHtml(value))
+    return escape(decode_html(value))
 
-def parse_html(html, tags=None, wikilinks=False):
+def sanitize_html(html, tags=None, wikilinks=False):
     '''
-    >>> parse_html('http://foo.com, bar')
+    >>> sanitize_html('http://foo.com, bar')
     '<a href="http://foo.com">http://foo.com</a>, bar'
-    >>> parse_html('http://foo.com/foobar?foo, bar')
+    >>> sanitize_html('http://foo.com/foobar?foo, bar')
     '<a href="http://foo.com/foobar?foo">http://foo.com/foobar?foo</a>, bar'
-    >>> parse_html('(see: www.foo.com)')
+    >>> sanitize_html('(see: www.foo.com)')
     '(see: <a href="http://www.foo.com">www.foo.com</a>)'
-    >>> parse_html('foo@bar.com')
+    >>> sanitize_html('foo@bar.com')
     '<a href="mailto:foo@bar.com">foo@bar.com</a>'
-    >>> parse_html('<a href="http://foo.com" onmouseover="alert()">foo</a>')
+    >>> sanitize_html(sanitize_html('foo@bar.com'))
+    '<a href="mailto:foo@bar.com">foo@bar.com</a>'
+    >>> sanitize_html('<a href="http://foo.com" onmouseover="alert()">foo</a>')
     '<a href="http://foo.com">foo</a>'
-    >>> parse_html('<a href="javascript:alert()">foo</a>')
+    >>> sanitize_html('<a href="javascript:alert()">foo</a>')
     '&lt;a href="javascript:alert()"&gt;foo'
-    >>> parse_html('[http://foo.com foo]')
+    >>> sanitize_html('[http://foo.com foo]')
     '<a href="http://foo.com">foo</a>'
-    >>> parse_html('<rtl>foo</rtl>')
+    >>> sanitize_html('<rtl>foo</rtl>')
     '<div style="direction: rtl">foo</div>'
-    >>> parse_html('<script>alert()</script>')
+    >>> sanitize_html('<script>alert()</script>')
     '&lt;script&gt;alert()&lt;/script&gt;'
-    >>> parse_html('\'foo\' < \'bar\' && "foo" > "bar"')
+    >>> sanitize_html('\'foo\' < \'bar\' && "foo" > "bar"')
     '\'foo\' &lt; \'bar\' &amp;&amp; "foo" &gt; "bar"'
-    >>> parse_html('<b>foo')
+    >>> sanitize_html('<b>foo')
     '<b>foo</b>'
-    >>> parse_html('<b>foo</b></b>')
+    >>> sanitize_html('<b>foo</b></b>')
     '<b>foo</b>'
-    >>> parse_html('Anniversary of Daoud&#039;s Republic')
+    >>> sanitize_html('Anniversary of Daoud&#039;s Republic')
     "Anniversary of Daoud's Republic"
     '''
     if not tags:
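With the rename, the sanitizer's public behaviour is spelled out in the doctests above: bare URLs and e-mail addresses become links, unsafe markup is escaped, and a second pass is a no-op. A short usage sketch taken directly from those doctests (assuming ox.html is importable; whether parse_html is kept as a backwards-compatible alias is not shown in this diff):

    from ox.html import sanitize_html

    print(sanitize_html('(see: www.foo.com)'))
    # '(see: <a href="http://www.foo.com">www.foo.com</a>)'
    print(sanitize_html('foo@bar.com'))
    # '<a href="mailto:foo@bar.com">foo@bar.com</a>'

    once = sanitize_html('foo@bar.com')
    assert sanitize_html(once) == once   # idempotent, per the new doctest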
@@ -234,8 +236,8 @@ def parse_html(html, tags=None, wikilinks=False):
             'rtl', '[]'
         ]
     parse = {
         'a': {
-            '<a [^<>]*?href="((https?:\/\/|\/).+?)".*?>': '<a href="{1}">',
+            '<a [^<>]*?href="((https?:\/\/|\/|mailto:).+?)".*?>': '<a href="{1}">',
             '<\/a>': '</a>'
         },
         'img': {
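The anchor rule now whitelists mailto: hrefs alongside http(s) and site-relative URLs, so e-mail links written as markup are rewritten instead of escaped. A standalone sketch of what the new pattern accepts (the {1} substitution is done by the module's own replace loop, which this diff does not show):

    import re

    # Same href pattern as the new line above: http(s)://, site-relative and mailto: URLs pass.
    href = re.compile(r'<a [^<>]*?href="((https?:\/\/|\/|mailto:).+?)".*?>')

    m = href.match('<a href="mailto:foo@bar.com" onmouseover="alert()">')
    print(m.group(1))                                    # 'mailto:foo@bar.com' - the whitelisted href is captured
    print(href.match('<a href="javascript:alert()">'))   # None - not whitelisted, so the tag gets escaped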
@@ -250,7 +252,7 @@ def parse_html(html, tags=None, wikilinks=False):
     matches = []
 
     #makes parse_html output the same value if run twice
-    html = decodeHtml(html)
+    html = decode_html(html)
 
     if '[]' in tags:
         html = re.sub(
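The unchanged comment explains why entities are decoded up front: it keeps the function idempotent, which the new sanitize_html(sanitize_html(...)) doctest now checks. A rough plain-Python illustration of the double-escaping that the decode step avoids (not the module's code):

    # Escaping an already-escaped string compounds the entities:
    escaped_once = 'a &amp; b'
    escaped_twice = escaped_once.replace('&', '&amp;')   # 'a &amp;amp; b' - double-escaped
    # Decoding first ('a &amp; b' -> 'a & b') and then re-escaping yields 'a &amp; b' again,
    # which is why running the sanitizer twice returns the same value as running it once.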
@@ -278,6 +280,7 @@ def parse_html(html, tags=None, wikilinks=False):
     for i in range(0, len(matches)):
         html = html.replace('\t%d\t'%(i+1), matches[i])
     html = html.replace('\n\n', '<br/><br/>')
+    html = urlize(html)
     return sanitize_fragment(html)
 
 def sanitize_fragment(html):
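urlize is not defined anywhere in this diff; from the new doctests it is presumably the helper that wraps bare URLs and e-mail addresses in anchor tags before the fragment is sanitized. A hedged, standalone sketch of that kind of transformation (hypothetical urlize_sketch, not the module's actual implementation):

    import re

    def urlize_sketch(text):
        """Illustrative only: wrap bare www. URLs and e-mail addresses in <a> tags."""
        text = re.sub(r'\b(www\.[\w./?=&-]+)', r'<a href="http://\1">\1</a>', text)
        text = re.sub(r'\b([\w.+-]+@[\w-]+\.[\w.-]+)\b', r'<a href="mailto:\1">\1</a>', text)
        return text

    print(urlize_sketch('(see: www.foo.com)'))
    # '(see: <a href="http://www.foo.com">www.foo.com</a>)' - same shape as the doctest above
    print(urlize_sketch('foo@bar.com'))
    # '<a href="mailto:foo@bar.com">foo@bar.com</a>'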