escape_html/parse_html
This commit is contained in:
parent
52afe24428
commit
60839de99c
1 changed file with 103 additions and 1 deletion
104
ox/html.py
104
ox/html.py
|
@ -34,7 +34,7 @@ def escape(html):
|
||||||
'''
|
'''
|
||||||
if not isinstance(html, basestring):
|
if not isinstance(html, basestring):
|
||||||
html = str(html)
|
html = str(html)
|
||||||
return html.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"').replace("'", ''')
|
return html.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"').replace("'", ''')
|
||||||
|
|
||||||
def linebreaks(value):
|
def linebreaks(value):
|
||||||
'''
|
'''
|
||||||
|
@ -174,3 +174,105 @@ def highlight(text, query, hlClass="hl"):
|
||||||
text = text.replace('|', '<br />')
|
text = text.replace('|', '<br />')
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
def escape_html(value):
    '''
    Escape value for HTML output without escaping it twice.

    value is first passed through decodeHtml() and the result through
    escape(). Decoding first is what keeps the function stable when it is
    applied to text that has already been escaped: the second example
    below returns its input unchanged rather than turning '&lt;' into
    '&amp;lt;'.

    >>> escape_html(u'<script> foo')
    u'&lt;script&gt; foo'
    >>> escape_html(u'&lt;script&gt; foo')
    u'&lt;script&gt; foo'
    '''
    return escape(decodeHtml(value))
|
||||||
|
|
||||||
|
def parse_html(html, tags=None, wikilinks=False):
    '''
    Reduce html to a whitelist of tags and escape everything else.

    Processing steps:

    1. html is run through decodeHtml() so that feeding the output of
       parse_html back in gives the same result.
    2. If '[]' is in tags, '[<url> <text>]' (url starting with http://,
       https:// or /) is rewritten to '<a href="<url>"><text></a>'.
    3. For every tag in tags, each occurrence is rewritten to a canonical
       form, stored, and replaced by a placeholder. 'a' keeps only href,
       'img' keeps only src (both limited to http://, https:// or
       root-relative URLs), 'rtl' becomes a div with direction: rtl, and
       any other tag is kept as a bare '<tag>' / '</tag>'.
    4. Whatever is left is passed through escape(), so markup that is not
       whitelisted (for example '<script>', or an anchor with a
       'javascript:' href) ends up as escaped text.
    5. The placeholders are swapped back for the stored markup, blank
       lines become '<br/><br/>', and the result goes through
       sanitize_fragment().

    html  -- markup to clean up
    tags  -- list of allowed tag names; '[]' enables the bracket link
             syntax of step 2. A falsy value selects the default list.
    wikilinks -- accepted for compatibility; not used by this function.

    >>> parse_html('<a href="http://foo.com" onmouseover="alert()">foo</a>')
    '<a href="http://foo.com">foo</a>'
    >>> parse_html('[http://foo.com foo]')
    '<a href="http://foo.com">foo</a>'
    >>> parse_html('<rtl>foo</rtl>')
    '<div style="direction: rtl">foo</div>'
    >>> parse_html('<b>foo')
    '<b>foo</b>'
    >>> parse_html('<b>foo</b></b>')
    '<b>foo</b>'

    NOTE(review): earlier revisions of this docstring also listed examples
    in which bare URLs ('http://foo.com, bar', 'www.foo.com') and e-mail
    addresses are turned into links. Nothing in this function does that,
    so those examples are omitted here -- TODO confirm whether
    linkification is expected to happen in this function.
    '''
    if not tags:
        tags = [
            # inline formatting
            'b', 'code', 'i', 's', 'sub', 'sup', 'u',
            # block formatting
            'blockquote', 'h1', 'h2', 'h3', 'p', 'pre',
            # lists
            'li', 'ol', 'ul',
            # tables
            'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr',
            # other
            'a', 'br', 'img',
            # special
            'rtl', '[]'
        ]
    # Per-tag rewrite rules: regex -> template, '{n}' refers to group n.
    # Raw strings keep the patterns byte-identical while avoiding invalid
    # escape sequences such as '\/' in ordinary string literals.
    parse = {
        'a': {
            r'<a [^<>]*?href="((https?:\/\/|\/).+?)".*?>': '<a href="{1}">',
            r'<\/a>': '</a>'
        },
        'img': {
            r'<img [^<>]*?src="((https?:\/\/|\/).+?)".*?>': '<img src="{1}">'
        },
        'rtl': {
            r'<rtl>': '<div style="direction: rtl">',
            r'<\/rtl>': '</div>'
        },
        '*': lambda tag: {'<(/?' + tag + ') ?/?>': '<{1}>'}
    }
    # Canonical markup that has been taken out of html; entry i is
    # referenced by the placeholder carrying the number i + 1.
    matches = []
    placeholder = re.compile(r'\x00([1-9][0-9]*)\x00')

    def restore(text):
        # Single pass: markup that has been put back is never rescanned,
        # so placeholder-like text inside a stored URL cannot pull in
        # another stored tag.
        return placeholder.sub(
            lambda ref: matches[int(ref.group(1)) - 1], text)

    def stash(match, template):
        groups = match.groups()

        def group_for(ref):
            index = int(ref.group(1)) - 1
            if index < len(groups):
                return groups[index]
            # no such group: leave the reference as it is
            return ref.group(0)

        # Fill all '{n}' references in one pass so that a captured value
        # which itself contains '{2}' is not rewritten afterwards.
        markup = re.sub(r'\{([1-9][0-9]*)\}', group_for, template)
        # A captured value may contain placeholders of tags handled
        # earlier; resolve them now so stored markup is always final.
        matches.append(restore(markup))
        return '\x00%d\x00' % len(matches)

    # makes parse_html output the same value if run twice
    html = decodeHtml(html)
    # NUL delimits the placeholders; removing it from the input guarantees
    # that user text can never be mistaken for one.
    html = html.replace('\x00', '')

    if '[]' in tags:
        html = re.sub(
            re.compile(r'\[((https?:\/\/|\/).+?) (.+?)\]', re.IGNORECASE),
            '<a href="\\1">\\3</a>', html)
        tags = [tag for tag in tags if tag != '[]']

    for tag in tags:
        rules = parse.get(tag, parse['*'](tag))
        for pattern, template in rules.items():
            html = re.sub(
                re.compile(pattern, re.IGNORECASE),
                lambda match, template=template: stash(match, template),
                html
            )
    # Only text outside the stashed tags is still present here.
    html = escape(html)
    html = restore(html)
    html = html.replace('\n\n', '<br/><br/>')
    return sanitize_fragment(html)
|
||||||
|
|
||||||
|
def sanitize_fragment(html):
    '''
    Round-trip html through html5lib's fragment parser.

    The fragment is parsed with html5lib.parseFragment(), serialised with
    the resulting tree's toxml() method and the serialised value is
    decoded from UTF-8 before being returned.
    '''
    # imported here so html5lib is only needed when this function is used
    import html5lib

    fragment = html5lib.parseFragment(html)
    serialized = fragment.toxml()
    return serialized.decode('utf-8')
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue