replace Ox.parseEmailAddresses and Ox.parseURLs with Ox.addLinks; rename Ox.parseHTML to Ox.sanitizeHTML; add Ox.normalizeHTML; update Ox.encodeHTMLEntities and Ox.decodeHTMLEntities so that they also cover the previous Ox.encodeHTML and Ox.decodeHTML

This commit is contained in:
rolux 2012-05-27 12:36:16 +02:00
parent f94395d652
commit b93750e325

View file

@ -1,51 +1,362 @@
'use strict';
/*@
Ox.parseEmailAddresses <f> Takes HTML and turns e-mail addresses into links
> Ox.parseEmailAddresses('test@pad.ma')
'<a href="mailto:test@pad.ma">test@pad.ma</a>'
> Ox.parseEmailAddresses('foo bar <foo@bar.com>')
'foo bar <<a href="mailto:foo@bar.com">foo@bar.com</a>>'
> Ox.parseEmailAddresses('foo bar <<a href="mailto:foo@bar.com">foo@bar.com</a>>')
'foo bar <<a href="mailto:foo@bar.com">foo@bar.com</a>>'
@*/
// fixme: shouldn't this be formatEmailAddresses?
// fixme: fails for linked emails
Ox.parseEmailAddresses = function(html) {
return html.replace(
(function() {
var defaultTags = [
        // Tag names, grouped by purpose (presumably the default whitelist
        // of Ox.sanitizeHTML -- confirm against its body)
        // inline formatting
        'b', 'code', 'i', 's', 'sub', 'sup', 'u',
        // block formatting
        'blockquote', 'h1', 'h2', 'h3', 'p', 'pre',
        // lists
        'li', 'ol', 'ul',
        // tables
        'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr',
        // other
        'a', 'br', 'img',
        // special
        'rtl', '[]'
    ],
    // Characters with a named entity, used by Ox.encodeHTMLEntities and
    // (inverted) by replace.namedEntity
    htmlEntities = {
        '"': '&quot;', '&': '&amp;', "'": '&apos;', '<': '&lt;', '>': '&gt;'
    },
    // Patterns used by Ox.highlightHTML
    regexp = {
        // anything between '&' and ';' that contains no whitespace
        entity: /&[^\s]*;/g,
        // attribute-less opening, closing or self-closing inline tags
        tag: new RegExp('<\\/?(' + [
            'a', 'b', 'br', 'code', 'i', 's', 'span', 'u'
        ].join('|') + ')\\/?>', 'gi')
    },
    // [pattern, replacement] pairs. The mail, url and entity pairs are
    // passed directly to String.prototype.replace, the tag pairs are
    // rendered with Ox.formatString, so they use {n} placeholders
    replace = {
        mail: [
            /\b([0-9A-Z\.\+\-_]+@(?:[0-9A-Z\-]+\.)+[A-Z]{2,6})\b/gi,
            '<a href="mailto:$1">$1</a>'
        ],
        namedEntity: [
            new RegExp('(' + Ox.values(htmlEntities).join('|') + ')', 'g'),
            function(match) {
                return Ox.keyOf(htmlEntities, match);
            }
        ],
        numericEntity: [
            /&#([0-9A-FX]+);/gi,
            function(match, code) {
                // a leading 'x' (any case) marks a hexadecimal code
                return Ox.char(
                    /^X/i.test(code)
                    ? parseInt(code.slice(1), 16)
                    : parseInt(code, 10)
                );
            }
        ],
        tag: {
            a: [
                [
                    /<a [^<>]*?href="((https?:\/\/|\/).+?)".*?>/gi,
                    '<a href="{1}">'
                ],
                [
                    /<\/a>/gi,
                    '</a>'
                ]
            ],
            img: [
                [
                    /<img [^<>]*?src="((https?:\/\/|\/).+?)".*?>/gi,
                    // must be {1}, not $1: this string is rendered with
                    // Ox.formatString, which does not expand $n, so $1
                    // would end up in the output literally
                    '<img src="{1}">'
                ]
            ],
            rtl: [
                [
                    /<rtl>/gi,
                    '<div style="direction: rtl">'
                ],
                [
                    /<\/rtl>/gi,
                    '</div>'
                ]
            ],
            // fallback for tags without a dedicated entry: matches the bare
            // tag (no attributes) and reproduces the match unchanged
            '*': function(tag) {
                return [
                    [
                        new RegExp('</?' + tag + ' ?/?>', 'gi'),
                        '{0}'
                    ]
                ];
            }
        },
        url: [
            /\b((https?:\/\/|www\.).+?)([\.,:;!\?\)\]]*?(\s|$))/gi,
            function(string, url, prefix, end) {
                // only URLs that start with 'www.' need a protocol added to
                // the href, the link text keeps the URL as written
                prefix = prefix.toLowerCase() == 'www.' ? 'http://' : '';
                return Ox.formatString(
                    '<a href="{prefix}{url}">{url}</a>{end}',
                    {end: end, prefix: prefix, url: url}
                );
            }
        ]
    },
    // Two random 16-character strings, joined around an index to build the
    // placeholders that stand in for already sanitized tags (presumably
    // uppercase letters, assuming Ox.char is String.fromCharCode and
    // Ox.random(26) returns an integer from 0 to 25 -- confirm)
    salt = Ox.range(2).map(function(){
        return Ox.range(16).map(function() {
            return Ox.char(65 + Ox.random(26));
        }).join('');
    });
/*@
Ox.parseHTML <f> Takes HTML from an untrusted source and returns something sane
> Ox.parseHTML('http://foo.com, bar')
// Cuts a string into alternating pieces: text at even indices, tags (from
// '<' up to and including the next '>') at odd indices. A '<' whose index is
// listed in the ignore array does not open a tag and stays part of the text.
function splitHTMLTags(string, ignore) {
    var insideTag = false,
        parts = [''],
        skipped = ignore || [];
    Ox.forEach(string, function(character, index) {
        var opensTag = !insideTag && character == '<'
            && skipped.indexOf(index) == -1;
        if (opensTag) {
            // start a new (odd) piece that will hold the tag
            insideTag = true;
            parts.push('');
        }
        parts[parts.length - 1] += character;
        if (insideTag && character == '>') {
            // tag is complete, start the next (even) text piece
            insideTag = false;
            parts.push('');
        }
    });
    return parts;
};
/*@
Ox.addLinks <f> Takes a string and adds links for e-mail addresses and URLs
(string[, isHTML]) -> <s> Formatted string
string <s> String
isHTML <b|false> If true, ignore matches in tags or enclosed by links
> Ox.addLinks('foo bar <foo@bar.com>')
'foo bar &lt;<a href="mailto:foo@bar.com">foo@bar.com</a>&gt;'
> Ox.addLinks('www.foo.com/bar#baz, etc.')
'<a href="http://www.foo.com/bar#baz">www.foo.com/bar#baz</a>, etc.'
> Ox.addLinks('<a href="http://foo.com">foo.com</a>', true)
'<a href="http://foo.com">foo.com</a>'
@*/
Ox.addLinks = function(string, isHTML) {
    // true while the text piece being visited is enclosed by an <a> element
    var isLink = false;
    // Turns e-mail addresses, then URLs, into links
    function replaceString(string) {
        return string
            .replace(replace.mail[0], replace.mail[1])
            .replace(replace.url[0], replace.url[1]);
    }
    return isHTML
        // HTML: only touch text pieces (even indices) outside of links
        ? splitHTMLTags(string).map(function(string, i) {
            var isTag = i % 2;
            if (isTag) {
                // Require the tag name to be exactly 'a' (followed by
                // whitespace or '>'), in any case. A bare /^<a/ would also
                // match <abbr>, <address> or <area>, and a void <area>
                // would then disable linking for the rest of the string
                if (/^<a[\s>]/i.test(string)) {
                    isLink = true;
                } else if (/^<\/a[\s>]/i.test(string)) {
                    isLink = false;
                }
            }
            return isTag || isLink ? string : replaceString(string);
        }).join('')
        // plain text: add links, then normalize the resulting markup
        : Ox.normalizeHTML(replaceString(string));
};
/*@
Ox.encodeHTMLEntities <f> Encodes HTML entities
(string[, encodeAll]) -> <s> String
string <s> String
encodeAll <b|false> If true, encode characters > 127 as numeric entities
> Ox.encodeHTMLEntities('<\'&"> äbçdê')
'&lt;&apos;&amp;&quot;&gt; äbçdê'
> Ox.encodeHTMLEntities('<\'&"> äbçdê', true)
'&lt;&apos;&amp;&quot;&gt; &#x00E4;b&#x00E7;d&#x00EA;'
@*/
Ox.encodeHTMLEntities = function(string, encodeAll) {
    // Returns the encoded form of a single character
    function encodeCharacter(character) {
        var codeUnit = character.charCodeAt(0), hex;
        if (codeUnit < 128) {
            // ASCII: only characters listed in htmlEntities are replaced
            return character in htmlEntities
                ? htmlEntities[character] : character;
        }
        if (!encodeAll) {
            return character;
        }
        // everything else becomes a hexadecimal entity, padded to 4 digits
        hex = codeUnit.toString(16).toUpperCase();
        return '&#x' + Ox.pad(hex, 4) + ';';
    }
    return Ox.map(String(string), encodeCharacter);
};
/*@
Ox.decodeHTMLEntities <f> Decodes HTML entities
(string[, decodeAll]) -> <s> String
string <s> String
decodeAll <b|false> If true, decode named entities for characters > 127
Note that <code>decodeAll</code> relies on
<code>Ox.normalizeHTML</code>, which uses the DOM and may transform
the string
> Ox.decodeHTMLEntities('&#x003C;&#x0027;&#x0026;&#x0022;&#x003E;')
'<\'&">'
> Ox.decodeHTMLEntities('&lt;&apos;&amp;&quot;&gt;')
'<\'&">'
> Ox.decodeHTMLEntities('&#x00E4;b&#x00E7;d&#x00EA;')
'äbçdê'
> Ox.decodeHTMLEntities('&auml;b&ccedil;d&ecirc;')
'&auml;b&ccedil;d&ecirc;'
> Ox.decodeHTMLEntities('&auml;b&ccedil;d&ecirc;', true)
'äbçdê'
> Ox.decodeHTMLEntities('<b>&beta;')
'<b>&beta;'
> Ox.decodeHTMLEntities('<b>&beta;', true)
'<b>β</b>'
> Ox.decodeHTMLEntities('&lt;b&gt;')
'<b>'
@*/
Ox.decodeHTMLEntities = function(string, decodeAll) {
    var decoded;
    if (decodeAll) {
        // normalize first, then decode the result with the regular rules
        return Ox.decodeHTMLEntities(Ox.normalizeHTML(string));
    }
    // named entities first, numeric entities second
    decoded = String(string).replace(
        replace.namedEntity[0], replace.namedEntity[1]
    );
    return decoded.replace(
        replace.numericEntity[0], replace.numericEntity[1]
    );
};
/*@
Ox.highlightHTML <f> Highlight matches in string
(string, query, classname[, isHTML]) -> Output string
string <s> Input string
query <r|s> Case-insensitive query string, or regular expression
classname <s> Class name for matches
isHTML <b|false> If true, the input string is treated as HTML
> Ox.highlightHTML('<name>', 'name', 'c')
'&lt;<span class="c">name</span>&gt;'
> Ox.highlightHTML('<span class="name">name</span>', 'name', 'c', true)
'<span class="name"><span class="c">name</span></span>'
> Ox.highlightHTML('amp &amp; amp', 'amp', 'c', true)
'<span class="c">amp</span> &amp; <span class="c">amp</span>'
> Ox.highlightHTML('amp &amp; amp', 'amp & amp', 'c', true)
'<span class="c">amp &amp; amp</span>'
> Ox.highlightHTML('<b>&lt;b&gt;</b>', '<b>', 'c', true)
'<span class="c"><b>&lt;b&gt;</b></span>'
> Ox.highlightHTML('<b>&lt;b&gt;</b>', '&lt;b&gt;', 'c', true)
'<b>&lt;b&gt;</b>'
> Ox.highlightHTML('foo<b>bar</b>baz', 'foobar', 'c', true)
'<span class="c">foo<b>bar</b></span>baz'
> Ox.highlightHTML('foo<p>bar</p>baz', 'foobar', 'c', true)
'foo<p>bar</p>baz'
> Ox.highlightHTML('foo <br/>bar baz', 'foo bar', 'c', true)
'<span class="c">foo <br>bar</span> baz'
@*/
Ox.highlightHTML = function(string, query, classname, isHTML) {
// Wraps every match of query in string in <span class="classname">...</span>
// and returns the result, passed through Ox.normalizeHTML.
// string: input string; query: string (escaped and matched with the 'gi'
// flags) or regular expression (used as passed in); classname: class name
// of the inserted span; isHTML: if true, matches are searched in the text
// only (inline tags removed, entities decoded), and the removed tags and
// original entities are put back before the spans are inserted.
// cursor: offset of the current piece while walking the split string
// entities, tags: {length, position, value} records of what was replaced,
// where value is the original text and length is the length of its
// replacement
// matches: {position, value} records of the matches that were found
var cursor = 0,
entities = [],
matches = [],
re = Ox.isRegExp(query) ? query
: new RegExp(Ox.escapeRegExp(query), 'gi'),
span = ['<span class="' + classname + '">', '</span>'],
tags = [];
// Puts the original values of the given records back into string, and
// shifts or extends the recorded matches accordingly.
// NOTE(review): Ox.splice is presumably the string analogue of
// Array.prototype.splice (index, number of characters to remove, string to
// insert) -- confirm
function insert(array) {
// for each replacement
array.forEach(function(v) {
// replace the modified value with the original value
string = Ox.splice(string, v.position, v.length, v.value);
// for each match
matches.forEach(function(match) {
if (v.position < match.position) {
// replacement is before match, update match position
match.position += v.value.length - v.length;
} else if (
v.position < match.position + match.value.length
) {
// replacement is inside match, update match value
match.value = Ox.splice(
match.value, v.position - match.position, v.length,
v.value
);
}
});
});
}
if (isHTML) {
string = Ox.normalizeHTML(string)
// remove inline tags
// (length is 0 since the tag is replaced with the empty string)
.replace(regexp.tag, function(value, tag, position) {
tags.push({
length: 0, position: position, value: value
});
return '';
})
// decode html entities
// (length is the length of the decoded text that replaces the entity)
.replace(regexp.entity, function(value, position) {
var ret = Ox.decodeHTMLEntities(value, true);
entities.push({
length: ret.length, position: position, value: value
});
return ret;
});
// if decoding entities has created new tags, ignore them
splitHTMLTags(string, entities.map(function(entity) {
return entity.position;
})).forEach(function(v, i) {
if (i % 2 == 0) {
// outside tags, find matches and save position and value
// (replace is only used to iterate, its return value is discarded)
// NOTE(review): if query is a regular expression with capture groups,
// the second callback argument is the first group, not the offset --
// confirm that callers never pass such patterns
v.replace(re, function(value, position) {
matches.push(
{position: cursor + position, value: value}
);
});
}
cursor += v.length;
});
// restore entities first, then tags (the reverse of the order in which
// they were replaced above)
insert(entities);
insert(tags);
// for each match (in reverse order, so that positions are correct)
matches.reverse().forEach(function(match) {
// wrap it in a span
string = Ox.splice(
string, match.position, match.value.length,
span.join(match.value)
);
});
} else {
// plain text: wrap each match directly
string = string.replace(re, function(value) {
return span.join(value);
});
}
// if isHTML, we may have enclosed single opening or closing tags in a
// span, if not isHTML, the string may contain '<', '>' or '&', so in
// both cases, we have to normalize
return Ox.normalizeHTML(string);
};
/*@
Ox.normalizeHTML <f> Normalize HTML (using the DOM)
> Ox.normalizeHTML('<b>foo')
'<b>foo</b>'
> Ox.normalizeHTML('<b>foo</b></b>')
'<b>foo</b>'
> Ox.normalizeHTML('&lt;&apos;&amp;&quot;&gt; &#x00E4;b&#x00E7;d&#x00EA;')
'&lt;\'&amp;"&gt; äbçdê'
@*/
Ox.normalizeHTML = function(html) {
    // Write the markup into a fresh <div> wrapper, then read it back, so
    // that the result is whatever the wrapper's html() getter reports
    var $filled = Ox.$('<div>').html(html);
    return $filled.html();
};
/*@
Ox.sanitizeHTML <f> Takes untrusted HTML and returns something trustworthy
> Ox.sanitizeHTML('http://foo.com, bar')
'<a href="http://foo.com">http://foo.com</a>, bar'
> Ox.parseHTML('http://foo.com/foobar?foo, bar')
> Ox.sanitizeHTML('http://foo.com/foobar?foo, bar')
'<a href="http://foo.com/foobar?foo">http://foo.com/foobar?foo</a>, bar'
> Ox.parseHTML('(see: www.foo.com)')
> Ox.sanitizeHTML('(see: www.foo.com)')
'(see: <a href="http://www.foo.com">www.foo.com</a>)'
> Ox.parseHTML('foo@bar.com')
> Ox.sanitizeHTML('foo@bar.com')
'<a href="mailto:foo@bar.com">foo@bar.com</a>'
> Ox.parseHTML('<a href="http://foo.com" onmouseover="alert()">foo</a>')
> Ox.sanitizeHTML('<a href="http://foo.com" onclick="alert()">foo</a>')
'<a href="http://foo.com">foo</a>'
> Ox.parseHTML('<a href="javascript:alert()">foo</a>')
> Ox.sanitizeHTML('<a href="javascript:alert()">foo</a>')
'&lt;a href="javascript:alert()"&gt;foo'
> Ox.parseHTML('[http://foo.com foo]')
> Ox.sanitizeHTML('[http://foo.com foo]')
'<a href="http://foo.com">foo</a>'
> Ox.parseHTML('<rtl>foo</rtl>')
> Ox.sanitizeHTML('<rtl>foo</rtl>')
'<div style="direction: rtl">foo</div>'
> Ox.parseHTML('<script>alert()</script>')
> Ox.sanitizeHTML('<script>alert()</script>')
'&lt;script&gt;alert()&lt;/script&gt;'
> Ox.parseHTML('\'foo\' < \'bar\' && "foo" > "bar"')
> Ox.sanitizeHTML('\'foo\' < \'bar\' && "foo" > "bar"')
'\'foo\' &lt; \'bar\' &amp;&amp; "foo" &gt; "bar"'
> Ox.parseHTML('<b>foo')
> Ox.sanitizeHTML('<b>foo')
'<b>foo</b>'
> Ox.parseHTML('<b>foo</b></b>')
> Ox.sanitizeHTML('<b>foo</b></b>')
'<b>foo</b>'
@*/
Ox.parseHTML = (function() {
@*/
Ox.sanitizeHTML = (function() {
var defaultTags = [
// inline formatting
'b', 'code', 'i', 's', 'sub', 'sup', 'u',
@ -90,87 +401,23 @@ Ox.parseHTML = (function() {
});
}
tags.forEach(function(tag) {
var p = parse[tag] || parse['*'](tag);
Ox.forEach(p, function(replace, regexp) {
html = html.replace(new RegExp(regexp, 'gi'), function() {
matches.push(Ox.formatString(replace, arguments));
return tab + (matches.length - 1) + tab;
var array = replace.tag[tag] || replace.tag['*'](tag);
Ox.forEach(array, function(value) {
html = html.replace(value[0], function() {
matches.push(Ox.formatString(value[1], arguments));
return salt.join(matches.length - 1);
});
});
});
html = Ox.encodeHTML(html);
//fixme: both fail if urls/emails are already links
//html = Ox.parseURLs(html);
//html = Ox.parseEmailAddresses(html);
html = Ox.addLinks(Ox.encodeHTMLEntities(html));
matches.forEach(function(match, i) {
html = html.replace(new RegExp(tab + i + tab), match);
html = html.replace(new RegExp(salt.join(i)), match);
});
html = html.replace(/\n\n/g, '<br/><br/>');
// close extra opening (and remove extra closing) tags
// note: this converts '&quot;' to '"'
return Ox.element('<div>').html(html).html();
// Close extra opening and remove extra closing tags.
// Note: this converts '&apos;' to "'" and '&quot;' to '"'
return Ox.normalizeHTML(html);
};
}());
}());
/*@
Ox.parseURL <f> Takes a URL, returns its components
(url) -> <o> URL components
url <s> URL
<script>
Ox.test.object = Ox.parseURL('http://www.foo.com:8080/bar/index.html?a=0&b=1#c');
</script>
> Ox.test.object.hash
'#c'
> Ox.test.object.host
'www.foo.com:8080'
> Ox.test.object.hostname
'www.foo.com'
> Ox.test.object.origin
'http://www.foo.com:8080'
> Ox.test.object.pathname
'/bar/index.html'
> Ox.test.object.port
'8080'
> Ox.test.object.protocol
'http:'
> Ox.test.object.search
'?a=0&b=1'
@*/
Ox.parseURL = (function() {
    // One anchor element is created up front and reused by every call; the
    // components are read from it after its href has been assigned
    var anchor = document.createElement('a'),
        components = [
            'hash', 'host', 'hostname', 'origin',
            'pathname', 'port', 'protocol', 'search'
        ];
    return function(string) {
        var index, name, result = {};
        anchor.href = string;
        // copy each component from the anchor to a plain object
        for (index = 0; index < components.length; index++) {
            name = components[index];
            result[name] = anchor[name];
        }
        return result;
    };
}());
/*@
Ox.parseURLs <f> Takes HTML and turns URLs into links
> Ox.parseURLs('http://foo.com, bar')
'<a href="http://foo.com">http://foo.com</a>, bar'
> Ox.parseURLs('http://foo.com/foobar?foo, bar')
'<a href="http://foo.com/foobar?foo">http://foo.com/foobar?foo</a>, bar'
> Ox.parseURLs('www.foo.com, bar')
'<a href="http://www.foo.com">www.foo.com</a>, bar'
> Ox.parseURLs('<a href="http://foo.com">http://foo.com</a> etc')
'<a href="http://foo.com">http://foo.com</a> etc'
@*/
// fixme: shouldn't this be formatURLs?
// fixme: fails for urls inside links
Ox.parseURLs = function(html) {
    // Wraps each URL (starting with http://, https:// or www.) in a link.
    // Trailing punctuation and the whitespace after it (end) stay outside
    // of the link.
    return html.replace(
        /\b((https?:\/\/|www\.).+?)([\.,:;!\?\)\]]*?(\s|$))/gi,
        function(string, url, prefix, end) {
            // The protocol is only added to the href, so that the link
            // text remains the URL as written ('www.foo.com'). The pattern
            // is case-insensitive, so compare the prefix in lower case
            prefix = prefix.toLowerCase() == 'www.' ? 'http://' : '';
            return Ox.formatString(
                '<a href="{prefix}{url}">{url}</a>{end}',
                {end: end, prefix: prefix, url: url}
            );
        }
    );
};