From b93750e32538ec1e6649ac0f7beaec7b9bca0de4 Mon Sep 17 00:00:00 2001 From: rolux Date: Sun, 27 May 2012 12:36:16 +0200 Subject: [PATCH] replace Ox.parseEmailAddresses and Ox.parseURLs with Ox.addLinks; rename Ox.parseHTML to Ox.sanitizeHTML; add Ox.normalizeHTML; update Ox.encodeHTMLEntities and Ox.decodeHTMLEntities so that they also cover the previous Ox.encodeHTML and Ox.decodeHTML --- source/Ox/js/HTML.js | 549 +++++++++++++++++++++++++++++++------------ 1 file changed, 398 insertions(+), 151 deletions(-) diff --git a/source/Ox/js/HTML.js b/source/Ox/js/HTML.js index deca8c05..4735a144 100644 --- a/source/Ox/js/HTML.js +++ b/source/Ox/js/HTML.js @@ -1,51 +1,7 @@ 'use strict'; -/*@ -Ox.parseEmailAddresses Takes HTML and turns e-mail addresses into links - > Ox.parseEmailAddresses('test@pad.ma') - 'test@pad.ma' - > Ox.parseEmailAddresses('foo bar ') - 'foo bar <foo@bar.com>' - > Ox.parseEmailAddresses('foo bar <foo@bar.com>') - 'foo bar <foo@bar.com>' -@*/ -// fixme: shouldn't this be formatEmailAddresses? -// fixme: fails for linked emails -Ox.parseEmailAddresses = function(html) { - return html.replace( - /\b([0-9A-Z\.\+\-_]+@(?:[0-9A-Z\-]+\.)+[A-Z]{2,6})\b/gi, - '$1' - ); -}; +(function() { -/*@ -Ox.parseHTML Takes HTML from an untrusted source and returns something sane - > Ox.parseHTML('http://foo.com, bar') - 'http://foo.com, bar' - > Ox.parseHTML('http://foo.com/foobar?foo, bar') - 'http://foo.com/foobar?foo, bar' - > Ox.parseHTML('(see: www.foo.com)') - '(see: www.foo.com)' - > Ox.parseHTML('foo@bar.com') - 'foo@bar.com' - > Ox.parseHTML('foo') - 'foo' - > Ox.parseHTML('foo') - '<a href="javascript:alert()">foo' - > Ox.parseHTML('[http://foo.com foo]') - 'foo' - > Ox.parseHTML('foo') - '
foo
' - > Ox.parseHTML('') - '<script>alert()</script>' - > Ox.parseHTML('\'foo\' < \'bar\' && "foo" > "bar"') - '\'foo\' < \'bar\' && "foo" > "bar"' - > Ox.parseHTML('foo') - 'foo' - > Ox.parseHTML('foo') - 'foo' -@*/ -Ox.parseHTML = (function() { var defaultTags = [ // inline formatting 'b', 'code', 'i', 's', 'sub', 'sup', 'u', @@ -60,117 +16,408 @@ Ox.parseHTML = (function() { // special 'rtl', '[]' ], - parse = { - a: { - ']*?href="((https?:\/\/|\/).+?)".*?>': '', - '<\/a>': '' - }, - img: { - ']*?src="((https?:\/\/|\/).+?)".*?>': '' - }, - rtl: { - '': '
', - '<\/rtl>': '
' - }, - '*': function(tag) { - var ret = {}; - ret['<(/?' + tag + ') ?/?>'] = '<{1}>'; - return ret; - } + htmlEntities = { + '"': '"', '&': '&', "'": ''', '<': '<', '>': '>' }, - tab = '\t'; - return function(html, tags, wikilinks) { - var matches = []; - tags = tags || defaultTags; - // html = Ox.clean(html); fixme: can this be a parameter? - if (tags.indexOf('[]') > -1) { - html = html.replace(/\[((https?:\/\/|\/).+?) (.+?)\]/gi, '$3'); - tags = tags.filter(function(tag) { - return tag != '[]'; - }); - } - tags.forEach(function(tag) { - var p = parse[tag] || parse['*'](tag); - Ox.forEach(p, function(replace, regexp) { - html = html.replace(new RegExp(regexp, 'gi'), function() { - matches.push(Ox.formatString(replace, arguments)); - return tab + (matches.length - 1) + tab; - }); - }); + regexp = { + entity: /&[^\s]*;/g, + tag: new RegExp('<\\/?(' + [ + 'a', 'b', 'br', 'code', 'i', 's', 'span', 'u' + ].join('|') + ')\\/?>', 'gi') + }, + replace = { + mail: [ + /\b([0-9A-Z\.\+\-_]+@(?:[0-9A-Z\-]+\.)+[A-Z]{2,6})\b/gi, + '$1' + ], + namedEntity: [ + new RegExp('(' + Ox.values(htmlEntities).join('|') + ')', 'g'), + function(match) { + return Ox.keyOf(htmlEntities, match); + } + ], + numericEntity: [ + /&#([0-9A-FX]+);/gi, + function(match, code) { + return Ox.char( + /^X/i.test(code) + ? parseInt(code.slice(1), 16) + : parseInt(code, 10) + ); + } + ], + tag: { + a: [ + [ + /]*?href="((https?:\/\/|\/).+?)".*?>/gi, + '', + ], + [ + /<\/a>/gi, + '' + ] + ], + img: [ + [ + /]*?src="((https?:\/\/|\/).+?)".*?>/gi, + '' + ] + ], + rtl: [ + [ + //gi, + '
' + ], + [ + /<\/rtl>/gi, + '
' + ] + ], + '*': function(tag) { + return [ + [ + new RegExp('', 'gi'), + '{0}' + ] + ]; + } + }, + url: [ + /\b((https?:\/\/|www\.).+?)([\.,:;!\?\)\]]*?(\s|$))/gi, + function(string, url, prefix, end) { + prefix = prefix.toLowerCase() == 'www.' ? 'http://' : ''; + return Ox.formatString( + '{url}{end}', + {end: end, prefix: prefix, url: url} + ); + } + ] + }, + salt = Ox.range(2).map(function(){ + return Ox.range(16).map(function() { + return Ox.char(65 + Ox.random(26)); + }).join(''); }); - html = Ox.encodeHTML(html); - //fixme: both fail if urls/emails are already links - //html = Ox.parseURLs(html); - //html = Ox.parseEmailAddresses(html); - matches.forEach(function(match, i) { - html = html.replace(new RegExp(tab + i + tab), match); - }); - html = html.replace(/\n\n/g, '

'); - // close extra opening (and remove extra closing) tags - // note: this converts '"' to '"' - return Ox.element('
').html(html).html(); - }; -}()); -/*@ -Ox.parseURL Takes a URL, returns its components - (url) -> URL components - url URL - - > Ox.test.object.hash - '#c' - > Ox.test.object.host - 'www.foo.com:8080' - > Ox.test.object.hostname - 'www.foo.com' - > Ox.test.object.origin - 'http://www.foo.com:8080' - > Ox.test.object.pathname - '/bar/index.html' - > Ox.test.object.port - '8080' - > Ox.test.object.protocol - 'http:' - > Ox.test.object.search - '?a=0&b=1' -@*/ -Ox.parseURL = (function() { - var a = document.createElement('a'), - keys = ['hash', 'host', 'hostname', 'origin', - 'pathname', 'port', 'protocol', 'search']; - return function(string) { - var ret = {}; - a.href = string; - keys.forEach(function(key) { - ret[key] = a[key]; + // Splits a string into text (even indices) and tags (odd indices), ignoring + // tags with starting positions that are included in the ignore array + function splitHTMLTags(string, ignore) { + var isTag = false, ret = ['']; + ignore = ignore || []; + Ox.forEach(string, function(char, i) { + if (!isTag && char == '<' && ignore.indexOf(i) == -1) { + isTag = true; + ret.push(''); + } + ret[ret.length - 1] += char; + if (isTag && char == '>') { + isTag = false; + ret.push(''); + } }); return ret; }; -}()); -/*@ -Ox.parseURLs Takes HTML and turns URLs into links - > Ox.parseURLs('http://foo.com, bar') - 'http://foo.com, bar' - > Ox.parseURLs('http://foo.com/foobar?foo, bar') - 'http://foo.com/foobar?foo, bar' - > Ox.parseURLs('www.foo.com, bar') - 'www.foo.com, bar' - > Ox.parseURLs('http://foo.com etc') - 'http://foo.com etc' -@*/ -// fixme: shouldn't this be formatURLs? -// fixme: fails for urls inside links -Ox.parseURLs = function(html) { - return html.replace( - /\b((https?:\/\/|www\.).+?)([\.,:;!\?\)\]]*?(\s|$))/gi, - function(string, url, prefix, end) { - url = (prefix == 'www.' ? 'http://' : '') + url; - return Ox.formatString( - '{url}{end}', {end: end, url: url} - ); + /*@ + Ox.addLinks Takes a string and adds links for e-mail addresses and URLs + (string[, isHTML]) -> Formatted string + string String + isHTML If true, ignore matches in tags or enclosed by links + > Ox.addLinks('foo bar ') + 'foo bar <foo@bar.com>' + > Ox.addLinks('www.foo.com/bar#baz, etc.') + 'www.foo.com/bar#baz, etc.' + > Ox.addLinks('foo.com', true) + 'foo.com' + @*/ + Ox.addLinks = function(string, isHTML) { + var isLink = false; + function replaceString(string) { + return string + .replace(replace.mail[0], replace.mail[1]) + .replace(replace.url[0], replace.url[1]); } - ); -}; + return isHTML + ? splitHTMLTags(string).map(function(string, i) { + var isTag = i % 2; + if (isTag) { + if (/^ Encodes HTML entities + (string[, encodeAll]) -> String + string String + encodeAll If true, encode characters > 127 as numeric entities + > Ox.encodeHTMLEntities('<\'&"> äbçdê') + '<'&"> äbçdê' + > Ox.encodeHTMLEntities('<\'&"> äbçdê', true) + '<'&"> äbçdê' + @*/ + Ox.encodeHTMLEntities = function(string, encodeAll) { + return Ox.map(String(string), function(char) { + var code = char.charCodeAt(0); + if (code < 128) { + char = char in htmlEntities ? htmlEntities[char] : char; + } else if (encodeAll) { + char = '&#x' + Ox.pad(code.toString(16).toUpperCase(), 4) + ';'; + } + return char; + }); + }; + + /*@ + Ox.decodeHTMLEntities Decodes HTML entities + (string[, decodeAll]) -> String + string String + decodeAll If true, decode named entities for characters > 127 + Note that decodeAll relies on + Ox.normalizeHTML, which uses the DOM and may transform + the string + > Ox.decodeHTMLEntities('<'&">') + '<\'&">' + > Ox.decodeHTMLEntities('<'&">') + '<\'&">' + > Ox.decodeHTMLEntities('äbçdê') + 'äbçdê' + > Ox.decodeHTMLEntities('äbçdê') + 'äbçdê' + > Ox.decodeHTMLEntities('äbçdê', true) + 'äbçdê' + > Ox.decodeHTMLEntities('β') + 'β' + > Ox.decodeHTMLEntities('β', true) + 'β' + > Ox.decodeHTMLEntities('<b>') + '' + @*/ + Ox.decodeHTMLEntities = function(string, decodeAll) { + return decodeAll + ? Ox.decodeHTMLEntities(Ox.normalizeHTML(string)) + : String(string) + .replace(replace.namedEntity[0], replace.namedEntity[1]) + .replace(replace.numericEntity[0], replace.numericEntity[1]); + }; + + /*@ + Ox.highlightHTML Highlight matches in string + (string, query, classname[, isHTML]) -> Output string + string Input string + query Case-insentitive query string, or regular expression + classname Class name for matches + isHTML If true, the input string is treated as HTML + > Ox.highlightHTML('', 'name', 'c') + '<name>' + > Ox.highlightHTML('name', 'name', 'c', true) + 'name' + > Ox.highlightHTML('amp & amp', 'amp', 'c', true) + 'amp & amp' + > Ox.highlightHTML('amp & amp', 'amp & amp', 'c', true) + 'amp & amp' + > Ox.highlightHTML('<b>', '', 'c', true) + '<b>' + > Ox.highlightHTML('<b>', '<b>', 'c', true) + '<b>' + > Ox.highlightHTML('foobarbaz', 'foobar', 'c', true) + 'foobarbaz' + > Ox.highlightHTML('foo

bar

baz', 'foobar', 'c', true) + 'foo

bar

baz' + > Ox.highlightHTML('foo
bar baz', 'foo bar', 'c', true) + 'foo
bar
baz' + @*/ + Ox.highlightHTML = function(string, query, classname, isHTML) { + var cursor = 0, + entities = [], + matches = [], + re = Ox.isRegExp(query) ? query + : new RegExp(Ox.escapeRegExp(query), 'gi'), + span = ['', ''], + tags = []; + function insert(array) { + // for each replacement + array.forEach(function(v) { + // replace the modified value with the original value + string = Ox.splice(string, v.position, v.length, v.value); + // for each match + matches.forEach(function(match) { + if (v.position < match.position) { + // replacement is before match, update match position + match.position += v.value.length - v.length; + } else if ( + v.position < match.position + match.value.length + ) { + // replacement is inside match, update match value + match.value = Ox.splice( + match.value, v.position - match.position, v.length, + v.value + ); + } + }); + }); + } + if (isHTML) { + string = Ox.normalizeHTML(string) + // remove inline tags + .replace(regexp.tag, function(value, tag, position) { + tags.push({ + length: 0, position: position, value: value + }); + return ''; + }) + // decode html entities + .replace(regexp.entity, function(value, position) { + var ret = Ox.decodeHTMLEntities(value, true); + entities.push({ + length: ret.length, position: position, value: value + }); + return ret; + }); + // if decoding entities has created new tags, ignore them + splitHTMLTags(string, entities.map(function(entity) { + return entity.position; + })).forEach(function(v, i) { + if (i % 2 == 0) { + // outside tags, find matches and save position and value + v.replace(re, function(value, position) { + matches.push( + {position: cursor + position, value: value} + ); + }); + } + cursor += v.length; + }); + insert(entities); + insert(tags); + // for each match (in reverse order, so that positions are correct) + matches.reverse().forEach(function(match) { + // wrap it in a span + string = Ox.splice( + string, match.position, match.value.length, + span.join(match.value) + ); + }); + + } else { + string = string.replace(re, function(value) { + return span.join(value); + }); + } + // if isHTML, we may have enclosed single opening or closing tags in a + // span, if not isHTML, the string may contain '<', '>' or '&', so in + // both cases, we have to normalize + return Ox.normalizeHTML(string); + }; + + /*@ + Ox.normalizeHTML Normalize HTML (using the DOM) + > Ox.normalizeHTML('foo') + 'foo' + > Ox.normalizeHTML('foo') + 'foo' + > Ox.normalizeHTML('<'&"> äbçdê') + '<\'&"> äbçdê' + @*/ + Ox.normalizeHTML = function(html) { + return Ox.$('
').html(html).html(); + }; + + /*@ + Ox.sanitizeHTML Takes untrusted HTML and returns something trustworthy + > Ox.sanitizeHTML('http://foo.com, bar') + 'http://foo.com, bar' + > Ox.sanitizeHTML('http://foo.com/foobar?foo, bar') + 'http://foo.com/foobar?foo, bar' + > Ox.sanitizeHTML('(see: www.foo.com)') + '(see: www.foo.com)' + > Ox.sanitizeHTML('foo@bar.com') + 'foo@bar.com' + > Ox.sanitizeHTML('foo') + 'foo' + > Ox.sanitizeHTML('foo') + '<a href="javascript:alert()">foo' + > Ox.sanitizeHTML('[http://foo.com foo]') + 'foo' + > Ox.sanitizeHTML('foo') + '
foo
' + > Ox.sanitizeHTML('') + '<script>alert()</script>' + > Ox.sanitizeHTML('\'foo\' < \'bar\' && "foo" > "bar"') + '\'foo\' < \'bar\' && "foo" > "bar"' + > Ox.sanitizeHTML('foo') + 'foo' + > Ox.sanitizeHTML('foo') + 'foo' + @*/ + Ox.sanitizeHTML = (function() { + var defaultTags = [ + // inline formatting + 'b', 'code', 'i', 's', 'sub', 'sup', 'u', + // block formatting + 'blockquote', 'h1', 'h2', 'h3', 'p', 'pre', + // lists + 'li', 'ol', 'ul', + // tables + 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', + // other + 'a', 'br', 'img', + // special + 'rtl', '[]' + ], + parse = { + a: { + ']*?href="((https?:\/\/|\/).+?)".*?>': '', + '<\/a>': '' + }, + img: { + ']*?src="((https?:\/\/|\/).+?)".*?>': '' + }, + rtl: { + '': '
', + '<\/rtl>': '
' + }, + '*': function(tag) { + var ret = {}; + ret['<(/?' + tag + ') ?/?>'] = '<{1}>'; + return ret; + } + }, + tab = '\t'; + return function(html, tags, wikilinks) { + var matches = []; + tags = tags || defaultTags; + // html = Ox.clean(html); fixme: can this be a parameter? + if (tags.indexOf('[]') > -1) { + html = html.replace(/\[((https?:\/\/|\/).+?) (.+?)\]/gi, '$3'); + tags = tags.filter(function(tag) { + return tag != '[]'; + }); + } + tags.forEach(function(tag) { + var array = replace.tag[tag] || replace.tag['*'](tag); + Ox.forEach(array, function(value) { + html = html.replace(value[0], function() { + matches.push(Ox.formatString(value[1], arguments)); + return salt.join(matches.length - 1); + }); + }); + }); + html = Ox.addLinks(Ox.encodeHTMLEntities(html)); + matches.forEach(function(match, i) { + html = html.replace(new RegExp(salt.join(i)), match); + }); + html = html.replace(/\n\n/g, '

'); + // Close extra opening and remove extra closing tags. + // Note: this converts ''' to "'" and '"' to '"' + return Ox.normalizeHTML(html); + }; + }()); + +}());