From b93750e32538ec1e6649ac0f7beaec7b9bca0de4 Mon Sep 17 00:00:00 2001
From: rolux <rolux@rolux.org>
Date: Sun, 27 May 2012 12:36:16 +0200
Subject: [PATCH] replace Ox.parseEmailAddresses and Ox.parseURLs with
 Ox.addLinks; rename Ox.parseHTML to Ox.sanitizeHTML; add Ox.normalizeHTML;
 update Ox.encodeHTMLEntities and Ox.decodeHTMLEntities so that they also
 cover the previous Ox.encodeHTML and Ox.decodeHTML

---
 source/Ox/js/HTML.js | 549 +++++++++++++++++++++++++++++++------------
 1 file changed, 398 insertions(+), 151 deletions(-)
diff --git a/source/Ox/js/HTML.js b/source/Ox/js/HTML.js
index deca8c05..4735a144 100644
--- a/source/Ox/js/HTML.js
+++ b/source/Ox/js/HTML.js
@@ -1,51 +1,7 @@
 'use strict';
 
-/*@
-Ox.parseEmailAddresses <f> Takes HTML and turns e-mail addresses into links
-    > Ox.parseEmailAddresses('test@pad.ma')
-    '<a href="mailto:test@pad.ma">test@pad.ma</a>'
-    > Ox.parseEmailAddresses('foo bar <foo@bar.com>')
-    'foo bar <<a href="mailto:foo@bar.com">foo@bar.com</a>>'
-    > Ox.parseEmailAddresses('foo bar <<a href="mailto:foo@bar.com">foo@bar.com</a>>')
-    'foo bar <<a href="mailto:foo@bar.com">foo@bar.com</a>>'
-@*/
-// fixme: shouldn't this be formatEmailAddresses?
-// fixme: fails for linked emails
-Ox.parseEmailAddresses = function(html) {
-    return html.replace(
-        /\b([0-9A-Z\.\+\-_]+@(?:[0-9A-Z\-]+\.)+[A-Z]{2,6})\b/gi,
-        '<a href="mailto:$1">$1</a>'
-    );
-};
+(function() {
 
-/*@
-Ox.parseHTML <f> Takes HTML from an untrusted source and returns something sane
-    > Ox.parseHTML('http://foo.com, bar')
-    '<a href="http://foo.com">http://foo.com</a>, bar'
-    > Ox.parseHTML('http://foo.com/foobar?foo, bar')
-    '<a href="http://foo.com/foobar?foo">http://foo.com/foobar?foo</a>, bar'
-    > Ox.parseHTML('(see: www.foo.com)')
-    '(see: <a href="http://www.foo.com">www.foo.com</a>)'
-    > Ox.parseHTML('foo@bar.com')
-    '<a href="mailto:foo@bar.com">foo@bar.com</a>'
-    > Ox.parseHTML('<a href="http://foo.com" onmouseover="alert()">foo</a>')
-    '<a href="http://foo.com">foo</a>'
-    > Ox.parseHTML('<a href="javascript:alert()">foo</a>')
-    '&lt;a href="javascript:alert()"&gt;foo'
-    > Ox.parseHTML('[http://foo.com foo]')
-    '<a href="http://foo.com">foo</a>'
-    > Ox.parseHTML('<rtl>foo</rtl>')
-    '<div style="direction: rtl">foo</div>'
-    > Ox.parseHTML('<script>alert()</script>')
-    '&lt;script&gt;alert()&lt;/script&gt;'
-    > Ox.parseHTML('\'foo\' < \'bar\' && "foo" > "bar"')
-    '\'foo\' &lt; \'bar\' &amp;&amp; "foo" &gt; "bar"'
-    > Ox.parseHTML('<b>foo')
-    '<b>foo</b>'
-    > Ox.parseHTML('<b>foo</b></b>')
-    '<b>foo</b>'
-@*/
-Ox.parseHTML = (function() {
     var defaultTags = [
             // inline formatting
             'b', 'code', 'i', 's', 'sub', 'sup', 'u',
@@ -60,117 +16,408 @@ Ox.parseHTML = (function() {
             // special
             'rtl', '[]'
         ],
-        parse = {
-            a: {
-                '<a [^<>]*?href="((https?:\/\/|\/).+?)".*?>': '<a href="{1}">',
-                '<\/a>': '</a>'
-            },
-            img: {
-                '<img [^<>]*?src="((https?:\/\/|\/).+?)".*?>': '<img src="{1}">'
-            },
-            rtl: {
-                '<rtl>': '<div style="direction: rtl">',
-                '<\/rtl>': '</div>'
-            },
-            '*': function(tag) {
-                var ret = {};
-                ret['<(/?' + tag + ') ?/?>'] = '<{1}>';
-                return ret;
-            }
+        htmlEntities = {
+            '"': '&quot;', '&': '&amp;', "'": '&apos;', '<': '&lt;', '>': '&gt;'
         },
-        tab = '\t';
-    return function(html, tags, wikilinks) {
-        var matches = [];
-        tags = tags || defaultTags;
-        // html = Ox.clean(html); fixme: can this be a parameter?
-        if (tags.indexOf('[]') > -1) {
-            html = html.replace(/\[((https?:\/\/|\/).+?) (.+?)\]/gi, '<a href="$1">$3</a>');
-            tags = tags.filter(function(tag) {
-                return tag != '[]';
-            });
-        }
-        tags.forEach(function(tag) {
-            var p = parse[tag] || parse['*'](tag);
-            Ox.forEach(p, function(replace, regexp) {
-                html = html.replace(new RegExp(regexp, 'gi'), function() {
-                    matches.push(Ox.formatString(replace, arguments));
-                    return tab + (matches.length - 1) + tab; 
-                });
-            });
+        regexp = {
+            entity: /&[^\s]*;/g,
+            tag: new RegExp('<\\/?(' + [
+                'a', 'b', 'br', 'code', 'i', 's', 'span', 'u'
+            ].join('|') + ')\\/?>', 'gi')
+        },
+        replace = {
+            mail: [
+                /\b([0-9A-Z\.\+\-_]+@(?:[0-9A-Z\-]+\.)+[A-Z]{2,6})\b/gi,
+                '<a href="mailto:$1">$1</a>'
+            ],
+            namedEntity: [
+                new RegExp('(' + Ox.values(htmlEntities).join('|') + ')', 'g'),
+                function(match) {
+                    return Ox.keyOf(htmlEntities, match);
+                }
+            ],
+            numericEntity: [
+                /&#([0-9A-FX]+);/gi,
+                function(match, code) {
+                    return Ox.char(
+                        /^X/i.test(code)
+                            ? parseInt(code.slice(1), 16)
+                            : parseInt(code, 10)
+                    );
+                }
+            ],
+            tag: {
+                a: [
+                    [
+                        /<a [^<>]*?href="((https?:\/\/|\/).+?)".*?>/gi,
+                        '<a href="{1}">',
+                    ],
+                    [
+                        /<\/a>/gi,
+                        '</a>'
+                    ]
+                ],
+                img: [
+                    [
+                        /<img [^<>]*?src="((https?:\/\/|\/).+?)".*?>/gi,
+                        '<img src="{1}">' // '{1}', not '$1': expanded by Ox.formatString
+                    ]
+                ],
+                rtl: [
+                    [
+                        /<rtl>/gi,
+                        '<div style="direction: rtl">'
+                    ],
+                    [
+                        /<\/rtl>/gi,
+                        '</div>'
+                    ]
+                ],
+                '*': function(tag) {
+                    return [
+                        [
+                            new RegExp('</?' + tag + ' ?/?>', 'gi'),
+                            '{0}'
+                        ]
+                    ];
+                }
+            },
+            url: [
+                /\b((https?:\/\/|www\.).+?)([\.,:;!\?\)\]]*?(\s|$))/gi,
+                function(string, url, prefix, end) {
+                    prefix = prefix.toLowerCase() == 'www.' ? 'http://' : '';
+                    return Ox.formatString(
+                        '<a href="{prefix}{url}">{url}</a>{end}',
+                        {end: end, prefix: prefix, url: url}
+                    );
+                }
+            ]
+        },
+        salt = Ox.range(2).map(function(){
+            return Ox.range(16).map(function() {
+                return Ox.char(65 + Ox.random(26));
+            }).join('');
         });
-        html = Ox.encodeHTML(html);
-        //fixme: both fail if urls/emails are already links
-        //html = Ox.parseURLs(html);
-        //html = Ox.parseEmailAddresses(html);
-        matches.forEach(function(match, i) {
-            html = html.replace(new RegExp(tab + i + tab), match);
-        });
-        html = html.replace(/\n\n/g, '<br/><br/>');
-        // close extra opening (and remove extra closing) tags
-        // note: this converts '&quot;' to '"' 
-        return Ox.element('<div>').html(html).html();
-    };
-}());
 
-/*@
-Ox.parseURL <f> Takes a URL, returns its components
-    (url) -> <o> URL components
-    url <s> URL
-    <script>
-        Ox.test.object = Ox.parseURL('http://www.foo.com:8080/bar/index.html?a=0&b=1#c');
-    </script>
-    > Ox.test.object.hash
-    '#c'
-    > Ox.test.object.host
-    'www.foo.com:8080'
-    > Ox.test.object.hostname
-    'www.foo.com'
-    > Ox.test.object.origin
-    'http://www.foo.com:8080'
-    > Ox.test.object.pathname
-    '/bar/index.html'
-    > Ox.test.object.port
-    '8080'
-    > Ox.test.object.protocol
-    'http:'
-    > Ox.test.object.search
-    '?a=0&b=1'
-@*/
-Ox.parseURL = (function() {
-    var a = document.createElement('a'),
-        keys = ['hash', 'host', 'hostname', 'origin',
-            'pathname', 'port', 'protocol', 'search'];
-    return function(string) {
-        var ret = {};
-        a.href = string;
-        keys.forEach(function(key) {
-            ret[key] = a[key];
+    // Splits a string into an array of alternating text (even indices) and tag
+    // (odd indices) chunks; a '<' whose index is listed in ignore opens no tag
+    function splitHTMLTags(string, ignore) {
+        var isTag = false, ret = ['']; // ret[0] is always text, possibly ''
+        ignore = ignore || [];
+        Ox.forEach(string, function(char, i) {
+            if (!isTag && char == '<' && ignore.indexOf(i) == -1) {
+                isTag = true;
+                ret.push(''); // start a new tag chunk
+            }
+            ret[ret.length - 1] += char;
+            if (isTag && char == '>') { // the first '>' inside a tag closes it
+                isTag = false;
+                ret.push(''); // start a new text chunk
+            }
         });
         return ret;
     };
-}());
 
-/*@
-Ox.parseURLs <f> Takes HTML and turns URLs into links
-    > Ox.parseURLs('http://foo.com, bar')
-    '<a href="http://foo.com">http://foo.com</a>, bar'
-    > Ox.parseURLs('http://foo.com/foobar?foo, bar')
-    '<a href="http://foo.com/foobar?foo">http://foo.com/foobar?foo</a>, bar'
-    > Ox.parseURLs('www.foo.com, bar')
-    '<a href="http://www.foo.com">www.foo.com</a>, bar'
-    > Ox.parseURLs('<a href="http://foo.com">http://foo.com</a> etc')
-    '<a href="http://foo.com">http://foo.com</a> etc'
-@*/
-// fixme: shouldn't this be formatURLs?
-// fixme: fails for urls inside links
-Ox.parseURLs = function(html) {
-    return html.replace(
-        /\b((https?:\/\/|www\.).+?)([\.,:;!\?\)\]]*?(\s|$))/gi,
-        function(string, url, prefix, end) {
-            url = (prefix == 'www.' ? 'http://' : '') + url;
-            return Ox.formatString(
-                '<a href="{url}">{url}</a>{end}', {end: end, url: url}
-            );
+    /*@
+    Ox.addLinks <f> Takes a string and adds links for e-mail addresses and URLs
+        (string[, isHTML]) -> <s> Formatted string
+        string <s> String
+        isHTML <b|false> If true, ignore matches in tags or enclosed by links
+        > Ox.addLinks('foo bar <foo@bar.com>')
+        'foo bar &lt;<a href="mailto:foo@bar.com">foo@bar.com</a>&gt;'
+        > Ox.addLinks('www.foo.com/bar#baz, etc.')
+        '<a href="http://www.foo.com/bar#baz">www.foo.com/bar#baz</a>, etc.'
+        > Ox.addLinks('<a href="http://foo.com">foo.com</a>', true)
+        '<a href="http://foo.com">foo.com</a>'
+    @*/
+    Ox.addLinks = function(string, isHTML) {
+        var isLink = false; // true while between <a ...> and </a>
+        function replaceString(string) {
+            return string
+                .replace(replace.mail[0], replace.mail[1])
+                .replace(replace.url[0], replace.url[1]);
         }
-    );
-};
+        return isHTML
+            ? splitHTMLTags(string).map(function(string, i) {
+                var isTag = i % 2; // odd chunks are tags
+                if (isTag) {
+                    if (/^<a[\s\/>]/i.test(string)) { // <a> only, not <abbr> etc.
+                        isLink = true;
+                    } else if (/^<\/a[\s>]/i.test(string)) { // </a> only
+                        isLink = false;
+                    }
+                }
+                return isTag || isLink ? string : replaceString(string);
+            }).join('')
+            : Ox.normalizeHTML(replaceString(string));
+    };
+
+    /*@
+    Ox.encodeHTMLEntities <f> Encodes ", &, ', < and > as named HTML entities
+        (string[, encodeAll]) -> <s> String
+        string <s> String (other values are converted with String())
+        encodeAll <b|false> If true, encode characters > 127 as numeric entities
+        > Ox.encodeHTMLEntities('<\'&"> äbçdê')
+        '&lt;&apos;&amp;&quot;&gt; äbçdê'
+        > Ox.encodeHTMLEntities('<\'&"> äbçdê', true)
+        '&lt;&apos;&amp;&quot;&gt; &#x00E4;b&#x00E7;d&#x00EA;'
+    @*/
+    Ox.encodeHTMLEntities = function(string, encodeAll) {
+        return Ox.map(String(string), function(char) {
+            var code = char.charCodeAt(0);
+            if (code < 128) {
+                char = char in htmlEntities ? htmlEntities[char] : char;
+            } else if (encodeAll) { // e.g. 'ä' becomes '&#x00E4;'
+                char = '&#x' + Ox.pad(code.toString(16).toUpperCase(), 4) + ';';
+            }
+            return char;
+        });
+    };
+
+    /*@
+    Ox.decodeHTMLEntities <f> Decodes HTML entities
+        (string[, decodeAll]) -> <s> String
+        string <s> String
+        decodeAll <b|false> If true, decode named entities for characters > 127
+            Note that <code>decodeAll</code> relies on
+            <code>Ox.normalizeHTML</code>, which uses the DOM and may transform
+            the string
+        > Ox.decodeHTMLEntities('&#x003C;&#x0027;&#x0026;&#x0022;&#x003E;')
+        '<\'&">'
+        > Ox.decodeHTMLEntities('&lt;&apos;&amp;&quot;&gt;')
+        '<\'&">'
+        > Ox.decodeHTMLEntities('&#x00E4;b&#x00E7;d&#x00EA;')
+        'äbçdê'
+        > Ox.decodeHTMLEntities('&auml;b&ccedil;d&ecirc;')
+        '&auml;b&ccedil;d&ecirc;'
+        > Ox.decodeHTMLEntities('&auml;b&ccedil;d&ecirc;', true)
+        'äbçdê'
+        > Ox.decodeHTMLEntities('<b>&beta;')
+        '<b>&beta;'
+        > Ox.decodeHTMLEntities('<b>&beta;', true)
+        '<b>β</b>'
+        > Ox.decodeHTMLEntities('&lt;b&gt;')
+        '<b>'
+    @*/
+    Ox.decodeHTMLEntities = function(string, decodeAll) {
+        return decodeAll
+            ? Ox.decodeHTMLEntities(Ox.normalizeHTML(string)) // DOM pass first
+            : String(string)
+                .replace(replace.namedEntity[0], replace.namedEntity[1]) // &lt; etc.
+                .replace(replace.numericEntity[0], replace.numericEntity[1]);
+                // NOTE(review): pass 2 sees pass 1 output, so '&amp;#60;' yields '<'
+    };
+
+    /*@
+    Ox.highlightHTML <f> Highlight matches in string
+        (string, query, classname[, isHTML]) -> <s> Output string
+        string <s> Input string
+        query <r|s> Case-insensitive query string, or regular expression
+        classname <s> Class name for matches
+        isHTML <b|false> If true, the input string is treated as HTML
+        > Ox.highlightHTML('<name>', 'name', 'c')
+        '&lt;<span class="c">name</span>&gt;'
+        > Ox.highlightHTML('<span class="name">name</span>', 'name', 'c', true)
+        '<span class="name"><span class="c">name</span></span>'
+        > Ox.highlightHTML('amp &amp; amp', 'amp', 'c', true)
+        '<span class="c">amp</span> &amp; <span class="c">amp</span>'
+        > Ox.highlightHTML('amp &amp; amp', 'amp & amp', 'c', true)
+        '<span class="c">amp &amp; amp</span>'
+        > Ox.highlightHTML('<b>&lt;b&gt;</b>', '<b>', 'c', true)
+        '<span class="c"><b>&lt;b&gt;</b></span>'
+        > Ox.highlightHTML('<b>&lt;b&gt;</b>', '&lt;b&gt;', 'c', true)
+        '<b>&lt;b&gt;</b>'
+        > Ox.highlightHTML('foo<b>bar</b>baz', 'foobar', 'c', true)
+        '<span class="c">foo<b>bar</b></span>baz'
+        > Ox.highlightHTML('foo<p>bar</p>baz', 'foobar', 'c', true)
+        'foo<p>bar</p>baz'
+        > Ox.highlightHTML('foo <br/>bar baz', 'foo bar', 'c', true)
+        '<span class="c">foo <br>bar</span> baz'
+    @*/
+    Ox.highlightHTML = function(string, query, classname, isHTML) {
+        var cursor = 0, // offset of the current chunk within string
+            entities = [],
+            matches = [],
+            re = Ox.isRegExp(query) ? query
+                : new RegExp(Ox.escapeRegExp(query), 'gi'),
+            span = ['<span class="' + classname + '">', '</span>'],
+            tags = [];
+        function insert(array) {
+            // for each replacement
+            array.forEach(function(v) {
+                // replace the modified value with the original value
+                string = Ox.splice(string, v.position, v.length, v.value);
+                // for each match
+                matches.forEach(function(match) {
+                    if (v.position < match.position) {
+                        // replacement is before match, update match position
+                        match.position += v.value.length - v.length;
+                    } else if (
+                        v.position < match.position + match.value.length
+                    ) {
+                        // replacement is inside match, update match value
+                        match.value = Ox.splice(
+                            match.value, v.position - match.position, v.length,
+                            v.value
+                        );
+                    }
+                });
+            });
+        }
+        if (isHTML) {
+            string = Ox.normalizeHTML(string)
+                // remove inline tags
+                .replace(regexp.tag, function(value, tag, position) {
+                    tags.push({ // length 0: the tag is replaced by ''
+                        length: 0, position: position, value: value
+                    });
+                    return '';
+                })
+                // decode html entities
+                .replace(regexp.entity, function(value, position) {
+                    var ret = Ox.decodeHTMLEntities(value, true);
+                    entities.push({ // length: size of the decoded text
+                        length: ret.length, position: position, value: value
+                    });
+                    return ret;
+                });
+            // if decoding entities has created new tags, ignore them
+            splitHTMLTags(string, entities.map(function(entity) {
+                return entity.position;
+            })).forEach(function(v, i) {
+                if (i % 2 == 0) {
+                    // outside tags, find matches and save position and value
+                    v.replace(re, function(value, position) {
+                        matches.push(
+                            {position: cursor + position, value: value}
+                        );
+                    }); // result of replace is discarded, only matches is used
+                }
+                cursor += v.length;
+            });
+            insert(entities);
+            insert(tags);
+            // for each match (in reverse order, so that positions are correct)
+            matches.reverse().forEach(function(match) {
+                // wrap it in a span
+                string = Ox.splice(
+                    string, match.position, match.value.length,
+                    span.join(match.value)
+                );
+            });
+            
+        } else {
+            string = string.replace(re, function(value) {
+                return span.join(value);
+            });
+        }
+        // if isHTML, we may have enclosed single opening or closing tags in a 
+        // span, if not isHTML, the string may contain '<', '>' or '&', so in
+        // both cases, we have to normalize
+        return Ox.normalizeHTML(string);
+    };
+
+    /*@
+    Ox.normalizeHTML <f> Normalize HTML (using the DOM)
+        > Ox.normalizeHTML('<b>foo')
+        '<b>foo</b>'
+        > Ox.normalizeHTML('<b>foo</b></b>')
+        '<b>foo</b>'
+        > Ox.normalizeHTML('&lt;&apos;&amp;&quot;&gt; &#x00E4;b&#x00E7;d&#x00EA;')
+        '&lt;\'&amp;"&gt; äbçdê'
+    @*/
+    Ox.normalizeHTML = function(html) {
+        return Ox.$('<div>').html(html).html(); // set, then read back, a <div>'s HTML
+    };
+
+    /*@
+    Ox.sanitizeHTML <f> Takes untrusted HTML and returns something trustworthy
+        > Ox.sanitizeHTML('http://foo.com, bar')
+        '<a href="http://foo.com">http://foo.com</a>, bar'
+        > Ox.sanitizeHTML('http://foo.com/foobar?foo, bar')
+        '<a href="http://foo.com/foobar?foo">http://foo.com/foobar?foo</a>, bar'
+        > Ox.sanitizeHTML('(see: www.foo.com)')
+        '(see: <a href="http://www.foo.com">www.foo.com</a>)'
+        > Ox.sanitizeHTML('foo@bar.com')
+        '<a href="mailto:foo@bar.com">foo@bar.com</a>'
+        > Ox.sanitizeHTML('<a href="http://foo.com" onclick="alert()">foo</a>')
+        '<a href="http://foo.com">foo</a>'
+        > Ox.sanitizeHTML('<a href="javascript:alert()">foo</a>')
+        '&lt;a href="javascript:alert()"&gt;foo'
+        > Ox.sanitizeHTML('[http://foo.com foo]')
+        '<a href="http://foo.com">foo</a>'
+        > Ox.sanitizeHTML('<rtl>foo</rtl>')
+        '<div style="direction: rtl">foo</div>'
+        > Ox.sanitizeHTML('<script>alert()</script>')
+        '&lt;script&gt;alert()&lt;/script&gt;'
+        > Ox.sanitizeHTML('\'foo\' < \'bar\' && "foo" > "bar"')
+        '\'foo\' &lt; \'bar\' &amp;&amp; "foo" &gt; "bar"'
+        > Ox.sanitizeHTML('<b>foo')
+        '<b>foo</b>'
+        > Ox.sanitizeHTML('<b>foo</b></b>')
+        '<b>foo</b>'
+    @*/
+    Ox.sanitizeHTML = (function() {
+        var defaultTags = [
+                // inline formatting
+                'b', 'code', 'i', 's', 'sub', 'sup', 'u',
+                // block formatting
+                'blockquote', 'h1', 'h2', 'h3', 'p', 'pre',
+                // lists
+                'li', 'ol', 'ul',
+                // tables
+                'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr',
+                // other
+                'a', 'br', 'img',
+                // special
+                'rtl', '[]'
+            ],
+            parse = { // NOTE(review): parse and tab are unused below (see replace.tag, salt)
+                a: {
+                    '<a [^<>]*?href="((https?:\/\/|\/).+?)".*?>': '<a href="{1}">',
+                    '<\/a>': '</a>'
+                },
+                img: {
+                    '<img [^<>]*?src="((https?:\/\/|\/).+?)".*?>': '<img src="{1}">'
+                },
+                rtl: {
+                    '<rtl>': '<div style="direction: rtl">',
+                    '<\/rtl>': '</div>'
+                },
+                '*': function(tag) {
+                    var ret = {};
+                    ret['<(/?' + tag + ') ?/?>'] = '<{1}>';
+                    return ret;
+                }
+            },
+            tab = '\t';
+        return function(html, tags, wikilinks) { // NOTE(review): wikilinks is unused
+            var matches = []; // sanitized tags, indexed by placeholder number
+            tags = tags || defaultTags;
+            // html = Ox.clean(html); fixme: can this be a parameter?
+            if (tags.indexOf('[]') > -1) {
+                html = html.replace(/\[((https?:\/\/|\/).+?) (.+?)\]/gi, '<a href="$1">$3</a>');
+                tags = tags.filter(function(tag) {
+                    return tag != '[]';
+                });
+            }
+            tags.forEach(function(tag) {
+                var array = replace.tag[tag] || replace.tag['*'](tag);
+                Ox.forEach(array, function(value) {
+                    html = html.replace(value[0], function() {
+                        matches.push(Ox.formatString(value[1], arguments));
+                        return salt.join(matches.length - 1); // placeholder
+                    });
+                });
+            });
+            html = Ox.addLinks(Ox.encodeHTMLEntities(html));
+            matches.forEach(function(match, i) {
+                html = html.replace(salt.join(i), function() { return match; });
+            }); // function replacer: '$&', "$'" etc. in match stay literal
+            html = html.replace(/\n\n/g, '<br/><br/>');
+            // Close extra opening and remove extra closing tags.
+            // Note: this converts '&apos;' to "'" and '&quot;' to '"'
+            return Ox.normalizeHTML(html);
+        };
+    }());
+
+}());