Ox.tokenize, Ox.SyntaxHighlighter (+demo)

2011-04-28 20:34:19 +02:00 · 2011-04-28 20:34:19 +02:00 · 74b9a25387
commit 74b9a25387
parent ee9f698b29
8 changed files with 632 additions and 0 deletions
--- a/source/Ox.js
+++ b/source/Ox.js
@ -2701,6 +2701,286 @@ Ox.toDashes = function(str) {
    });
 };

+Ox.tokenize = (function() {
+
+    /*
+    Ox.tokenize(source) splits a string of JavaScript source code into an
+    array of tokens and returns that array. Each token is an object with
+    two properties:
+        length  number of characters the token spans in source
+        type    'comment', 'identifier', 'linebreak', 'number', 'operator',
+                'regexp', 'string', 'whitespace' or 'unknown', or, for
+                identifiers found in the word lists below, 'constant',
+                'keyword', 'method', 'object' or 'property'
+    Tokens are contiguous and cover the whole source, so the text of a
+    token is the substring that starts at the sum of the lengths of all
+    preceding tokens. Unterminated comments, strings and regular
+    expressions extend to the end of the source. For example,
+    "var x = /a/g;" yields keyword (3), whitespace (1), identifier (1),
+    whitespace (1), operator (1), whitespace (1), regexp (4), operator (1).
+    */
+
+    // see https://github.com/mozilla/narcissus/blob/master/lib/jslex.js
+
+    var hasOwn = Object.prototype.hasOwnProperty,
+        // characters that may start an identifier
+        identifier = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ$_',
+        linebreak = '\n\r',
+        number = '0123456789',
+        // characters that may follow the first character of an identifier
+        identifierPart = identifier + number,
+        // characters that may follow the first digit of a number
+        numberPart = number + '.',
+        // see https://developer.mozilla.org/en/JavaScript/Reference
+        operator = [
+            // arithmetic
+            '+', '-', '*', '/', '%', '++', '--',
+            // assignment
+            '=', '+=', '-=', '*=', '/=', '%=',
+            '&=', '|=', '^=', '<<=', '>>=', '>>>=',
+            // bitwise
+            '&', '|', '^', '~', '<<', '>>', '>>>',
+            // comparison
+            '==', '!=', '===', '!==', '>', '>=', '<', '<=',
+            // conditional
+            '?', ':',
+            // grouping
+            '(', ')', '[', ']', '{', '}',
+            // logical
+            '&&', '||', '!',
+            // other
+            '.', ',', ';'
+        ],
+        // length of the longest entry in operator ('>>>=')
+        operatorLength = 4,
+        whitespace = ' \t',
+        // token types that cannot influence how a following slash is read
+        insignificant = ['comment', 'linebreak', 'whitespace'],
+        // identifiers that get a more specific token type, by type
+        word = {
+            constant: [
+                // Math
+                'E', 'LN2', 'LN10', 'LOG2E', 'LOG10E', 'PI', 'SQRT1_2', 'SQRT2',
+                // Number
+                'MAX_VALUE', 'MIN_VALUE', 'NEGATIVE_INFINITY', 'POSITIVE_INFINITY'
+            ],
+            // see https://developer.mozilla.org/en/JavaScript/Reference/Reserved_Words
+            keyword: [
+                'break',
+                'case', 'catch', 'class', 'const', 'continue',
+                'debugger', 'default', 'delete', 'do',
+                'else', 'enum', 'export', 'extends',
+                'false', 'finally', 'for', 'function',
+                'if', 'implements', 'import', 'in', 'instanceof', 'interface',
+                'let', 'module',
+                'new', 'null',
+                'package', 'private', 'protected', 'public',
+                'return',
+                'super', 'switch', 'static',
+                'this', 'throw', 'true', 'try', 'typeof',
+                'var', 'void',
+                'yield',
+                'while', 'with'
+            ],
+            method: [
+                // Array
+                'concat',
+                'every',
+                'filter', 'forEach',
+                'join',
+                'lastIndexOf',
+                'indexOf', 'isArray',
+                'map',
+                'pop', 'push',
+                'reduce', 'reduceRight', 'reverse',
+                'shift', 'slice', 'some', 'sort', 'splice',
+                'unshift',
+                // Date
+                'getDate', 'getDay', 'getFullYear', 'getHours', 'getMilliseconds',
+                'getMinutes', 'getMonth', 'getSeconds', 'getTime', 'getTimezoneOffset',
+                'getUTCDate', 'getUTCDay', 'getUTCFullYear', 'getUTCHours', 'getUTCMilliseconds',
+                'getUTCMinutes', 'getUTCMonth', 'getUTCSeconds',
+                'now',
+                'parse',
+                'setDate', 'setFullYear', 'setHours', 'setMilliseconds', 'setMinutes',
+                'setMonth', 'setSeconds', 'setTime',
+                'setUTCDate', 'setUTCFullYear', 'setUTCHours', 'setUTCMilliseconds', 'setUTCMinutes',
+                'setUTCMonth', 'setUTCSeconds',
+                'toDateString', 'toJSON', 'toLocaleDateString', 'toLocaleString', 'toLocaleTimeString',
+                'toTimeString', 'toUTCString',
+                'UTC',
+                // Function
+                'apply', 'bind', 'call', 'isGenerator',
+                // JSON
+                'parse', 'stringify',
+                // Math
+                'abs', 'acos', 'asin', 'atan', 'atan2',
+                'ceil', 'cos',
+                'exp',
+                'floor',
+                'log',
+                'max', 'min',
+                'pow',
+                'random', 'round',
+                'sin', 'sqrt',
+                'tan',
+                // Number
+                'toExponential', 'toFixed', 'toLocaleString', 'toPrecision',
+                // Object
+                'create',
+                'defineProperty', 'defineProperties',
+                'freeze',
+                'getOwnPropertyDescriptor', 'getOwnPropertyNames', 'getPrototypeOf',
+                'hasOwnProperty',
+                'isExtensible', 'isFrozen', 'isPrototypeOf', 'isSealed',
+                'keys',
+                'preventExtensions', 'propertyIsEnumerable',
+                'seal',
+                'toLocaleString', 'toString',
+                'valueOf',
+                // RegExp
+                'exec', 'test',
+                // String
+                'charAt', 'charCodeAt', 'concat',
+                'fromCharCode',
+                'indexOf',
+                'lastIndexOf', 'localeCompare',
+                'match',
+                'replace',
+                'search', 'slice', 'split', 'substr', 'substring',
+                'toLocaleLowerCase', 'toLocaleUpperCase', 'toLowerCase', 'toUpperCase', 'trim'
+            ],
+            object: [
+                'Array',
+                'Boolean',
+                'Date', 'decodeURI', 'decodeURIComponent',
+                'encodeURI', 'encodeURIComponent', 'Error', 'eval', 'EvalError',
+                'Function',
+                'Infinity', 'isFinite', 'isNaN',
+                'JSON',
+                'Math',
+                'NaN', 'Number',
+                'Object',
+                'parseFloat', 'parseInt',
+                'RangeError', 'ReferenceError', 'RegExp',
+                'String', 'SyntaxError',
+                'TypeError',
+                'undefined', 'URIError'
+            ],
+            property: [
+                // Function
+                'constructor', 'length', 'prototype',
+                // RegExp
+                'global', 'ignoreCase', 'lastIndex', 'multiline', 'source'
+            ]
+        },
+        // maps every listed word to its type, built once so that classifying
+        // an identifier is a single lookup; the first list that contains a
+        // word wins. Only own properties count, since words like
+        // 'constructor' or 'toString' also exist on Object.prototype.
+        wordType = (function() {
+            var i, key, types = {};
+            for (key in word) {
+                if (hasOwn.call(word, key)) {
+                    for (i = 0; i < word[key].length; i++) {
+                        if (!hasOwn.call(types, word[key][i])) {
+                            types[word[key][i]] = key;
+                        }
+                    }
+                }
+            }
+            return types;
+        }());
+
+    return function(source) {
+
+        var chr,    // character at the start of the current token
+            cursor = 0,
+            length = source.length,
+            next,   // character after chr (undefined at the end of source)
+            start,  // index of chr
+            // one handler per token type; each is called with cursor at the
+            // first character of the token and has to leave it on the first
+            // character after the token (never beyond the end of source)
+            tokenize = {
+                comment: function() {
+                    var end;
+                    if (next == '/') {
+                        // line comment, ends before the linebreak
+                        while (
+                            ++cursor < length &&
+                            linebreak.indexOf(source[cursor]) == -1
+                        ) {}
+                    } else {
+                        // block comment, search behind the opening "/*",
+                        // so that "/*/" is not taken for a complete comment
+                        end = source.indexOf('*/', cursor + 2);
+                        cursor = end == -1 ? length : end + 2;
+                    }
+                },
+                identifier: function() {
+                    var str;
+                    while (
+                        ++cursor < length &&
+                        identifierPart.indexOf(source[cursor]) > -1
+                    ) {}
+                    str = source.substring(start, cursor);
+                    if (hasOwn.call(wordType, str)) {
+                        // known word, report its more specific type
+                        type = wordType[str];
+                    }
+                },
+                linebreak: function() {
+                    while (
+                        ++cursor < length &&
+                        linebreak.indexOf(source[cursor]) > -1
+                    ) {}
+                },
+                number: function() {
+                    while (
+                        ++cursor < length &&
+                        numberPart.indexOf(source[cursor]) > -1
+                    ) {}
+                },
+                operator: function() {
+                    // longest match first, falls back to a single character
+                    var size, str;
+                    for (size = operatorLength; size > 1; size--) {
+                        str = source.substr(cursor, size);
+                        // near the end of source, substr may return less
+                        // than size characters
+                        if (str.length == size && operator.indexOf(str) > -1) {
+                            break;
+                        }
+                    }
+                    cursor += size;
+                },
+                regexp: function() {
+                    var c, isClass = false;
+                    while (++cursor < length) {
+                        c = source[cursor];
+                        if (c == '\\') {
+                            // skip the escaped character
+                            ++cursor;
+                        } else if (c == '[') {
+                            isClass = true;
+                        } else if (c == ']') {
+                            isClass = false;
+                        } else if (c == '/' && !isClass) {
+                            // a slash inside a character class
+                            // does not end the regexp
+                            break;
+                        }
+                    }
+                    // flags
+                    while (
+                        ++cursor < length &&
+                        identifier.indexOf(source[cursor]) > -1
+                    ) {}
+                    // unterminated regexp
+                    cursor = Math.min(cursor, length);
+                },
+                string: function() {
+                    var c, delimiter = chr;
+                    while (++cursor < length) {
+                        c = source[cursor];
+                        if (c == '\\') {
+                            // skip the escaped character
+                            ++cursor;
+                        } else if (c == delimiter) {
+                            ++cursor;
+                            break;
+                        }
+                    }
+                    // unterminated string
+                    cursor = Math.min(cursor, length);
+                },
+                unknown: function() {
+                    // a character that cannot start any other token
+                    ++cursor;
+                },
+                whitespace: function() {
+                    while (
+                        ++cursor < length &&
+                        whitespace.indexOf(source[cursor]) > -1
+                    ) {}
+                }
+            },
+            tokens = [],
+            type;
+
+        function isRegExp() {
+            // checks if the forward slash at cursor is the beginning of a
+            // regexp, as opposed to the beginning of an operator
+            // see http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html#regular-expressions
+            var index = tokens.length - 1,
+                position = cursor,
+                prevString,
+                prevToken;
+            // scan back to the previous significant token,
+            // or the beginning of the source
+            while (
+                index >= 0 &&
+                insignificant.indexOf(tokens[index].type) > -1
+            ) {
+                position -= tokens[index].length;
+                index--;
+            }
+            if (index < 0) {
+                // nothing significant before the forward slash
+                return true;
+            }
+            prevToken = tokens[index];
+            prevString = source.substr(
+                position - prevToken.length, prevToken.length
+            );
+            // a slash that follows a value (literal keyword, this, closing
+            // bracket, postfix operator, identifier, number, string, ...)
+            // is an operator, everything else starts a regexp
+            return (
+                prevToken.type == 'keyword' &&
+                ['false', 'null', 'this', 'true'].indexOf(prevString) == -1
+            ) || (
+                prevToken.type == 'operator' &&
+                ['++', '--', ')', ']', '}'].indexOf(prevString) == -1
+            );
+        }
+
+        while (cursor < length) {
+            chr = source[cursor];
+            next = source[cursor + 1];
+            start = cursor;
+            if (chr == '/' && (next == '/' || next == '*')) {
+                type = 'comment';
+            } else if (identifier.indexOf(chr) > -1) {
+                type = 'identifier';
+            } else if (linebreak.indexOf(chr) > -1) {
+                type = 'linebreak';
+            } else if (number.indexOf(chr) > -1) {
+                type = 'number';
+            } else if (chr == "'" || chr == '"') {
+                type = 'string';
+            } else if (whitespace.indexOf(chr) > -1) {
+                type = 'whitespace';
+            } else if (chr == '/') {
+                type = isRegExp() ? 'regexp' : 'operator';
+            } else if (operator.indexOf(chr) > -1) {
+                type = 'operator';
+            } else {
+                // never reuse the type of the previous token
+                type = 'unknown';
+            }
+            tokenize[type]();
+            tokens.push({
+                length: cursor - start,
+                type: type
+            });
+        }
+
+        return tokens;
+
+    };
+
+}());
+
+
 Ox.toSlashes = function(str) {
    /*
    >>> Ox.toSlashes("fooBarBaz")