improve Ox.tokenize, add Ox.identify

2012-05-26 12:54:52 +02:00 · 2012-05-26 12:54:52 +02:00 · 5a2ecca0f1
commit 5a2ecca0f1
parent 08aedd34fd
1 changed files with 275 additions and 325 deletions
--- a/source/Ox/js/JavaScript.js
+++ b/source/Ox/js/JavaScript.js
@ -134,10 +134,9 @@ Ox.doc = (function() {
            tokens = [];
        Ox.tokenize(source).forEach(function(token) {
            var match;
-            token.source = source.substr(token.offset, token.length);
            if (token.type == 'comment' && (
-                match = re.multiline.exec(token.source)
-                || re.singleline.exec(token.source)
+                match = re.multiline.exec(token.value)
+                || re.singleline.exec(token.value)
            )) {
                blocks.push(match[1]);
                tokens.push([]);
@ -162,7 +161,9 @@ Ox.doc = (function() {
                if (/^[A-Z]/.test(item.name)) {
                    // main item
                    // include leading whitespace
-                    item.source = parseTokens(tokens[i]);
+                    item.source = parseTokens(tokens[i]).map(function(token) {
+                        return token.value;
+                    }).join('');
                    item.line = source.slice(0, item.source[0].offset)
                        .split('\n').length;
                    items.push(item);
@ -304,215 +305,21 @@ Ox.doc = (function() {
 }());

 /*@
-Ox.minify <f> Minifies JavaScript
-    (source) -> <s> Minified JavaScript
-    (file, callback) -> <u> undefined
-    source <s> JavaScript source
-    file <s> JavaScript file
-    callback <f> Callback function
-    > Ox.minify('for (a in b)\n{\t\tc = void 0;\n}')
-    'for(a in b)\n{c=void 0;}'
-    > Ox.minify('return a; return 0; return "";')
-    'return a;return 0;return"";'
-    > Ox.minify('return\na;\nreturn\n0;\nreturn\n"";')
-    'return\na;return\n0;return\n"";'
+Ox.identify <f> Returns the type of a JavaScript identifier
+    (str) -> <s> Type
+        Type can be <code>constant</code>, <code>identifier</code>,
+        <code>keyword</code>, <code>method</code>, <code>object</code> or
+        <code>property</code>
@*/
-Ox.minify = function() {
-    // see https://github.com/douglascrockford/JSMin/blob/master/README
-    // and http://inimino.org/~inimino/blog/javascript_semicolons
-    if (arguments.length == 1) {
-        return minify(arguments[0]);
-    } else {
-        Ox.get(arguments[0], function(source) {
-            arguments[1](minify(source));
-        });
-    }
-    function isCommentOrLinebreakOrWhitespace(token) {
-        return token.type == 'comment' || isLinebreakOrWhitespace(token);
-    }
-    function isIdentifierOrNumber(token) {
-        return Ox.contains([
-            'constant', 'identifier', 'keyword',
-            'number', 'method', 'object', 'property'
-        ], token.type);
-    }
-    function isIdentifierOrNumberOrString(token) {
-        return isIdentifierOrNumber(token) || token.type == 'string';
-    }
-    function isLinebreakOrWhitespace(token) {
-        return Ox.contains(['linebreak', 'whitespace'], token.type);
-    }
-    function minify(source) {
-        var tokens = Ox.tokenize(source),
-            length = tokens.length,
-            ret = '';
-        function getValue(token) {
-            return source.substr(token.offset, token.length);
-        }
-        tokens.forEach(function(token, i) {
-            var next, nextToken, previousToken;
-            if (isLinebreakOrWhitespace(token)) {
-                previousToken = i == 0 ? null : tokens[i - 1];
-                next = i + 1;
-                while (
-                    next < length
-                    && isCommentOrLinebreakOrWhitespace(tokens[next])
-                ) {
-                    next++;
-                }
-                nextToken = next == length ? null : tokens[next];
-            }
-            if (token.type == 'linebreak') {
-                // replace a linebreak between two tokens that are identifiers
-                // or numbers or strings or unary operators or grouping
-                // operators with a single newline, otherwise remove it
-                if (
-                    previousToken && nextToken && (
-                        isIdentifierOrNumberOrString(previousToken)
-                        || Ox.contains([
-                            '++', '--', ')', ']', '}'
-                        ], getValue(previousToken))
-                    ) && (
-                        isIdentifierOrNumberOrString(nextToken)
-                        || Ox.contains([
-                            '+', '-', '++', '--', '~', '!', '(', '[', '{'
-                        ], getValue(nextToken))
-                    )
-                ) {
-                    ret += '\n';
-                }
-            } else if (token.type == 'whitespace') {
-                // replace whitespace between two tokens that are identifiers or
-                // numbers, or between a token that ends with "+" or "-" and one
-                // that begins with "+" or "-", with a single space, otherwise
-                // remove it
-                if (
-                    previousToken && nextToken && ((
-                        isIdentifierOrNumber(previousToken)
-                        && isIdentifierOrNumber(nextToken)
-                    ) || (
-                        Ox.contains([
-                           '+', '-', '++', '--'
-                        ], getValue(previousToken))
-                        && Ox.contains([
-                           '+', '-', '++', '--'
-                        ], getValue(nextToken))
-                    ))
-                ) {
-                    ret += ' ';
-                }
-            } else if (token.type != 'comment') {
-                // remove comments and leave all other tokens untouched
-                ret += getValue(token);
-            }
-        });
-        return ret;
-    }
-};
-
-/*@
-Ox.test <f> Takes JavaScript, runs inline tests, returns results
-@*/
-Ox.test = function(file, callback) {
-    Ox.doc(file, function(items) {
-        var tests = [];
-        items.forEach(function(item) {
-            item.examples && item.examples.some(function(example) {
-                return example.result;
-            }) && item.examples.forEach(function(example) {
-                Ox.Log('TEST', example.statement);
-                var actual = eval(example.statement);
-                if (example.result) {
-                    tests.push({
-                        actual: JSON.stringify(actual),
-                        expected: example.result,
-                        name: item.name,
-                        section: item.section,
-                        statement: example.statement,
-                        passed: Ox.isEqual(eval(
-                            'Ox.test.result = ' + example.result
-                        ), actual)
-                    });
-                }
-            });
-        });
-        callback(tests);
-    });
-};
-
-/*@
-Ox.tokenize <f> Tokenizes JavaScript
-    (source) -> <[o]> Array of tokens
-        length <n> Length of the token
-        offset <n> Offset of the token
-        type <s> Type of the token
-            Type can be <code>"comment"</code>, <code>"constant"</code>,
-            <code>"identifier"</code>, <code>"keyword"</code>,
-            <code>"linebreak"</code>, <code>"method"</code>,
-            <code>"number"</code>, <code>"object"</code>, 
-            <code>"operator"</code>, <code>"property"</code>,
-            <code>"regexp"</code>, <code>"string"</code>
-            or <code>"whitespace"</code>
-    source <s> JavaScript source code
-@*/
-// FIXME: constant/method/object/property is of interest
-// for syntax highlighting, but may not belong here
-// FIXME: backport python version
-// FIXME: numbers (hex, exp, etc.)
-Ox.tokenize = (function() {
-
-    // see https://github.com/mozilla/narcissus/blob/master/lib/jslex.js
-    // and https://developer.mozilla.org/en/JavaScript/Reference
-
-    var identifier = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ$_',
-        linebreak = '\n\r',
-        number = '0123456789',
-        operator = [
-            // arithmetic
-            '+', '-', '*', '/', '%', '++', '--',
-            // assignment
-            '=', '+=', '-=', '*=', '/=', '%=',
-            '&=', '|=', '^=', '<<=', '>>=', '>>>=',
-            // bitwise
-            '&', '|', '^', '~', '<<', '>>', '>>>',
-            // comparison
-            '==', '!=', '===', '!==', '>', '>=', '<', '<=',
-            // conditional
-            '?', ':',
-            // grouping
-            '(', ')', '[', ']', '{', '}',
-            // logical
-            '&&', '||', '!',
-            // other
-            '.', ',', ';'
-        ],
-        regexp = 'abcdefghijklmnopqrstuvwxyz',
-        string = '\'"',
-        whitespace = ' \t',
-        word = {
+Ox.identify = (function() {
+    // see https://developer.mozilla.org/en/JavaScript/Reference
+    var identifiers = {
        constant: [
            // Math
            'E', 'LN2', 'LN10', 'LOG2E', 'LOG10E', 'PI', 'SQRT1_2', 'SQRT2',
            // Number
            'MAX_VALUE', 'MIN_VALUE', 'NEGATIVE_INFINITY', 'POSITIVE_INFINITY'
        ],
-            keyword: [
-                'break',
-                'case', 'catch', 'class', 'const', 'continue',
-                'debugger', 'default', 'delete', 'do',
-                'else', 'enum', 'export', 'extends',
-                'false', 'finally', 'for', 'function',
-                'if', 'implements', 'import', 'in', 'instanceof', 'interface',
-                'let', 'module',
-                'new', 'null',
-                'package', 'private', 'protected', 'public',
-                'return',
-                'super', 'switch', 'static',
-                'this', 'throw', 'true', 'try', 'typeof',
-                'var', 'void',
-                'yield',
-                'while', 'with',
-            ],
        method: [
            // Array
            'concat',
@ -580,7 +387,8 @@ Ox.tokenize = (function() {
            'match',
            'replace',
            'search', 'slice', 'split', 'substr', 'substring',
-                'toLocaleLowerCase', 'toLocaleUpperCase', 'toLowerCase', 'toUpperCase', 'trim',
+            'toLocaleLowerCase', 'toLocaleUpperCase',
+            'toLowerCase', 'toUpperCase', 'trim',
            // Window
            'addEventListener', 'alert', 'atob',
            'blur', 'btoa',
@ -637,137 +445,279 @@ Ox.tokenize = (function() {
            'toolbar', 'top'
        ]
    };
-
-    return function(source) {
-
-        source = source.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
-
-        var cursor = 0,
-            tokenize = {
-                comment: function() {
-                    while (char = source[++cursor]) {
-                        if (next == '/' && char == '\n') {
-                            break;
-                        } else if (next == '*' && char + source[cursor + 1] == '*/') {
-                            cursor += 2;
-                            break;
-                        }
-                    }
-                },
-                identifier: function() {
-                    var str;
-                    while ((identifier + number).indexOf(source[++cursor]) > -1) {}
-                    str = source.slice(start, cursor);
-                    Ox.forEach(word, function(value, key) {
-                        if (value.indexOf(str) > -1) {
-                            type = key;
+    return function(identifier) {
+        // Returns the type of the passed identifier string: 'keyword' if it
+        // is listed in Ox.KEYWORDS, otherwise the key of the first list in
+        // the identifiers table that contains it, otherwise 'identifier'
+        var ret;
+        if (Ox.KEYWORDS.indexOf(identifier) > -1) {
+            ret = 'keyword'
+        } else {
+            ret = 'identifier'
+            // NOTE(review): Ox.Break() presumably stops the Ox.forEach
+            // iteration after the first match -- confirm against Ox.forEach
+            Ox.forEach(identifiers, function(words, type) {
+                if (words.indexOf(identifier) > -1) {
+                    ret = type;
                    Ox.Break();
                }
            });
-                },
-                linebreak: function() {
-                    while (linebreak.indexOf(source[++cursor]) > -1) {}
-                },
-                number: function() {
-                    while ((number + '.').indexOf(source[++cursor]) > -1) {}
-                },
-                operator: function() {
-                    while (operator.indexOf(char += source[++cursor]) > -1) {}
-                },
-                regexp: function() {
-                    while ((char = source[++cursor]) != '/') {
-                        char == '\\' && ++cursor;
-                        if (cursor == source.length) {
-                            break;
        }
-                    }
-                    while (regexp.indexOf(source[++cursor]) > -1) {}
-                },
-                string: function() {
-                    var delimiter = char;
-                    while ((char = source[++cursor]) != delimiter) {
-                        char == '\\' && ++cursor;
-                        if (cursor == source.length) {
-                            break;
-                        }
-                    }
-                    ++cursor;
-                },
-                whitespace: function() {
-                    while (whitespace.indexOf(source[++cursor]) > -1) {}
-                }
-            },
-            tokens = [],
-            type;
+        return ret;
+    };
+})();

-        while (cursor < source.length) {
-            var char = source[cursor],
-                next = source[cursor + 1],
-                start = cursor;
-            if (char == '/' && (next == '/' || next == '*')) {
-                type = 'comment';
-            } else if (identifier.indexOf(char) > -1) {
-                type = 'identifier';
-            } else if (linebreak.indexOf(char) > -1) {
-                type = 'linebreak';
-            } else if (number.indexOf(char) > -1) {
-                type = 'number';
-            } else if (string.indexOf(char) > -1) {
-                type = 'string';
-            } else if (whitespace.indexOf(char) > -1) {
-                type = 'whitespace';
-            } else if (char == '/') {
-                type = isRegExp() ? 'regexp' : 'operator';
-            } else if (operator.indexOf(char) > -1) {
-                type = 'operator';
-            }
-            tokenize[type]();
-            tokens.push({
-                length: cursor - start,
-                offset: start,
-                type: type
+/*@
+Ox.minify <f> Minifies JavaScript
+    (source) -> <s> Minified JavaScript
+    (file, callback) -> <u> undefined
+    source <s> JavaScript source
+    file <s> JavaScript file
+    callback <f> Callback function
+    > Ox.minify('for (a in b)\n{\t\tc = void 0;\n}')
+    'for(a in b)\n{c=void 0;}'
+    > Ox.minify('return a; return 0; return "";')
+    'return a;return 0;return"";'
+    > Ox.minify('return\na;\nreturn\n0;\nreturn\n"";')
+    'return\na;return\n0;return\n"";'
+@*/
+Ox.minify = function() {
+    // see https://github.com/douglascrockford/JSMin/blob/master/README
+    // and http://inimino.org/~inimino/blog/javascript_semicolons
+    // Keep a reference to the outer arguments: inside the Ox.get callback,
+    // "arguments" refers to the callback's own arguments, so the user's
+    // callback function would never be reached through it
+    var args = arguments;
+    if (args.length == 1) {
+        return minify(args[0]);
+    } else {
+        Ox.get(args[0], function(source) {
+            args[1](minify(source));
+        });
+    }
+    // Returns true if a linebreak after the token may end a statement
+    function canEndLine(token) {
+        return ['identifier', 'number', 'string'].indexOf(token.type) > -1
+            || ['++', '--', ')', ']', '}'].indexOf(token.value) > -1;
+    }
+    // Returns true if a linebreak before the token may begin a statement
+    function canStartLine(token) {
+        return ['identifier', 'number', 'string'].indexOf(token.type) > -1
+            || ['+', '-', '++', '--', '~', '!', '(', '[', '{'].indexOf(token.value) > -1;
+    }
+    // Returns true if the token is "+", "-", "++" or "--"
+    function isPlusOrMinus(token) {
+        return ['+', '-', '++', '--'].indexOf(token.value) > -1;
+    }
+    // Returns true if the token is an identifier (or keyword) or a number
+    function isWord(token) {
+        return ['identifier', 'number'].indexOf(token.type) > -1;
+    }
+    // Returns the minified version of the passed source string
+    function minify(source) {
+        var hasGap = false,
+            hasLinebreak = false,
+            prevToken = null,
+            ret = '';
+        Ox.tokenize(source).forEach(function(token) {
+            if (['comment', 'linebreak', 'whitespace'].indexOf(token.type) > -1) {
+                // Comments, linebreaks and whitespace are never copied. They
+                // form a gap between two significant tokens, and the gap is
+                // resolved once, when the next significant token is reached.
+                // This way, the decision is always based on the significant
+                // tokens on both sides, and not on a neighbouring comment,
+                // linebreak or whitespace token
+                hasGap = true;
+                // A multi-line comment that contains a linebreak counts as a
+                // linebreak for automatic semicolon insertion
+                if (token.type == 'linebreak' || (
+                    token.type == 'comment' && token.value.indexOf('\n') > -1
+                )) {
+                    hasLinebreak = true;
+                }
+                return;
+            }
+            // Gaps at the beginning of the source are removed (and so are
+            // gaps at the end, since no significant token follows them)
+            if (prevToken && hasGap) {
+                if (
+                    hasLinebreak
+                    && canEndLine(prevToken) && canStartLine(token)
+                ) {
+                    // replace a gap with a linebreak between two tokens
+                    // that are identifiers or numbers or strings or unary
+                    // operators or grouping operators with a single newline
+                    ret += '\n';
+                } else if ((
+                    isWord(prevToken) && isWord(token)
+                ) || (
+                    isPlusOrMinus(prevToken) && isPlusOrMinus(token)
+                )) {
+                    // replace a gap between two tokens that are identifiers
+                    // or numbers, or between two "+" or "-" operators, with
+                    // a single space
+                    ret += ' ';
+                }
+                // otherwise remove the gap
+            }
+            // leave all significant tokens untouched
+            ret += token.value;
+            prevToken = token;
+            hasGap = false;
+            hasLinebreak = false;
+        });
+        return ret;
+    }
+};
+
+/*@
+Ox.test <f> Takes JavaScript, runs inline tests, returns results
+@*/
+Ox.test = function(file, callback) {
+    // file is passed on to Ox.doc unchanged; callback is called once, with
+    // the array of test results, after all items have been processed
+    Ox.doc(file, function(items) {
+        var tests = [];
+        items.forEach(function(item) {
+            // Only run the examples of items that have at least one example
+            // with an expected result
+            item.examples && item.examples.some(function(example) {
+                return example.result;
+            }) && item.examples.forEach(function(example) {
+                Ox.Log('TEST', example.statement);
+                // Every statement is evaluated, including those without an
+                // expected result. This is a direct eval, so the statement
+                // runs in the scope of this function.
+                // NOTE(review): this executes code taken from the documented
+                // file -- only use it on trusted sources
+                var actual = eval(example.statement);
+                if (example.result) {
+                    // Both values are recorded as strings: actual as JSON,
+                    // expected as written in the example
+                    tests.push({
+                        actual: JSON.stringify(actual),
+                        expected: example.result,
+                        name: item.name,
+                        section: item.section,
+                        statement: example.statement,
+                        // The expected result is evaluated as the right-hand
+                        // side of an assignment (presumably so that object
+                        // literals are parsed as expressions, not as blocks
+                        // -- TODO confirm), then compared with Ox.isEqual
+                        passed: Ox.isEqual(eval(
+                            'Ox.test.result = ' + example.result
+                        ), actual)
+                    });
+                }
+            });
+        });
+        callback(tests);
+    });
+};
+
+/*@
+Ox.tokenize <f> Tokenizes JavaScript
+    (source) -> <[o]> Array of tokens
+        column <n> Column of the token
+        line <n> Line of the token
+        type <s> Type of the token
+            Type can be <code>"comment"</code>, <code>"identifier"</code>,
+            <code>"linebreak"</code>, <code>"number"</code>,
+            <code>"operator"</code>, <code>"regexp"</code>,
+            <code>"string"</code> or <code>"whitespace"</code>
+        value <s> Value of the token
+    source <s> JavaScript source code
+@*/
+// FIXME: numbers (hex, exp, etc.)
+Ox.tokenize = (function() {
+
+    // see https://github.com/mozilla/narcissus/blob/master/lib/lexer.js
+
+    var comment = ['//', '/*'],
+        identifier = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ$_',
+        linebreak = '\n\r',
+        number = '0123456789',
+        operator = [
+            // arithmetic
+            '+', '-', '*', '/', '%', '++', '--',
+            // assignment
+            '=', '+=', '-=', '*=', '/=', '%=',
+            '&=', '|=', '^=', '<<=', '>>=', '>>>=',
+            // bitwise
+            '&', '|', '^', '~', '<<', '>>', '>>>',
+            // comparison
+            '==', '!=', '===', '!==', '>', '>=', '<', '<=',
+            // conditional
+            '?', ':',
+            // grouping
+            '(', ')', '[', ']', '{', '}',
+            // logical
+            '&&', '||', '!',
+            // other
+            '.', ',', ';'
+        ],
+        regexp = 'abcdefghijklmnopqrstuvwxyz',
+        string = '\'"',
+        whitespace = ' \t';
+
+    function isRegExp(tokens) {
+        // Returns true if the current token is the beginning of a RegExp, as
+        // opposed to the beginning of an operator
+        // tokens is the array of tokens found before the forward slash, only
+        // the type and value of its elements are read
+        var i = tokens.length - 1, isRegExp, token
+        // Scan back to the previous significant token, or to the beginning of
+        // the source
+        while (i >= 0 && [
+            'comment', 'linebreak', 'whitespace'
+        ].indexOf(tokens[i].type) > -1) {
+            i--;
+        }
+        if (i == -1) {
+            // No significant token precedes the forward slash (the source
+            // begins with it, or only comments, linebreaks and whitespace
+            // come before it)
            isRegExp = true;
        } else {
-                prevToken = tokens[index];
-                prevString = source.substr(
-                    cursor - prevToken.length - offset, prevToken.length
-                );
+            // The forward slash begins a RegExp if it follows a keyword
+            // other than false, null or true, or an operator other than
+            // "++", "--", ")", "]" or "}". After any other token, it is
+            // treated as an operator
+            token = tokens[i];
            isRegExp = (
-                    prevToken.type == 'keyword'
-                    && ['false', 'null', 'true'].indexOf(prevString) == -1
+                token.type == 'identifier'
+                && Ox.identify(token.value) == 'keyword'
+                && ['false', 'null', 'true'].indexOf(token.value) == -1
            ) || (
-                    prevToken.type == 'operator'
-                    && ['++', '--', ')', ']', '}'].indexOf(prevString) == -1
-                );
+                token.type == 'operator'
+                && ['++', '--', ')', ']', '}'].indexOf(token.value) == -1
+            )
        }
        return isRegExp;
    }

+    return function(source) {
+        // Scans the source from left to right and returns an array of tokens,
+        // each with its zero-based line and column, its type and its value
+        var char,
+            column = 0,
+            cursor = 0,
+            delimiter,
+            length,
+            line = 0,
+            lines,
+            tokens = [],
+            start,
+            type,
+            value;
+        // Normalize line endings first, so that "\n" is the only linebreak
+        source = source.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
+        // Measure the normalized source, since that is the string the cursor
+        // moves through (it is shorter than the input if that had "\r\n")
+        length = source.length;
+        while (cursor < length) {
+            start = cursor;
+            char = source[cursor];
+            if (comment.indexOf(delimiter = char + source[cursor + 1]) > -1) {
+                type = 'comment';
+                // Skip the second character of the delimiter, so that "/*/"
+                // is not mistaken for a complete comment
+                ++cursor;
+                while (char = source[++cursor]) {
+                    if (delimiter == '//' && char == '\n') {
+                        // The linebreak is not part of the comment
+                        break;
+                    } else if (delimiter == '/*' && char + source[cursor + 1] == '*/') {
+                        cursor += 2;
+                        break;
+                    }
+                }
+            } else if (identifier.indexOf(char) > -1) {
+                type = 'identifier';
+                while ((identifier + number).indexOf(source[++cursor]) > -1) {}
+            } else if (linebreak.indexOf(char) > -1) {
+                type = 'linebreak';
+                while (linebreak.indexOf(source[++cursor]) > -1) {}
+            } else if (number.indexOf(char) > -1) {
+                type = 'number';
+                while ((number + '.').indexOf(source[++cursor]) > -1) {}
+            } else if (char == '/' && isRegExp(tokens)) {
+                type = 'regexp';
+                // Scan to the closing slash, skipping escaped characters
+                while ((char = source[++cursor]) != '/' && cursor < length) {
+                    char == '\\' && ++cursor;
+                }
+                // Include the flags
+                while (regexp.indexOf(source[++cursor]) > -1) {}
+            } else if (operator.indexOf(char) > -1) {
+                type = 'operator';
+                // Extend the operator for as long as it remains a known one
+                while (operator.indexOf(char += source[++cursor]) > -1 && cursor < length) {}
+            } else if (string.indexOf(delimiter = char) > -1) {
+                type = 'string';
+                // Scan to the closing quote, skipping escaped characters
+                while ((char = source[++cursor]) != delimiter && cursor < length) {
+                    char == '\\' && ++cursor;
+                }
+                ++cursor;
+            } else if (whitespace.indexOf(char) > -1) {
+                type = 'whitespace';
+                while (whitespace.indexOf(source[++cursor]) > -1) {}
+            } else {
+                // NOTE(review): an unknown character ends tokenization, so
+                // the remainder of the source is silently dropped
+                break;
+            }
+            value = source.slice(start, cursor);
+            tokens.push({column: column, line: line, type: type, value: value});
+            // Advance the position. Any value may span several lines (block
+            // comments, linebreaks, strings with escaped linebreaks), in
+            // which case the column restarts after the last linebreak.
+            // Otherwise, the column advances by the length of the value --
+            // this includes single-line comments, which must not reset it
+            lines = value.split('\n');
+            if (lines.length > 1) {
+                column = lines[lines.length - 1].length;
+                line += lines.length - 1;
+            } else {
+                column += value.length;
+            }
+        }
        return tokens;
-
    };

 }());