improve Ox.tokenize, add Ox.identify

2012-05-26 12:54:52 +02:00 · 2012-05-26 12:54:52 +02:00 · 5a2ecca0f1
commit 5a2ecca0f1
parent 08aedd34fd
1 changed files with 275 additions and 325 deletions
--- a/source/Ox/js/JavaScript.js
+++ b/source/Ox/js/JavaScript.js
@ -134,10 +134,9 @@ Ox.doc = (function() {
            tokens = [];
        Ox.tokenize(source).forEach(function(token) {
            var match;
            token.source = source.substr(token.offset, token.length);
            if (token.type == 'comment' && (
-                match = re.multiline.exec(token.source)
+                match = re.multiline.exec(token.value)
-                || re.singleline.exec(token.source)
+                || re.singleline.exec(token.value)
            )) {
                blocks.push(match[1]);
                tokens.push([]);
@ -162,7 +161,9 @@ Ox.doc = (function() {
                if (/^[A-Z]/.test(item.name)) {
                    // main item
                    // include leading whitespace
-                    item.source = parseTokens(tokens[i]);
+                    item.source = parseTokens(tokens[i]).map(function(token) {
                        return token.value;
                    }).join('');
                    item.line = source.slice(0, item.source[0].offset)
                        .split('\n').length;
                    items.push(item);
@ -304,215 +305,21 @@ Ox.doc = (function() {
 }());
 /*@
-Ox.minify <f> Minifies JavaScript
+Ox.identify <f> Returns the type of a JavaScript identifier
-    (source) -> <s> Minified JavaScript
+    (str) -> <s> Type
-    (file, callback) -> <u> undefined
+        Type can be <code>constant</code>, <code>identifier</code>,
-    source <s> JavaScript source
+        <code>keyword</code>, <code>method</code>, <code>object</code> or
-    file <s> JavaScript file
+        <code>property</code>
    callback <f> Callback function
    > Ox.minify('for (a in b)\n{\t\tc = void 0;\n}')
    'for(a in b)\n{c=void 0;}'
    > Ox.minify('return a; return 0; return "";')
    'return a;return 0;return"";'
    > Ox.minify('return\na;\nreturn\n0;\nreturn\n"";')
    'return\na;return\n0;return\n"";'
@*/
-Ox.minify = function() {
+Ox.identify = (function() {
-    // see https://github.com/douglascrockford/JSMin/blob/master/README
+    // see https://developer.mozilla.org/en/JavaScript/Reference
-    // and http://inimino.org/~inimino/blog/javascript_semicolons
+    var identifiers = {
    if (arguments.length == 1) {
        return minify(arguments[0]);
    } else {
        Ox.get(arguments[0], function(source) {
            arguments[1](minify(source));
        });
    }
    function isCommentOrLinebreakOrWhitespace(token) {
        return token.type == 'comment' || isLinebreakOrWhitespace(token);
    }
    function isIdentifierOrNumber(token) {
        return Ox.contains([
            'constant', 'identifier', 'keyword',
            'number', 'method', 'object', 'property'
        ], token.type);
    }
    function isIdentifierOrNumberOrString(token) {
        return isIdentifierOrNumber(token) || token.type == 'string';
    }
    function isLinebreakOrWhitespace(token) {
        return Ox.contains(['linebreak', 'whitespace'], token.type);
    }
    function minify(source) {
        var tokens = Ox.tokenize(source),
            length = tokens.length,
            ret = '';
        function getValue(token) {
            return source.substr(token.offset, token.length);
        }
        tokens.forEach(function(token, i) {
            var next, nextToken, previousToken;
            if (isLinebreakOrWhitespace(token)) {
                previousToken = i == 0 ? null : tokens[i - 1];
                next = i + 1;
                while (
                    next < length
                    && isCommentOrLinebreakOrWhitespace(tokens[next])
                ) {
                    next++;
                }
                nextToken = next == length ? null : tokens[next];
            }
            if (token.type == 'linebreak') {
                // replace a linebreak between two tokens that are identifiers
                // or numbers or strings or unary operators or grouping
                // operators with a single newline, otherwise remove it
                if (
                    previousToken && nextToken && (
                        isIdentifierOrNumberOrString(previousToken)
                        || Ox.contains([
                            '++', '--', ')', ']', '}'
                        ], getValue(previousToken))
                    ) && (
                        isIdentifierOrNumberOrString(nextToken)
                        || Ox.contains([
                            '+', '-', '++', '--', '~', '!', '(', '[', '{'
                        ], getValue(nextToken))
                    )
                ) {
                    ret += '\n';
                }
            } else if (token.type == 'whitespace') {
                // replace whitespace between two tokens that are identifiers or
                // numbers, or between a token that ends with "+" or "-" and one
                // that begins with "+" or "-", with a single space, otherwise
                // remove it
                if (
                    previousToken && nextToken && ((
                        isIdentifierOrNumber(previousToken)
                        && isIdentifierOrNumber(nextToken)
                    ) || (
                        Ox.contains([
                           '+', '-', '++', '--'
                        ], getValue(previousToken))
                        && Ox.contains([
                           '+', '-', '++', '--'
                        ], getValue(nextToken))
                    ))
                ) {
                    ret += ' ';
                }
            } else if (token.type != 'comment') {
                // remove comments and leave all other tokens untouched
                ret += getValue(token);
            }
        });
        return ret;
    }
 };
 /*@
 Ox.test <f> Takes JavaScript, runs inline tests, returns results
@*/
 Ox.test = function(file, callback) {
    // Runs the inline examples that Ox.doc extracts from `file` and passes
    // an array of test results to `callback`.
    //   file      passed through to Ox.doc unchanged
    //   callback  called with the array of results after all examples of
    //             all items have been evaluated
    // Each result has the keys actual, expected, name, section, statement
    // and passed.
    // NOTE(review): statements and expected results are run through eval,
    // so this should only be used on trusted source files.
    Ox.doc(file, function(items) {
        var tests = [];
        items.forEach(function(item) {
            // Only items with at least one example that has an expected
            // result are run; for those, every statement is evaluated in
            // order, including statements without an expected result
            item.examples && item.examples.some(function(example) {
                return example.result;
            }) && item.examples.forEach(function(example) {
                Ox.Log('TEST', example.statement);
                // Direct eval: the statement runs in this function's scope
                var actual = eval(example.statement);
                if (example.result) {
                    tests.push({
                        actual: JSON.stringify(actual),
                        expected: example.result,
                        name: item.name,
                        section: item.section,
                        statement: example.statement,
                        // The expected result is evaluated as the right-hand
                        // side of an assignment to Ox.test.result (presumably
                        // so that object literals parse as expressions) and
                        // then compared with the actual value via Ox.isEqual
                        passed: Ox.isEqual(eval(
                            'Ox.test.result = ' + example.result
                        ), actual)
                    });
                }
            });
        });
        callback(tests);
    });
 };
 /*@
 Ox.tokenize <f> Tokenizes JavaScript
    (source) -> <[o]> Array of tokens
        length <n> Length of the token
        offset <n> Offset of the token
        type <s> Type of the token
            Type can be <code>"comment"</code>, <code>"constant"</code>,
            <code>"identifier"</code>, <code>"keyword"</code>,
            <code>"linebreak"</code>, <code>"method"</code>,
            <code>"number"</code>, <code>"object"</code>, 
            <code>"operator"</code>, <code>"property"</code>,
            <code>"regexp"</code>, <code>"string"</code>
            or <code>"whitespace"</code>
    source <s> JavaScript source code
@*/
 // FIXME: constant/method/object/property is of interest
 // for syntax highlighting, but may not belong here
 // FIXME: backport python version
 // FIXME: numbers (hex, exp, etc.)
 Ox.tokenize = (function() {
    // see https://github.com/mozilla/narcissus/blob/master/lib/jslex.js
    // and https://developer.mozilla.org/en/JavaScript/Reference
    var identifier = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ$_',
        linebreak = '\n\r',
        number = '0123456789',
        operator = [
            // arithmetic
            '+', '-', '*', '/', '%', '++', '--',
            // assignment
            '=', '+=', '-=', '*=', '/=', '%=',
            '&=', '|=', '^=', '<<=', '>>=', '>>>=',
            // bitwise
            '&', '|', '^', '~', '<<', '>>', '>>>',
            // comparison
            '==', '!=', '===', '!==', '>', '>=', '<', '<=',
            // conditional
            '?', ':',
            // grouping
            '(', ')', '[', ']', '{', '}',
            // logical
            '&&', '||', '!',
            // other
            '.', ',', ';'
        ],
        regexp = 'abcdefghijklmnopqrstuvwxyz',
        string = '\'"',
        whitespace = ' \t',
        word = {
        constant: [
            // Math
            'E', 'LN2', 'LN10', 'LOG2E', 'LOG10E', 'PI', 'SQRT1_2', 'SQRT2',
            // Number
            'MAX_VALUE', 'MIN_VALUE', 'NEGATIVE_INFINITY', 'POSITIVE_INFINITY'
        ],
            keyword: [
                'break',
                'case', 'catch', 'class', 'const', 'continue',
                'debugger', 'default', 'delete', 'do',
                'else', 'enum', 'export', 'extends',
                'false', 'finally', 'for', 'function',
                'if', 'implements', 'import', 'in', 'instanceof', 'interface',
                'let', 'module',
                'new', 'null',
                'package', 'private', 'protected', 'public',
                'return',
                'super', 'switch', 'static',
                'this', 'throw', 'true', 'try', 'typeof',
                'var', 'void',
                'yield',
                'while', 'with',
            ],
        method: [
            // Array
            'concat',
@ -580,7 +387,8 @@ Ox.tokenize = (function() {
            'match',
            'replace',
            'search', 'slice', 'split', 'substr', 'substring',
-                'toLocaleLowerCase', 'toLocaleUpperCase', 'toLowerCase', 'toUpperCase', 'trim',
+            'toLocaleLowerCase', 'toLocaleUpperCase',
            'toLowerCase', 'toUpperCase', 'trim',
            // Window
            'addEventListener', 'alert', 'atob',
            'blur', 'btoa',
@ -637,137 +445,279 @@ Ox.tokenize = (function() {
            'toolbar', 'top'
        ]
    };
-
+    return function(identifier) {
-    return function(source) {
+        var ret;
-
+        if (Ox.KEYWORDS.indexOf(identifier) > -1) {
-        source = source.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
+            ret = 'keyword'
-
+        } else {
-        var cursor = 0,
+            ret = 'identifier'
-            tokenize = {
+            Ox.forEach(identifiers, function(words, type) {
-                comment: function() {
+                if (words.indexOf(identifier) > -1) {
-                    while (char = source[++cursor]) {
+                    ret = type;
                        if (next == '/' && char == '\n') {
                            break;
                        } else if (next == '*' && char + source[cursor + 1] == '*/') {
                            cursor += 2;
                            break;
                        }
                    }
                },
                identifier: function() {
                    var str;
                    while ((identifier + number).indexOf(source[++cursor]) > -1) {}
                    str = source.slice(start, cursor);
                    Ox.forEach(word, function(value, key) {
                        if (value.indexOf(str) > -1) {
                            type = key;
                    Ox.Break();
                }
            });
                },
                linebreak: function() {
                    while (linebreak.indexOf(source[++cursor]) > -1) {}
                },
                number: function() {
                    while ((number + '.').indexOf(source[++cursor]) > -1) {}
                },
                operator: function() {
                    while (operator.indexOf(char += source[++cursor]) > -1) {}
                },
                regexp: function() {
                    while ((char = source[++cursor]) != '/') {
                        char == '\\' && ++cursor;
                        if (cursor == source.length) {
                            break;
        }
-                    }
+        return ret;
-                    while (regexp.indexOf(source[++cursor]) > -1) {}
+    };
-                },
+})();
                string: function() {
                    var delimiter = char;
                    while ((char = source[++cursor]) != delimiter) {
                        char == '\\' && ++cursor;
                        if (cursor == source.length) {
                            break;
                        }
                    }
                    ++cursor;
                },
                whitespace: function() {
                    while (whitespace.indexOf(source[++cursor]) > -1) {}
                }
            },
            tokens = [],
            type;
-        while (cursor < source.length) {
+/*@
-            var char = source[cursor],
+Ox.minify <f> Minifies JavaScript
-                next = source[cursor + 1],
+    (source) -> <s> Minified JavaScript
-                start = cursor;
+    (file, callback) -> <u> undefined
-            if (char == '/' && (next == '/' || next == '*')) {
+    source <s> JavaScript source
-                type = 'comment';
+    file <s> JavaScript file
-            } else if (identifier.indexOf(char) > -1) {
+    callback <f> Callback function
-                type = 'identifier';
+    > Ox.minify('for (a in b)\n{\t\tc = void 0;\n}')
-            } else if (linebreak.indexOf(char) > -1) {
+    'for(a in b)\n{c=void 0;}'
-                type = 'linebreak';
+    > Ox.minify('return a; return 0; return "";')
-            } else if (number.indexOf(char) > -1) {
+    'return a;return 0;return"";'
-                type = 'number';
+    > Ox.minify('return\na;\nreturn\n0;\nreturn\n"";')
-            } else if (string.indexOf(char) > -1) {
+    'return\na;return\n0;return\n"";'
-                type = 'string';
+@*/
-            } else if (whitespace.indexOf(char) > -1) {
Ox.minify = function() {
    // see https://github.com/douglascrockford/JSMin/blob/master/README
    // and http://inimino.org/~inimino/blog/javascript_semicolons
    // Two call signatures:
    //   (source)          returns the minified source
    //   (file, callback)  loads file via Ox.get, passes the minified source
    //                     to callback, returns undefined
    // The callback has to be captured here: inside the Ox.get handler,
    // `arguments` refers to the handler's own arguments, not to ours
    var callback = arguments[1];
    if (arguments.length == 1) {
        return minify(arguments[0]);
    } else {
        Ox.get(arguments[0], function(source) {
            callback(minify(source));
        });
    }
    function minify(source) {
        // Concatenates the token values returned by Ox.tokenize, dropping
        // comments and all linebreaks and whitespace that are not needed
        // to keep adjacent tokens apart
        // FIXME: a comment that directly separates two identifiers or
        // numbers (like "a/**/b") is removed without inserting a space
        var tokens = Ox.tokenize(source),
            length = tokens.length,
            ret = '';
        tokens.forEach(function(token, i) {
            var next, nextToken, prevToken;
            if (['linebreak', 'whitespace'].indexOf(token.type) > -1) {
                // prevToken is the token immediately before this one,
                // nextToken is the next token that is not a comment,
                // linebreak or whitespace (null at either end of the source)
                prevToken = i == 0 ? null : tokens[i - 1];
                next = i + 1;
                while (
                    next < length && ['comment', 'linebreak', 'whitespace']
                        .indexOf(tokens[next].type) > -1
                ) {
                    next++;
                }
                nextToken = next == length ? null : tokens[next];
            }
            if (token.type == 'linebreak') {
                // replace a linebreak between two tokens that are identifiers
                // or numbers or strings or unary operators or grouping
                // operators with a single newline, otherwise remove it
                if (
                    prevToken && nextToken && (
                        ['identifier', 'number', 'string'].indexOf(prevToken.type) > -1
                        || ['++', '--', ')', ']', '}'].indexOf(prevToken.value) > -1
                    ) && (
                        ['identifier', 'number', 'string'].indexOf(nextToken.type) > -1
                        || ['+', '-', '++', '--', '~', '!', '(', '[', '{'].indexOf(nextToken.value) > -1
                    )
                ) {
                    ret += '\n';
                }
            } else if (token.type == 'whitespace') {
                // replace whitespace between two tokens that are identifiers or
                // numbers, or between a token that ends with "+" or "-" and one
                // that begins with "+" or "-", with a single space, otherwise
                // remove it
                if (
                    prevToken && nextToken && ((
                        ['identifier', 'number'].indexOf(prevToken.type) > -1
                        && ['identifier', 'number'].indexOf(nextToken.type) > -1
                    ) || (
                        ['+', '-', '++', '--'].indexOf(prevToken.value) > -1
                        && ['+', '-', '++', '--'].indexOf(nextToken.value) > -1
                    ))
                ) {
                    ret += ' ';
                }
            } else if (token.type != 'comment') {
                // remove comments and leave all other tokens untouched
                ret += token.value;
            }
        });
        return ret;
    }
};
 /*@
 Ox.test <f> Takes JavaScript, runs inline tests, returns results
@*/
 Ox.test = function(file, callback) {
    // Runs the inline examples that Ox.doc extracts from `file` and passes
    // an array of test results to `callback`.
    //   file      passed through to Ox.doc unchanged
    //   callback  called with the array of results after all examples of
    //             all items have been evaluated
    // Each result has the keys actual, expected, name, section, statement
    // and passed.
    // NOTE(review): statements and expected results are run through eval,
    // so this should only be used on trusted source files.
    Ox.doc(file, function(items) {
        var tests = [];
        items.forEach(function(item) {
            // Only items with at least one example that has an expected
            // result are run; for those, every statement is evaluated in
            // order, including statements without an expected result
            item.examples && item.examples.some(function(example) {
                return example.result;
            }) && item.examples.forEach(function(example) {
                Ox.Log('TEST', example.statement);
                // Direct eval: the statement runs in this function's scope
                var actual = eval(example.statement);
                if (example.result) {
                    tests.push({
                        actual: JSON.stringify(actual),
                        expected: example.result,
                        name: item.name,
                        section: item.section,
                        statement: example.statement,
                        // The expected result is evaluated as the right-hand
                        // side of an assignment to Ox.test.result (presumably
                        // so that object literals parse as expressions) and
                        // then compared with the actual value via Ox.isEqual
                        passed: Ox.isEqual(eval(
                            'Ox.test.result = ' + example.result
                        ), actual)
                    });
                }
            });
        });
        callback(tests);
    });
 };
 /*@
 Ox.tokenize <f> Tokenizes JavaScript
    (source) -> <[o]> Array of tokens
        column <n> Column of the token
        line <n> Line of the token
        type <s> Type of the token
            Type can be <code>"comment"</code>, <code>"identifier"</code>,
            <code>"linebreak"</code>, <code>"number"</code>,
            <code>"operator"</code>, <code>"regexp"</code>,
            <code>"string"</code> or <code>"whitespace"</code>
        value <s> Value of the token
    source <s> JavaScript source code
@*/
 // FIXME: numbers (hex, exp, etc.)
 Ox.tokenize = (function() {
    // see https://github.com/mozilla/narcissus/blob/master/lib/lexer.js
    // Character classes and operator table used by the scanner below
    var comment = ['//', '/*'],
        identifier = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ$_',
        linebreak = '\n\r',
        number = '0123456789',
        operator = [
            // arithmetic
            '+', '-', '*', '/', '%', '++', '--',
            // assignment
            '=', '+=', '-=', '*=', '/=', '%=',
            '&=', '|=', '^=', '<<=', '>>=', '>>>=',
            // bitwise
            '&', '|', '^', '~', '<<', '>>', '>>>',
            // comparison
            '==', '!=', '===', '!==', '>', '>=', '<', '<=',
            // conditional
            '?', ':',
            // grouping
            '(', ')', '[', ']', '{', '}',
            // logical
            '&&', '||', '!',
            // other
            '.', ',', ';'
        ],
        // characters accepted as flags after the closing slash of a RegExp
        regexp = 'abcdefghijklmnopqrstuvwxyz',
        string = '\'"',
        whitespace = ' \t';
    function isRegExp(tokens) {
        // Returns true if the current token is the beginning of a RegExp, as
        // opposed to the beginning of an operator
        var i = tokens.length - 1, isRegExp, token;
        // Scan back to the previous significant token, or to the beginning of
        // the source
        while (i >= 0 && [
            'comment', 'linebreak', 'whitespace'
        ].indexOf(tokens[i].type) > -1) {
            i--;
        }
        if (i == -1) {
            // Source begins with a forward slash
            isRegExp = true;
        } else {
            token = tokens[i];
            // A slash starts a RegExp after a keyword (other than the
            // literals false, null and true) or after an operator that does
            // not end an expression
            isRegExp = (
                token.type == 'identifier'
                && Ox.identify(token.value) == 'keyword'
                && ['false', 'null', 'true'].indexOf(token.value) == -1
            ) || (
                token.type == 'operator'
                && ['++', '--', ')', ']', '}'].indexOf(token.value) == -1
            );
        }
        return isRegExp;
    }
    return function(source) {
        // Returns an array of tokens, each with a zero-based column and line,
        // a type and a value. Scanning stops at the first character that
        // does not start any known token; tokens found so far are returned.
        var char,
            column = 0,
            cursor = 0,
            delimiter,
            length,
            line = 0,
            lines,
            tokens = [],
            start,
            type,
            value;
        // Normalize linebreaks before measuring the source, so that length
        // refers to the string that is actually scanned
        source = source.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
        length = source.length;
        while (cursor < length) {
            start = cursor;
            char = source[cursor];
            if (comment.indexOf(delimiter = char + source[cursor + 1]) > -1) {
                type = 'comment';
                // skip the second character of the opening delimiter
                ++cursor;
                while (char = source[++cursor]) {
                    if (delimiter == '//' && char == '\n') {
                        break;
                    } else if (delimiter == '/*' && char + source[cursor + 1] == '*/') {
                        cursor += 2;
                        break;
                    }
                }
            } else if (identifier.indexOf(char) > -1) {
                type = 'identifier';
                while ((identifier + number).indexOf(source[++cursor]) > -1) {}
            } else if (linebreak.indexOf(char) > -1) {
                type = 'linebreak';
                while (linebreak.indexOf(source[++cursor]) > -1) {}
            } else if (number.indexOf(char) > -1) {
                type = 'number';
                while ((number + '.').indexOf(source[++cursor]) > -1) {}
            } else if (char == '/' && isRegExp(tokens)) {
                type = 'regexp';
                while ((char = source[++cursor]) != '/' && cursor < length) {
                    // a backslash escapes the next character
                    char == '\\' && ++cursor;
                }
                // consume the flags
                while (regexp.indexOf(source[++cursor]) > -1) {}
            } else if (operator.indexOf(char) > -1) {
                type = 'operator';
                // extend the operator for as long as it is a known operator
                while (operator.indexOf(char += source[++cursor]) > -1 && cursor < length) {}
            } else if (string.indexOf(delimiter = char) > -1) {
                type = 'string';
                while ((char = source[++cursor]) != delimiter && cursor < length) {
                    // a backslash escapes the next character
                    char == '\\' && ++cursor;
                }
                // include the closing delimiter
                ++cursor;
            } else if (whitespace.indexOf(char) > -1) {
                type = 'whitespace';
                while (whitespace.indexOf(source[++cursor]) > -1) {}
            } else {
                break;
            }
            value = source.slice(start, cursor);
            tokens.push({column: column, line: line, type: type, value: value});
            // Advance the position: a token that spans lines (linebreaks,
            // multi-line comments, strings with an escaped newline) moves to
            // a new line and restarts the column after its last newline, any
            // other token just advances the column by its length
            lines = value.split('\n');
            if (lines.length > 1) {
                line += lines.length - 1;
                column = lines[lines.length - 1].length;
            } else {
                column += value.length;
            }
        }
        return tokens;
    };
 }());