From 5a2ecca0f197cb10f74968e75246c34e4c3500d8 Mon Sep 17 00:00:00 2001 From: rolux Date: Sat, 26 May 2012 12:54:52 +0200 Subject: [PATCH] improve Ox.tokenize, add Ox.identify --- source/Ox/js/JavaScript.js | 600 +++++++++++++++++-------------------- 1 file changed, 275 insertions(+), 325 deletions(-) diff --git a/source/Ox/js/JavaScript.js b/source/Ox/js/JavaScript.js index 12891043..8bddd034 100644 --- a/source/Ox/js/JavaScript.js +++ b/source/Ox/js/JavaScript.js @@ -134,10 +134,9 @@ Ox.doc = (function() { tokens = []; Ox.tokenize(source).forEach(function(token) { var match; - token.source = source.substr(token.offset, token.length); if (token.type == 'comment' && ( - match = re.multiline.exec(token.source) - || re.singleline.exec(token.source) + match = re.multiline.exec(token.value) + || re.singleline.exec(token.value) )) { blocks.push(match[1]); tokens.push([]); @@ -162,7 +161,9 @@ Ox.doc = (function() { if (/^[A-Z]/.test(item.name)) { // main item // include leading whitespace - item.source = parseTokens(tokens[i]); + item.source = parseTokens(tokens[i]).map(function(token) { + return token.value; + }).join(''); item.line = source.slice(0, item.source[0].offset) .split('\n').length; items.push(item); @@ -303,6 +304,164 @@ Ox.doc = (function() { } }()); +/*@ +Ox.identify Returns the type of a JavaScript identifier + (str) -> Type + Type can be constant, identifier, + keyword, method, object or + property +@*/ +Ox.identify = (function() { + // see https://developer.mozilla.org/en/JavaScript/Reference + var identifiers = { + constant: [ + // Math + 'E', 'LN2', 'LN10', 'LOG2E', 'LOG10E', 'PI', 'SQRT1_2', 'SQRT2', + // Number + 'MAX_VALUE', 'MIN_VALUE', 'NEGATIVE_INFINITY', 'POSITIVE_INFINITY' + ], + method: [ + // Array + 'concat', + 'every', + 'filter', 'forEach', + 'join', + 'lastIndexOf', + 'indexOf', 'isArray', + 'map', + 'pop', 'push', + 'reduce', 'reduceRight', 'reverse', + 'shift', 'slice', 'some', 'sort', 'splice', + 'unshift', + // Date + 'getDate', 'getDay', 'getFullYear', 'getHours', + 'getMilliseconds', 'getMinutes', 'getMonth', 'getSeconds', + 'getTime', 'getTimezoneOffset', + 'getUTCDate', 'getUTCDay', 'getUTCFullYear', 'getUTCHours', + 'getUTCMilliseconds', 'getUTCMinutes', 'getUTCMonth', 'getUTCSeconds', + 'now', + 'parse', + 'setDate', 'setFullYear', 'setHours', 'setMilliseconds', + 'setMinutes', 'setMonth', 'setSeconds', 'setTime', + 'setUTCDate', 'setUTCFullYear', 'setUTCHours', 'setUTCMilliseconds', + 'setUTCMinutes', 'setUTCMonth', 'setUTCSeconds', + 'toDateString', 'toJSON', 'toLocaleDateString', 'toLocaleString', + 'toLocaleTimeString', 'toTimeString', 'toUTCString', + 'UTC', + // Function + 'apply', 'bind', 'call', 'isGenerator', + // JSON + 'parse', 'stringify', + // Math + 'abs', 'acos', 'asin', 'atan', 'atan2', + 'ceil', 'cos', + 'exp', + 'floor', + 'log', + 'max', 'min', + 'pow', + 'random', 'round', + 'sin', 'sqrt', + 'tan', + // Number + 'toExponential', 'toFixed', 'toLocaleString', 'toPrecision', + // Object + 'create', + 'defineProperty', 'defineProperties', + 'freeze', + 'getOwnPropertyDescriptor', 'getOwnPropertyNames', 'getPrototypeOf', + 'hasOwnProperty', + 'isExtensible', 'isFrozen', 'isPrototypeOf', 'isSealed', + 'keys', + 'preventExtensions', 'propertyIsEnumerable', + 'seal', + 'toLocaleString', 'toString', + 'valueOf', + // RegExp + 'exec', 'test', + // String + 'charAt', 'charCodeAt', 'concat', + 'fromCharCode', + 'indexOf', + 'lastIndexOf', 'localeCompare', + 'match', + 'replace', + 'search', 'slice', 'split', 'substr', 'substring', + 'toLocaleLowerCase', 'toLocaleUpperCase', + 'toLowerCase', 'toUpperCase', 'trim', + // Window + 'addEventListener', 'alert', 'atob', + 'blur', 'btoa', + 'clearInterval', 'clearTimeout', 'close', 'confirm', + 'dispatchEvent', + 'escape', + 'find', 'focus', + 'getComputedStyle', 'getSelection', + 'moveBy', 'moveTo', + 'open', + 'postMessage', 'print', 'prompt', + 'removeEventListener', 'resizeBy', 'resizeTo', + 'scroll', 'scrollBy', 'scrollTo', + 'setCursor', 'setInterval', 'setTimeout', 'stop', + 'unescape' + ], + object: [ + 'Array', + 'Boolean', + 'Date', 'decodeURI', 'decodeURIComponent', + 'encodeURI', 'encodeURIComponent', 'Error', 'eval', 'EvalError', + 'Function', + 'Infinity', 'isFinite', 'isNaN', + 'JSON', + 'Math', + 'NaN', 'Number', + 'Object', + 'parseFloat', 'parseInt', + 'RangeError', 'ReferenceError', 'RegExp', + 'String', 'SyntaxError', + 'TypeError', + 'undefined', 'URIError', + 'window' + ], + property: [ + // Function + 'constructor', 'length', 'prototype', + // RegExp + 'global', 'ignoreCase', 'lastIndex', 'multiline', 'source', + // Window + 'applicationCache', + 'closed', 'console', 'content', 'crypto', + 'defaultStatus', 'document', + 'frameElement', 'frames', + 'history', + 'innerHeight', 'innerWidth', + 'length', 'location', 'locationbar', 'localStorage', + 'menubar', + 'name', 'navigator', + 'opener', 'outerHeight', 'outerWidth', + 'pageXOffset', 'pageYOffset', 'parent', 'personalbar', + 'screen', 'screenX', 'screenY', 'scrollbars', 'scrollX', 'scrollY', + 'self', 'sessionStorage', 'status', 'statusbar', + 'toolbar', 'top' + ] + }; + return function(identifier) { + var ret; + if (Ox.KEYWORDS.indexOf(identifier) > -1) { + ret = 'keyword' + } else { + ret = 'identifier' + Ox.forEach(identifiers, function(words, type) { + if (words.indexOf(identifier) > -1) { + ret = type; + Ox.Break(); + } + }); + } + return ret; + }; +})(); + /*@ Ox.minify Minifies JavaScript (source) -> Minified JavaScript @@ -327,36 +486,18 @@ Ox.minify = function() { arguments[1](minify(source)); }); } - function isCommentOrLinebreakOrWhitespace(token) { - return token.type == 'comment' || isLinebreakOrWhitespace(token); - } - function isIdentifierOrNumber(token) { - return Ox.contains([ - 'constant', 'identifier', 'keyword', - 'number', 'method', 'object', 'property' - ], token.type); - } - function isIdentifierOrNumberOrString(token) { - return isIdentifierOrNumber(token) || token.type == 'string'; - } - function isLinebreakOrWhitespace(token) { - return Ox.contains(['linebreak', 'whitespace'], token.type); - } function minify(source) { var tokens = Ox.tokenize(source), length = tokens.length, ret = ''; - function getValue(token) { - return source.substr(token.offset, token.length); - } tokens.forEach(function(token, i) { - var next, nextToken, previousToken; - if (isLinebreakOrWhitespace(token)) { - previousToken = i == 0 ? null : tokens[i - 1]; + var next, nextToken, prevToken; + if (['linebreak', 'whitespace'].indexOf(token.type) > -1) { + prevToken = i == 0 ? null : tokens[i - 1]; next = i + 1; while ( - next < length - && isCommentOrLinebreakOrWhitespace(tokens[next]) + next < length && ['comment', 'linebreak', 'whitespace'] + .indexOf(tokens[next].type) > -1 ) { next++; } @@ -367,16 +508,12 @@ Ox.minify = function() { // or numbers or strings or unary operators or grouping // operators with a single newline, otherwise remove it if ( - previousToken && nextToken && ( - isIdentifierOrNumberOrString(previousToken) - || Ox.contains([ - '++', '--', ')', ']', '}' - ], getValue(previousToken)) + prevToken && nextToken && ( + ['identifier', 'number', 'string'].indexOf(prevToken.type) > -1 + || ['++', '--', ')', ']', '}'].indexOf(prevToken.value) > -1 ) && ( - isIdentifierOrNumberOrString(nextToken) - || Ox.contains([ - '+', '-', '++', '--', '~', '!', '(', '[', '{' - ], getValue(nextToken)) + ['identifier', 'number', 'string'].indexOf(nextToken.type) > -1 + || ['+', '-', '++', '--', '~', '!', '(', '[', '{'].indexOf(nextToken.value) > -1 ) ) { ret += '\n'; @@ -387,23 +524,19 @@ Ox.minify = function() { // that begins with "+" or "-", with a single space, otherwise // remove it if ( - previousToken && nextToken && (( - isIdentifierOrNumber(previousToken) - && isIdentifierOrNumber(nextToken) + prevToken && nextToken && (( + ['identifier', 'number'].indexOf(prevToken.type) > -1 + && ['identifier', 'number'].indexOf(nextToken.type) > -1 ) || ( - Ox.contains([ - '+', '-', '++', '--' - ], getValue(previousToken)) - && Ox.contains([ - '+', '-', '++', '--' - ], getValue(nextToken)) + ['+', '-', '++', '--'].indexOf(prevToken.value) > -1 + && ['+', '-', '++', '--'].indexOf(nextToken.value) > -1 )) ) { ret += ' '; } } else if (token.type != 'comment') { // remove comments and leave all other tokens untouched - ret += getValue(token); + ret += token.value; } }); return ret; @@ -443,28 +576,23 @@ Ox.test = function(file, callback) { /*@ Ox.tokenize Tokenizes JavaScript (source) -> <[o]> Array of tokens - length Length of the token - offset Offset of the token + column Column of the token + line Line of the token type Type of the token - Type can be "comment", "constant", - "identifier", "keyword", - "linebreak", "method", - "number", "object", - "operator", "property", - "regexp", "string" - or "whitespace" + Type can be "comment", "identifier", + "linebreak", "number", + "operator", "regexp", + "string" or "whitespace" + value Value of the token source JavaScript source code @*/ -// FIXME: constant/method/object/property is of interest -// for syntax highlighting, but may not belong here -// FIXME: backport python version // FIXME: numbers (hex, exp, etc.) Ox.tokenize = (function() { - // see https://github.com/mozilla/narcissus/blob/master/lib/jslex.js - // and https://developer.mozilla.org/en/JavaScript/Reference + // see https://github.com/mozilla/narcissus/blob/master/lib/lexer.js - var identifier = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ$_', + var comment = ['//', '/*'], + identifier = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ$_', linebreak = '\n\r', number = '0123456789', operator = [ @@ -488,286 +616,108 @@ Ox.tokenize = (function() { ], regexp = 'abcdefghijklmnopqrstuvwxyz', string = '\'"', - whitespace = ' \t', - word = { - constant: [ - // Math - 'E', 'LN2', 'LN10', 'LOG2E', 'LOG10E', 'PI', 'SQRT1_2', 'SQRT2', - // Number - 'MAX_VALUE', 'MIN_VALUE', 'NEGATIVE_INFINITY', 'POSITIVE_INFINITY' - ], - keyword: [ - 'break', - 'case', 'catch', 'class', 'const', 'continue', - 'debugger', 'default', 'delete', 'do', - 'else', 'enum', 'export', 'extends', - 'false', 'finally', 'for', 'function', - 'if', 'implements', 'import', 'in', 'instanceof', 'interface', - 'let', 'module', - 'new', 'null', - 'package', 'private', 'protected', 'public', - 'return', - 'super', 'switch', 'static', - 'this', 'throw', 'true', 'try', 'typeof', - 'var', 'void', - 'yield', - 'while', 'with', - ], - method: [ - // Array - 'concat', - 'every', - 'filter', 'forEach', - 'join', - 'lastIndexOf', - 'indexOf', 'isArray', - 'map', - 'pop', 'push', - 'reduce', 'reduceRight', 'reverse', - 'shift', 'slice', 'some', 'sort', 'splice', - 'unshift', - // Date - 'getDate', 'getDay', 'getFullYear', 'getHours', - 'getMilliseconds', 'getMinutes', 'getMonth', 'getSeconds', - 'getTime', 'getTimezoneOffset', - 'getUTCDate', 'getUTCDay', 'getUTCFullYear', 'getUTCHours', - 'getUTCMilliseconds', 'getUTCMinutes', 'getUTCMonth', 'getUTCSeconds', - 'now', - 'parse', - 'setDate', 'setFullYear', 'setHours', 'setMilliseconds', - 'setMinutes', 'setMonth', 'setSeconds', 'setTime', - 'setUTCDate', 'setUTCFullYear', 'setUTCHours', 'setUTCMilliseconds', - 'setUTCMinutes', 'setUTCMonth', 'setUTCSeconds', - 'toDateString', 'toJSON', 'toLocaleDateString', 'toLocaleString', - 'toLocaleTimeString', 'toTimeString', 'toUTCString', - 'UTC', - // Function - 'apply', 'bind', 'call', 'isGenerator', - // JSON - 'parse', 'stringify', - // Math - 'abs', 'acos', 'asin', 'atan', 'atan2', - 'ceil', 'cos', - 'exp', - 'floor', - 'log', - 'max', 'min', - 'pow', - 'random', 'round', - 'sin', 'sqrt', - 'tan', - // Number - 'toExponential', 'toFixed', 'toLocaleString', 'toPrecision', - // Object - 'create', - 'defineProperty', 'defineProperties', - 'freeze', - 'getOwnPropertyDescriptor', 'getOwnPropertyNames', 'getPrototypeOf', - 'hasOwnProperty', - 'isExtensible', 'isFrozen', 'isPrototypeOf', 'isSealed', - 'keys', - 'preventExtensions', 'propertyIsEnumerable', - 'seal', - 'toLocaleString', 'toString', - 'valueOf', - // RegExp - 'exec', 'test', - // String - 'charAt', 'charCodeAt', 'concat', - 'fromCharCode', - 'indexOf', - 'lastIndexOf', 'localeCompare', - 'match', - 'replace', - 'search', 'slice', 'split', 'substr', 'substring', - 'toLocaleLowerCase', 'toLocaleUpperCase', 'toLowerCase', 'toUpperCase', 'trim', - // Window - 'addEventListener', 'alert', 'atob', - 'blur', 'btoa', - 'clearInterval', 'clearTimeout', 'close', 'confirm', - 'dispatchEvent', - 'escape', - 'find', 'focus', - 'getComputedStyle', 'getSelection', - 'moveBy', 'moveTo', - 'open', - 'postMessage', 'print', 'prompt', - 'removeEventListener', 'resizeBy', 'resizeTo', - 'scroll', 'scrollBy', 'scrollTo', - 'setCursor', 'setInterval', 'setTimeout', 'stop', - 'unescape' - ], - object: [ - 'Array', - 'Boolean', - 'Date', 'decodeURI', 'decodeURIComponent', - 'encodeURI', 'encodeURIComponent', 'Error', 'eval', 'EvalError', - 'Function', - 'Infinity', 'isFinite', 'isNaN', - 'JSON', - 'Math', - 'NaN', 'Number', - 'Object', - 'parseFloat', 'parseInt', - 'RangeError', 'ReferenceError', 'RegExp', - 'String', 'SyntaxError', - 'TypeError', - 'undefined', 'URIError', - 'window' - ], - property: [ - // Function - 'constructor', 'length', 'prototype', - // RegExp - 'global', 'ignoreCase', 'lastIndex', 'multiline', 'source', - // Window - 'applicationCache', - 'closed', 'console', 'content', 'crypto', - 'defaultStatus', 'document', - 'frameElement', 'frames', - 'history', - 'innerHeight', 'innerWidth', - 'length', 'location', 'locationbar', 'localStorage', - 'menubar', - 'name', 'navigator', - 'opener', 'outerHeight', 'outerWidth', - 'pageXOffset', 'pageYOffset', 'parent', 'personalbar', - 'screen', 'screenX', 'screenY', 'scrollbars', 'scrollX', 'scrollY', - 'self', 'sessionStorage', 'status', 'statusbar', - 'toolbar', 'top' - ] - }; + whitespace = ' \t'; + + function isRegExp(tokens) { + // Returns true if the current token is the beginning of a RegExp, as + // opposed to the beginning of an operator + var i = tokens.length - 1, isRegExp, token + // Scan back to the previous significant token, or to the beginning of + // the source + while (i >= 0 && [ + 'comment', 'linebreak', 'whitespace' + ].indexOf(tokens[i].type) > -1) { + i--; + } + if (i == -1) { + // Source begins with a forward slash + isRegExp = true; + } else { + token = tokens[i]; + isRegExp = ( + token.type == 'identifier' + && Ox.identify(token.value) == 'keyword' + && ['false', 'null', 'true'].indexOf(token.value) == -1 + ) || ( + token.type == 'operator' + && ['++', '--', ')', ']', '}'].indexOf(token.value) == -1 + ) + } + return isRegExp; + } return function(source) { - - source = source.replace(/\r\n/g, '\n').replace(/\r/g, '\n'); - - var cursor = 0, - tokenize = { - comment: function() { - while (char = source[++cursor]) { - if (next == '/' && char == '\n') { - break; - } else if (next == '*' && char + source[cursor + 1] == '*/') { - cursor += 2; - break; - } - } - }, - identifier: function() { - var str; - while ((identifier + number).indexOf(source[++cursor]) > -1) {} - str = source.slice(start, cursor); - Ox.forEach(word, function(value, key) { - if (value.indexOf(str) > -1) { - type = key; - Ox.Break(); - } - }); - }, - linebreak: function() { - while (linebreak.indexOf(source[++cursor]) > -1) {} - }, - number: function() { - while ((number + '.').indexOf(source[++cursor]) > -1) {} - }, - operator: function() { - while (operator.indexOf(char += source[++cursor]) > -1) {} - }, - regexp: function() { - while ((char = source[++cursor]) != '/') { - char == '\\' && ++cursor; - if (cursor == source.length) { - break; - } - } - while (regexp.indexOf(source[++cursor]) > -1) {} - }, - string: function() { - var delimiter = char; - while ((char = source[++cursor]) != delimiter) { - char == '\\' && ++cursor; - if (cursor == source.length) { - break; - } - } - ++cursor; - }, - whitespace: function() { - while (whitespace.indexOf(source[++cursor]) > -1) {} - } - }, + var char, + column = 0, + cursor = 0, + delimiter, + length = source.length, + line = 0, + lines, + next, tokens = [], - type; - - while (cursor < source.length) { - var char = source[cursor], - next = source[cursor + 1], - start = cursor; - if (char == '/' && (next == '/' || next == '*')) { + start, + type, + value; + source = source.replace(/\r\n/g, '\n').replace(/\r/g, '\n'); + while (cursor < length) { + start = cursor; + char = source[cursor]; + if (comment.indexOf(delimiter = char + source[cursor + 1]) > -1) { type = 'comment'; + ++cursor; + while (char = source[++cursor]) { + if (delimiter == '//' && char == '\n') { + break; + } else if (delimiter == '/*' && char + source[cursor + 1] == '*/') { + cursor += 2; + break; + } + } } else if (identifier.indexOf(char) > -1) { type = 'identifier'; + while ((identifier + number).indexOf(source[++cursor]) > -1) {} } else if (linebreak.indexOf(char) > -1) { type = 'linebreak'; + while (linebreak.indexOf(source[++cursor]) > -1) {} } else if (number.indexOf(char) > -1) { type = 'number'; - } else if (string.indexOf(char) > -1) { - type = 'string'; - } else if (whitespace.indexOf(char) > -1) { - type = 'whitespace'; - } else if (char == '/') { - type = isRegExp() ? 'regexp' : 'operator'; + while ((number + '.').indexOf(source[++cursor]) > -1) {} + } else if (char == '/' && isRegExp(tokens)) { + type = 'regexp'; + while ((char = source[++cursor]) != '/' && cursor < length) { + char == '\\' && ++cursor; + } + while (regexp.indexOf(source[++cursor]) > -1) {} } else if (operator.indexOf(char) > -1) { type = 'operator'; - } - tokenize[type](); - tokens.push({ - length: cursor - start, - offset: start, - type: type - }); - } - - function isRegExp() { - // checks if a forward slash is the beginning of a regexp, - // as opposed to the beginning of an operator - // see http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html#regular-expressions - var index = tokens.length, - isRegExp = false, - offset = 0, - prevToken, - prevString; - // scan back to the previous significant token, - // or the beginning of the source - while ( - tokens[--index] !== void 0 && [ - 'comment', 'linebreak', 'whitespace' - ].indexOf(tokens[index].type) > -1 - ) { - offset += tokens[index].length; - } - if (typeof tokens[index] == 'undefined') { - // source begins with forward slash - isRegExp = true; + while (operator.indexOf(char += source[++cursor]) > -1 && cursor < length) {} + } else if (string.indexOf(delimiter = char) > -1) { + type = 'string'; + while ((char = source[++cursor]) != delimiter && cursor < length) { + char == '\\' && ++cursor; + } + ++cursor; + } else if (whitespace.indexOf(char) > -1) { + type = 'whitespace'; + while (whitespace.indexOf(source[++cursor]) > -1) {} } else { - prevToken = tokens[index]; - prevString = source.substr( - cursor - prevToken.length - offset, prevToken.length - ); - isRegExp = ( - prevToken.type == 'keyword' - && ['false', 'null', 'true'].indexOf(prevString) == -1 - ) || ( - prevToken.type == 'operator' - && ['++', '--', ')', ']', '}'].indexOf(prevString) == -1 - ); + break; + } + value = source.slice(start, cursor); + tokens.push({column: column, line: line, type: type, value: value}); + if (type == 'comment') { + lines = value.split('\n'); + column = lines[lines.length - 1].length; + line += lines.length - 1; + } else if (type == 'linebreak') { + column = 0; + line += value.length; + } else { + column += value.length; } - return isRegExp; } - return tokens; - }; }());