improve Ox.tokenize, add Ox.identify

This commit is contained in:
rolux 2012-05-26 12:54:52 +02:00
parent 08aedd34fd
commit 5a2ecca0f1

View file

@ -134,10 +134,9 @@ Ox.doc = (function() {
tokens = []; tokens = [];
Ox.tokenize(source).forEach(function(token) { Ox.tokenize(source).forEach(function(token) {
var match; var match;
token.source = source.substr(token.offset, token.length);
if (token.type == 'comment' && ( if (token.type == 'comment' && (
match = re.multiline.exec(token.source) match = re.multiline.exec(token.value)
|| re.singleline.exec(token.source) || re.singleline.exec(token.value)
)) { )) {
blocks.push(match[1]); blocks.push(match[1]);
tokens.push([]); tokens.push([]);
@ -162,7 +161,9 @@ Ox.doc = (function() {
if (/^[A-Z]/.test(item.name)) { if (/^[A-Z]/.test(item.name)) {
// main item // main item
// include leading whitespace // include leading whitespace
item.source = parseTokens(tokens[i]); item.source = parseTokens(tokens[i]).map(function(token) {
return token.value;
}).join('');
item.line = source.slice(0, item.source[0].offset) item.line = source.slice(0, item.source[0].offset)
.split('\n').length; .split('\n').length;
items.push(item); items.push(item);
@ -303,6 +304,164 @@ Ox.doc = (function() {
} }
}()); }());
/*@
Ox.identify <f> Returns the type of a JavaScript identifier
(str) -> <s> Type
Type can be <code>constant</code>, <code>identifier</code>,
<code>keyword</code>, <code>method</code>, <code>object</code> or
<code>property</code>
@*/
Ox.identify = (function() {
    // Lookup tables of well-known JavaScript names, grouped by the type that
    // gets reported for them. Within a group, names are arranged by the
    // built-in they belong to, so a name may be listed more than once (for
    // example 'parse' under both Date and JSON); this does not affect lookup.
    // see https://developer.mozilla.org/en/JavaScript/Reference
    var identifiers = {
        constant: [
            // Math
            'E', 'LN2', 'LN10', 'LOG2E', 'LOG10E', 'PI', 'SQRT1_2', 'SQRT2',
            // Number
            'MAX_VALUE', 'MIN_VALUE', 'NEGATIVE_INFINITY', 'POSITIVE_INFINITY'
        ],
        method: [
            // Array
            'concat',
            'every',
            'filter', 'forEach',
            'join',
            'lastIndexOf',
            'indexOf', 'isArray',
            'map',
            'pop', 'push',
            'reduce', 'reduceRight', 'reverse',
            'shift', 'slice', 'some', 'sort', 'splice',
            'unshift',
            // Date
            'getDate', 'getDay', 'getFullYear', 'getHours',
            'getMilliseconds', 'getMinutes', 'getMonth', 'getSeconds',
            'getTime', 'getTimezoneOffset',
            'getUTCDate', 'getUTCDay', 'getUTCFullYear', 'getUTCHours',
            'getUTCMilliseconds', 'getUTCMinutes', 'getUTCMonth', 'getUTCSeconds',
            'now',
            'parse',
            'setDate', 'setFullYear', 'setHours', 'setMilliseconds',
            'setMinutes', 'setMonth', 'setSeconds', 'setTime',
            'setUTCDate', 'setUTCFullYear', 'setUTCHours', 'setUTCMilliseconds',
            'setUTCMinutes', 'setUTCMonth', 'setUTCSeconds',
            'toDateString', 'toJSON', 'toLocaleDateString', 'toLocaleString',
            'toLocaleTimeString', 'toTimeString', 'toUTCString',
            'UTC',
            // Function
            'apply', 'bind', 'call', 'isGenerator',
            // JSON
            'parse', 'stringify',
            // Math
            'abs', 'acos', 'asin', 'atan', 'atan2',
            'ceil', 'cos',
            'exp',
            'floor',
            'log',
            'max', 'min',
            'pow',
            'random', 'round',
            'sin', 'sqrt',
            'tan',
            // Number
            'toExponential', 'toFixed', 'toLocaleString', 'toPrecision',
            // Object
            'create',
            'defineProperty', 'defineProperties',
            'freeze',
            'getOwnPropertyDescriptor', 'getOwnPropertyNames', 'getPrototypeOf',
            'hasOwnProperty',
            'isExtensible', 'isFrozen', 'isPrototypeOf', 'isSealed',
            'keys',
            'preventExtensions', 'propertyIsEnumerable',
            'seal',
            'toLocaleString', 'toString',
            'valueOf',
            // RegExp
            'exec', 'test',
            // String
            'charAt', 'charCodeAt', 'concat',
            'fromCharCode',
            'indexOf',
            'lastIndexOf', 'localeCompare',
            'match',
            'replace',
            'search', 'slice', 'split', 'substr', 'substring',
            'toLocaleLowerCase', 'toLocaleUpperCase',
            'toLowerCase', 'toUpperCase', 'trim',
            // Window
            'addEventListener', 'alert', 'atob',
            'blur', 'btoa',
            'clearInterval', 'clearTimeout', 'close', 'confirm',
            'dispatchEvent',
            'escape',
            'find', 'focus',
            'getComputedStyle', 'getSelection',
            'moveBy', 'moveTo',
            'open',
            'postMessage', 'print', 'prompt',
            'removeEventListener', 'resizeBy', 'resizeTo',
            'scroll', 'scrollBy', 'scrollTo',
            'setCursor', 'setInterval', 'setTimeout', 'stop',
            'unescape'
        ],
        object: [
            'Array',
            'Boolean',
            'Date', 'decodeURI', 'decodeURIComponent',
            'encodeURI', 'encodeURIComponent', 'Error', 'eval', 'EvalError',
            'Function',
            'Infinity', 'isFinite', 'isNaN',
            'JSON',
            'Math',
            'NaN', 'Number',
            'Object',
            'parseFloat', 'parseInt',
            'RangeError', 'ReferenceError', 'RegExp',
            'String', 'SyntaxError',
            'TypeError',
            'undefined', 'URIError',
            'window'
        ],
        property: [
            // Function
            'constructor', 'length', 'prototype',
            // RegExp
            'global', 'ignoreCase', 'lastIndex', 'multiline', 'source',
            // Window
            'applicationCache',
            'closed', 'console', 'content', 'crypto',
            'defaultStatus', 'document',
            'frameElement', 'frames',
            'history',
            'innerHeight', 'innerWidth',
            'length', 'location', 'locationbar', 'localStorage',
            'menubar',
            'name', 'navigator',
            'opener', 'outerHeight', 'outerWidth',
            'pageXOffset', 'pageYOffset', 'parent', 'personalbar',
            'screen', 'screenX', 'screenY', 'scrollbars', 'scrollX', 'scrollY',
            'self', 'sessionStorage', 'status', 'statusbar',
            'toolbar', 'top'
        ]
    };
    // Classifies a single name. Language keywords (as listed in Ox.KEYWORDS)
    // are checked first and always yield 'keyword'; otherwise the first group
    // of the table above that lists the name determines the result, and names
    // found in no group are reported as plain 'identifier'.
    return function(identifier) {
        var ret;
        if (Ox.KEYWORDS.indexOf(identifier) > -1) {
            ret = 'keyword';
        } else {
            // default for names that no group below claims
            ret = 'identifier';
            Ox.forEach(identifiers, function(words, type) {
                if (words.indexOf(identifier) > -1) {
                    ret = type;
                    // first matching group wins, stop iterating
                    Ox.Break();
                }
            });
        }
        return ret;
    };
}());
/*@ /*@
Ox.minify <f> Minifies JavaScript Ox.minify <f> Minifies JavaScript
(source) -> <s> Minified JavaScript (source) -> <s> Minified JavaScript
@ -327,36 +486,18 @@ Ox.minify = function() {
arguments[1](minify(source)); arguments[1](minify(source));
}); });
} }
function isCommentOrLinebreakOrWhitespace(token) {
return token.type == 'comment' || isLinebreakOrWhitespace(token);
}
function isIdentifierOrNumber(token) {
return Ox.contains([
'constant', 'identifier', 'keyword',
'number', 'method', 'object', 'property'
], token.type);
}
function isIdentifierOrNumberOrString(token) {
return isIdentifierOrNumber(token) || token.type == 'string';
}
function isLinebreakOrWhitespace(token) {
return Ox.contains(['linebreak', 'whitespace'], token.type);
}
function minify(source) { function minify(source) {
var tokens = Ox.tokenize(source), var tokens = Ox.tokenize(source),
length = tokens.length, length = tokens.length,
ret = ''; ret = '';
function getValue(token) {
return source.substr(token.offset, token.length);
}
tokens.forEach(function(token, i) { tokens.forEach(function(token, i) {
var next, nextToken, previousToken; var next, nextToken, prevToken;
if (isLinebreakOrWhitespace(token)) { if (['linebreak', 'whitespace'].indexOf(token.type) > -1) {
previousToken = i == 0 ? null : tokens[i - 1]; prevToken = i == 0 ? null : tokens[i - 1];
next = i + 1; next = i + 1;
while ( while (
next < length next < length && ['comment', 'linebreak', 'whitespace']
&& isCommentOrLinebreakOrWhitespace(tokens[next]) .indexOf(tokens[next].type) > -1
) { ) {
next++; next++;
} }
@ -367,16 +508,12 @@ Ox.minify = function() {
// or numbers or strings or unary operators or grouping // or numbers or strings or unary operators or grouping
// operators with a single newline, otherwise remove it // operators with a single newline, otherwise remove it
if ( if (
previousToken && nextToken && ( prevToken && nextToken && (
isIdentifierOrNumberOrString(previousToken) ['identifier', 'number', 'string'].indexOf(prevToken.type) > -1
|| Ox.contains([ || ['++', '--', ')', ']', '}'].indexOf(prevToken.value) > -1
'++', '--', ')', ']', '}'
], getValue(previousToken))
) && ( ) && (
isIdentifierOrNumberOrString(nextToken) ['identifier', 'number', 'string'].indexOf(nextToken.type) > -1
|| Ox.contains([ || ['+', '-', '++', '--', '~', '!', '(', '[', '{'].indexOf(nextToken.value) > -1
'+', '-', '++', '--', '~', '!', '(', '[', '{'
], getValue(nextToken))
) )
) { ) {
ret += '\n'; ret += '\n';
@ -387,23 +524,19 @@ Ox.minify = function() {
// that begins with "+" or "-", with a single space, otherwise // that begins with "+" or "-", with a single space, otherwise
// remove it // remove it
if ( if (
previousToken && nextToken && (( prevToken && nextToken && ((
isIdentifierOrNumber(previousToken) ['identifier', 'number'].indexOf(prevToken.type) > -1
&& isIdentifierOrNumber(nextToken) && ['identifier', 'number'].indexOf(nextToken.type) > -1
) || ( ) || (
Ox.contains([ ['+', '-', '++', '--'].indexOf(prevToken.value) > -1
'+', '-', '++', '--' && ['+', '-', '++', '--'].indexOf(nextToken.value) > -1
], getValue(previousToken))
&& Ox.contains([
'+', '-', '++', '--'
], getValue(nextToken))
)) ))
) { ) {
ret += ' '; ret += ' ';
} }
} else if (token.type != 'comment') { } else if (token.type != 'comment') {
// remove comments and leave all other tokens untouched // remove comments and leave all other tokens untouched
ret += getValue(token); ret += token.value;
} }
}); });
return ret; return ret;
@ -443,28 +576,23 @@ Ox.test = function(file, callback) {
/*@ /*@
Ox.tokenize <f> Tokenizes JavaScript Ox.tokenize <f> Tokenizes JavaScript
(source) -> <[o]> Array of tokens (source) -> <[o]> Array of tokens
length <n> Length of the token column <n> Column of the token
offset <n> Offset of the token line <n> Line of the token
type <s> Type of the token type <s> Type of the token
Type can be <code>"comment"</code>, <code>"constant"</code>, Type can be <code>"comment"</code>, <code>"identifier"</code>,
<code>"identifier"</code>, <code>"keyword"</code>, <code>"linebreak"</code>, <code>"number"</code>,
<code>"linebreak"</code>, <code>"method"</code>, <code>"operator"</code>, <code>"regexp"</code>,
<code>"number"</code>, <code>"object"</code>, <code>"string"</code> or <code>"whitespace"</code>
<code>"operator"</code>, <code>"property"</code>, value <s> Value of the token
<code>"regexp"</code>, <code>"string"</code>
or <code>"whitespace"</code>
source <s> JavaScript source code source <s> JavaScript source code
@*/ @*/
// FIXME: constant/method/object/property is of interest
// for syntax highlighting, but may not belong here
// FIXME: backport python version
// FIXME: numbers (hex, exp, etc.) // FIXME: numbers (hex, exp, etc.)
Ox.tokenize = (function() { Ox.tokenize = (function() {
// see https://github.com/mozilla/narcissus/blob/master/lib/jslex.js // see https://github.com/mozilla/narcissus/blob/master/lib/lexer.js
// and https://developer.mozilla.org/en/JavaScript/Reference
var identifier = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ$_', var comment = ['//', '/*'],
identifier = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ$_',
linebreak = '\n\r', linebreak = '\n\r',
number = '0123456789', number = '0123456789',
operator = [ operator = [
@ -488,286 +616,108 @@ Ox.tokenize = (function() {
], ],
regexp = 'abcdefghijklmnopqrstuvwxyz', regexp = 'abcdefghijklmnopqrstuvwxyz',
string = '\'"', string = '\'"',
whitespace = ' \t', whitespace = ' \t';
word = {
constant: [ function isRegExp(tokens) {
// Math // Returns true if the current token is the beginning of a RegExp, as
'E', 'LN2', 'LN10', 'LOG2E', 'LOG10E', 'PI', 'SQRT1_2', 'SQRT2', // opposed to the beginning of an operator
// Number var i = tokens.length - 1, isRegExp, token
'MAX_VALUE', 'MIN_VALUE', 'NEGATIVE_INFINITY', 'POSITIVE_INFINITY' // Scan back to the previous significant token, or to the beginning of
], // the source
keyword: [ while (i >= 0 && [
'break', 'comment', 'linebreak', 'whitespace'
'case', 'catch', 'class', 'const', 'continue', ].indexOf(tokens[i].type) > -1) {
'debugger', 'default', 'delete', 'do', i--;
'else', 'enum', 'export', 'extends', }
'false', 'finally', 'for', 'function', if (i == -1) {
'if', 'implements', 'import', 'in', 'instanceof', 'interface', // Source begins with a forward slash
'let', 'module', isRegExp = true;
'new', 'null', } else {
'package', 'private', 'protected', 'public', token = tokens[i];
'return', isRegExp = (
'super', 'switch', 'static', token.type == 'identifier'
'this', 'throw', 'true', 'try', 'typeof', && Ox.identify(token.value) == 'keyword'
'var', 'void', && ['false', 'null', 'true'].indexOf(token.value) == -1
'yield', ) || (
'while', 'with', token.type == 'operator'
], && ['++', '--', ')', ']', '}'].indexOf(token.value) == -1
method: [ )
// Array }
'concat', return isRegExp;
'every', }
'filter', 'forEach',
'join',
'lastIndexOf',
'indexOf', 'isArray',
'map',
'pop', 'push',
'reduce', 'reduceRight', 'reverse',
'shift', 'slice', 'some', 'sort', 'splice',
'unshift',
// Date
'getDate', 'getDay', 'getFullYear', 'getHours',
'getMilliseconds', 'getMinutes', 'getMonth', 'getSeconds',
'getTime', 'getTimezoneOffset',
'getUTCDate', 'getUTCDay', 'getUTCFullYear', 'getUTCHours',
'getUTCMilliseconds', 'getUTCMinutes', 'getUTCMonth', 'getUTCSeconds',
'now',
'parse',
'setDate', 'setFullYear', 'setHours', 'setMilliseconds',
'setMinutes', 'setMonth', 'setSeconds', 'setTime',
'setUTCDate', 'setUTCFullYear', 'setUTCHours', 'setUTCMilliseconds',
'setUTCMinutes', 'setUTCMonth', 'setUTCSeconds',
'toDateString', 'toJSON', 'toLocaleDateString', 'toLocaleString',
'toLocaleTimeString', 'toTimeString', 'toUTCString',
'UTC',
// Function
'apply', 'bind', 'call', 'isGenerator',
// JSON
'parse', 'stringify',
// Math
'abs', 'acos', 'asin', 'atan', 'atan2',
'ceil', 'cos',
'exp',
'floor',
'log',
'max', 'min',
'pow',
'random', 'round',
'sin', 'sqrt',
'tan',
// Number
'toExponential', 'toFixed', 'toLocaleString', 'toPrecision',
// Object
'create',
'defineProperty', 'defineProperties',
'freeze',
'getOwnPropertyDescriptor', 'getOwnPropertyNames', 'getPrototypeOf',
'hasOwnProperty',
'isExtensible', 'isFrozen', 'isPrototypeOf', 'isSealed',
'keys',
'preventExtensions', 'propertyIsEnumerable',
'seal',
'toLocaleString', 'toString',
'valueOf',
// RegExp
'exec', 'test',
// String
'charAt', 'charCodeAt', 'concat',
'fromCharCode',
'indexOf',
'lastIndexOf', 'localeCompare',
'match',
'replace',
'search', 'slice', 'split', 'substr', 'substring',
'toLocaleLowerCase', 'toLocaleUpperCase', 'toLowerCase', 'toUpperCase', 'trim',
// Window
'addEventListener', 'alert', 'atob',
'blur', 'btoa',
'clearInterval', 'clearTimeout', 'close', 'confirm',
'dispatchEvent',
'escape',
'find', 'focus',
'getComputedStyle', 'getSelection',
'moveBy', 'moveTo',
'open',
'postMessage', 'print', 'prompt',
'removeEventListener', 'resizeBy', 'resizeTo',
'scroll', 'scrollBy', 'scrollTo',
'setCursor', 'setInterval', 'setTimeout', 'stop',
'unescape'
],
object: [
'Array',
'Boolean',
'Date', 'decodeURI', 'decodeURIComponent',
'encodeURI', 'encodeURIComponent', 'Error', 'eval', 'EvalError',
'Function',
'Infinity', 'isFinite', 'isNaN',
'JSON',
'Math',
'NaN', 'Number',
'Object',
'parseFloat', 'parseInt',
'RangeError', 'ReferenceError', 'RegExp',
'String', 'SyntaxError',
'TypeError',
'undefined', 'URIError',
'window'
],
property: [
// Function
'constructor', 'length', 'prototype',
// RegExp
'global', 'ignoreCase', 'lastIndex', 'multiline', 'source',
// Window
'applicationCache',
'closed', 'console', 'content', 'crypto',
'defaultStatus', 'document',
'frameElement', 'frames',
'history',
'innerHeight', 'innerWidth',
'length', 'location', 'locationbar', 'localStorage',
'menubar',
'name', 'navigator',
'opener', 'outerHeight', 'outerWidth',
'pageXOffset', 'pageYOffset', 'parent', 'personalbar',
'screen', 'screenX', 'screenY', 'scrollbars', 'scrollX', 'scrollY',
'self', 'sessionStorage', 'status', 'statusbar',
'toolbar', 'top'
]
};
return function(source) { return function(source) {
var char,
source = source.replace(/\r\n/g, '\n').replace(/\r/g, '\n'); column = 0,
cursor = 0,
var cursor = 0, delimiter,
tokenize = { length = source.length,
comment: function() { line = 0,
while (char = source[++cursor]) { lines,
if (next == '/' && char == '\n') { next,
break;
} else if (next == '*' && char + source[cursor + 1] == '*/') {
cursor += 2;
break;
}
}
},
identifier: function() {
var str;
while ((identifier + number).indexOf(source[++cursor]) > -1) {}
str = source.slice(start, cursor);
Ox.forEach(word, function(value, key) {
if (value.indexOf(str) > -1) {
type = key;
Ox.Break();
}
});
},
linebreak: function() {
while (linebreak.indexOf(source[++cursor]) > -1) {}
},
number: function() {
while ((number + '.').indexOf(source[++cursor]) > -1) {}
},
operator: function() {
while (operator.indexOf(char += source[++cursor]) > -1) {}
},
regexp: function() {
while ((char = source[++cursor]) != '/') {
char == '\\' && ++cursor;
if (cursor == source.length) {
break;
}
}
while (regexp.indexOf(source[++cursor]) > -1) {}
},
string: function() {
var delimiter = char;
while ((char = source[++cursor]) != delimiter) {
char == '\\' && ++cursor;
if (cursor == source.length) {
break;
}
}
++cursor;
},
whitespace: function() {
while (whitespace.indexOf(source[++cursor]) > -1) {}
}
},
tokens = [], tokens = [],
type; start,
type,
while (cursor < source.length) { value;
var char = source[cursor], source = source.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
next = source[cursor + 1], while (cursor < length) {
start = cursor; start = cursor;
if (char == '/' && (next == '/' || next == '*')) { char = source[cursor];
if (comment.indexOf(delimiter = char + source[cursor + 1]) > -1) {
type = 'comment'; type = 'comment';
++cursor;
while (char = source[++cursor]) {
if (delimiter == '//' && char == '\n') {
break;
} else if (delimiter == '/*' && char + source[cursor + 1] == '*/') {
cursor += 2;
break;
}
}
} else if (identifier.indexOf(char) > -1) { } else if (identifier.indexOf(char) > -1) {
type = 'identifier'; type = 'identifier';
while ((identifier + number).indexOf(source[++cursor]) > -1) {}
} else if (linebreak.indexOf(char) > -1) { } else if (linebreak.indexOf(char) > -1) {
type = 'linebreak'; type = 'linebreak';
while (linebreak.indexOf(source[++cursor]) > -1) {}
} else if (number.indexOf(char) > -1) { } else if (number.indexOf(char) > -1) {
type = 'number'; type = 'number';
} else if (string.indexOf(char) > -1) { while ((number + '.').indexOf(source[++cursor]) > -1) {}
type = 'string'; } else if (char == '/' && isRegExp(tokens)) {
} else if (whitespace.indexOf(char) > -1) { type = 'regexp';
type = 'whitespace'; while ((char = source[++cursor]) != '/' && cursor < length) {
} else if (char == '/') { char == '\\' && ++cursor;
type = isRegExp() ? 'regexp' : 'operator'; }
while (regexp.indexOf(source[++cursor]) > -1) {}
} else if (operator.indexOf(char) > -1) { } else if (operator.indexOf(char) > -1) {
type = 'operator'; type = 'operator';
} while (operator.indexOf(char += source[++cursor]) > -1 && cursor < length) {}
tokenize[type](); } else if (string.indexOf(delimiter = char) > -1) {
tokens.push({ type = 'string';
length: cursor - start, while ((char = source[++cursor]) != delimiter && cursor < length) {
offset: start, char == '\\' && ++cursor;
type: type }
}); ++cursor;
} } else if (whitespace.indexOf(char) > -1) {
type = 'whitespace';
function isRegExp() { while (whitespace.indexOf(source[++cursor]) > -1) {}
// checks if a forward slash is the beginning of a regexp,
// as opposed to the beginning of an operator
// see http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html#regular-expressions
var index = tokens.length,
isRegExp = false,
offset = 0,
prevToken,
prevString;
// scan back to the previous significant token,
// or the beginning of the source
while (
tokens[--index] !== void 0 && [
'comment', 'linebreak', 'whitespace'
].indexOf(tokens[index].type) > -1
) {
offset += tokens[index].length;
}
if (typeof tokens[index] == 'undefined') {
// source begins with forward slash
isRegExp = true;
} else { } else {
prevToken = tokens[index]; break;
prevString = source.substr( }
cursor - prevToken.length - offset, prevToken.length value = source.slice(start, cursor);
); tokens.push({column: column, line: line, type: type, value: value});
isRegExp = ( if (type == 'comment') {
prevToken.type == 'keyword' lines = value.split('\n');
&& ['false', 'null', 'true'].indexOf(prevString) == -1 column = lines[lines.length - 1].length;
) || ( line += lines.length - 1;
prevToken.type == 'operator' } else if (type == 'linebreak') {
&& ['++', '--', ')', ']', '}'].indexOf(prevString) == -1 column = 0;
); line += value.length;
} else {
column += value.length;
} }
return isRegExp;
} }
return tokens; return tokens;
}; };
}()); }());