improve Ox.tokenize, add Ox.identify

This commit is contained in:
rolux 2012-05-26 12:54:52 +02:00
parent 08aedd34fd
commit 5a2ecca0f1

View file

@ -134,10 +134,9 @@ Ox.doc = (function() {
tokens = [];
Ox.tokenize(source).forEach(function(token) {
var match;
token.source = source.substr(token.offset, token.length);
if (token.type == 'comment' && (
match = re.multiline.exec(token.source)
|| re.singleline.exec(token.source)
match = re.multiline.exec(token.value)
|| re.singleline.exec(token.value)
)) {
blocks.push(match[1]);
tokens.push([]);
@ -162,7 +161,9 @@ Ox.doc = (function() {
if (/^[A-Z]/.test(item.name)) {
// main item
// include leading whitespace
item.source = parseTokens(tokens[i]);
item.source = parseTokens(tokens[i]).map(function(token) {
return token.value;
}).join('');
item.line = source.slice(0, item.source[0].offset)
.split('\n').length;
items.push(item);
@ -303,6 +304,164 @@ Ox.doc = (function() {
}
}());
/*@
Ox.identify <f> Returns the type of a JavaScript identifier
(str) -> <s> Type
Type can be <code>constant</code>, <code>identifier</code>,
<code>keyword</code>, <code>method</code>, <code>object</code> or
<code>property</code>
@*/
Ox.identify = (function() {
    // Classifies a JavaScript identifier as 'keyword', 'constant', 'method',
    // 'object' or 'property', or as a plain 'identifier' if it is none of
    // these. Keywords are taken from Ox.KEYWORDS at call time; the other
    // categories come from the word lists below.
    // see https://developer.mozilla.org/en/JavaScript/Reference
    var identifiers = {
            constant: [
                // Math
                'E', 'LN2', 'LN10', 'LOG2E', 'LOG10E', 'PI', 'SQRT1_2', 'SQRT2',
                // Number
                'MAX_VALUE', 'MIN_VALUE', 'NEGATIVE_INFINITY', 'POSITIVE_INFINITY'
            ],
            method: [
                // Array
                'concat',
                'every',
                'filter', 'forEach',
                'join',
                'lastIndexOf',
                'indexOf', 'isArray',
                'map',
                'pop', 'push',
                'reduce', 'reduceRight', 'reverse',
                'shift', 'slice', 'some', 'sort', 'splice',
                'unshift',
                // Date
                'getDate', 'getDay', 'getFullYear', 'getHours',
                'getMilliseconds', 'getMinutes', 'getMonth', 'getSeconds',
                'getTime', 'getTimezoneOffset',
                'getUTCDate', 'getUTCDay', 'getUTCFullYear', 'getUTCHours',
                'getUTCMilliseconds', 'getUTCMinutes', 'getUTCMonth', 'getUTCSeconds',
                'now',
                'parse',
                'setDate', 'setFullYear', 'setHours', 'setMilliseconds',
                'setMinutes', 'setMonth', 'setSeconds', 'setTime',
                'setUTCDate', 'setUTCFullYear', 'setUTCHours', 'setUTCMilliseconds',
                'setUTCMinutes', 'setUTCMonth', 'setUTCSeconds',
                'toDateString', 'toJSON', 'toLocaleDateString', 'toLocaleString',
                'toLocaleTimeString', 'toTimeString', 'toUTCString',
                'UTC',
                // Function
                'apply', 'bind', 'call', 'isGenerator',
                // JSON
                'parse', 'stringify',
                // Math
                'abs', 'acos', 'asin', 'atan', 'atan2',
                'ceil', 'cos',
                'exp',
                'floor',
                'log',
                'max', 'min',
                'pow',
                'random', 'round',
                'sin', 'sqrt',
                'tan',
                // Number
                'toExponential', 'toFixed', 'toLocaleString', 'toPrecision',
                // Object
                'create',
                'defineProperty', 'defineProperties',
                'freeze',
                'getOwnPropertyDescriptor', 'getOwnPropertyNames', 'getPrototypeOf',
                'hasOwnProperty',
                'isExtensible', 'isFrozen', 'isPrototypeOf', 'isSealed',
                'keys',
                'preventExtensions', 'propertyIsEnumerable',
                'seal',
                'toLocaleString', 'toString',
                'valueOf',
                // RegExp
                'exec', 'test',
                // String
                'charAt', 'charCodeAt', 'concat',
                'fromCharCode',
                'indexOf',
                'lastIndexOf', 'localeCompare',
                'match',
                'replace',
                'search', 'slice', 'split', 'substr', 'substring',
                'toLocaleLowerCase', 'toLocaleUpperCase',
                'toLowerCase', 'toUpperCase', 'trim',
                // Window
                'addEventListener', 'alert', 'atob',
                'blur', 'btoa',
                'clearInterval', 'clearTimeout', 'close', 'confirm',
                'dispatchEvent',
                'escape',
                'find', 'focus',
                'getComputedStyle', 'getSelection',
                'moveBy', 'moveTo',
                'open',
                'postMessage', 'print', 'prompt',
                'removeEventListener', 'resizeBy', 'resizeTo',
                'scroll', 'scrollBy', 'scrollTo',
                'setCursor', 'setInterval', 'setTimeout', 'stop',
                'unescape'
            ],
            object: [
                'Array',
                'Boolean',
                'Date', 'decodeURI', 'decodeURIComponent',
                'encodeURI', 'encodeURIComponent', 'Error', 'eval', 'EvalError',
                'Function',
                'Infinity', 'isFinite', 'isNaN',
                'JSON',
                'Math',
                'NaN', 'Number',
                'Object',
                'parseFloat', 'parseInt',
                'RangeError', 'ReferenceError', 'RegExp',
                'String', 'SyntaxError',
                'TypeError',
                'undefined', 'URIError',
                'window'
            ],
            property: [
                // Function
                'constructor', 'length', 'prototype',
                // RegExp
                'global', 'ignoreCase', 'lastIndex', 'multiline', 'source',
                // Window
                'applicationCache',
                'closed', 'console', 'content', 'crypto',
                'defaultStatus', 'document',
                'frameElement', 'frames',
                'history',
                'innerHeight', 'innerWidth',
                'length', 'location', 'locationbar', 'localStorage',
                'menubar',
                'name', 'navigator',
                'opener', 'outerHeight', 'outerWidth',
                'pageXOffset', 'pageYOffset', 'parent', 'personalbar',
                'screen', 'screenX', 'screenY', 'scrollbars', 'scrollX', 'scrollY',
                'self', 'sessionStorage', 'status', 'statusbar',
                'toolbar', 'top'
            ]
        },
        // word -> type, built once. A prototype-less object is used so that
        // words like 'constructor' or 'toString' cannot collide with
        // inherited Object.prototype members.
        types = Object.create(null);
    Object.keys(identifiers).forEach(function(type) {
        identifiers[type].forEach(function(word) {
            // Several words are listed more than once (e.g. 'length',
            // 'parse'); the first list that mentions a word wins, which
            // matches scanning the lists in declaration order.
            if (!(word in types)) {
                types[word] = type;
            }
        });
    });
    // (identifier) -> 'keyword', 'constant', 'method', 'object', 'property'
    // or 'identifier'
    return function(identifier) {
        if (Ox.KEYWORDS.indexOf(identifier) > -1) {
            return 'keyword';
        }
        // Only primitive strings can match a listed word; the typeof guard
        // keeps non-string arguments from being coerced into table keys.
        return (
            typeof identifier == 'string' && types[identifier]
        ) || 'identifier';
    };
})();
/*@
Ox.minify <f> Minifies JavaScript
(source) -> <s> Minified JavaScript
@ -327,36 +486,18 @@ Ox.minify = function() {
arguments[1](minify(source));
});
}
function isCommentOrLinebreakOrWhitespace(token) {
return token.type == 'comment' || isLinebreakOrWhitespace(token);
}
function isIdentifierOrNumber(token) {
return Ox.contains([
'constant', 'identifier', 'keyword',
'number', 'method', 'object', 'property'
], token.type);
}
function isIdentifierOrNumberOrString(token) {
return isIdentifierOrNumber(token) || token.type == 'string';
}
function isLinebreakOrWhitespace(token) {
return Ox.contains(['linebreak', 'whitespace'], token.type);
}
function minify(source) {
var tokens = Ox.tokenize(source),
length = tokens.length,
ret = '';
function getValue(token) {
return source.substr(token.offset, token.length);
}
tokens.forEach(function(token, i) {
var next, nextToken, previousToken;
if (isLinebreakOrWhitespace(token)) {
previousToken = i == 0 ? null : tokens[i - 1];
var next, nextToken, prevToken;
if (['linebreak', 'whitespace'].indexOf(token.type) > -1) {
prevToken = i == 0 ? null : tokens[i - 1];
next = i + 1;
while (
next < length
&& isCommentOrLinebreakOrWhitespace(tokens[next])
next < length && ['comment', 'linebreak', 'whitespace']
.indexOf(tokens[next].type) > -1
) {
next++;
}
@ -367,16 +508,12 @@ Ox.minify = function() {
// or numbers or strings or unary operators or grouping
// operators with a single newline, otherwise remove it
if (
previousToken && nextToken && (
isIdentifierOrNumberOrString(previousToken)
|| Ox.contains([
'++', '--', ')', ']', '}'
], getValue(previousToken))
prevToken && nextToken && (
['identifier', 'number', 'string'].indexOf(prevToken.type) > -1
|| ['++', '--', ')', ']', '}'].indexOf(prevToken.value) > -1
) && (
isIdentifierOrNumberOrString(nextToken)
|| Ox.contains([
'+', '-', '++', '--', '~', '!', '(', '[', '{'
], getValue(nextToken))
['identifier', 'number', 'string'].indexOf(nextToken.type) > -1
|| ['+', '-', '++', '--', '~', '!', '(', '[', '{'].indexOf(nextToken.value) > -1
)
) {
ret += '\n';
@ -387,23 +524,19 @@ Ox.minify = function() {
// that begins with "+" or "-", with a single space, otherwise
// remove it
if (
previousToken && nextToken && ((
isIdentifierOrNumber(previousToken)
&& isIdentifierOrNumber(nextToken)
prevToken && nextToken && ((
['identifier', 'number'].indexOf(prevToken.type) > -1
&& ['identifier', 'number'].indexOf(nextToken.type) > -1
) || (
Ox.contains([
'+', '-', '++', '--'
], getValue(previousToken))
&& Ox.contains([
'+', '-', '++', '--'
], getValue(nextToken))
['+', '-', '++', '--'].indexOf(prevToken.value) > -1
&& ['+', '-', '++', '--'].indexOf(nextToken.value) > -1
))
) {
ret += ' ';
}
} else if (token.type != 'comment') {
// remove comments and leave all other tokens untouched
ret += getValue(token);
ret += token.value;
}
});
return ret;
@ -443,28 +576,23 @@ Ox.test = function(file, callback) {
/*@
Ox.tokenize <f> Tokenizes JavaScript
(source) -> <[o]> Array of tokens
length <n> Length of the token
offset <n> Offset of the token
column <n> Column of the token
line <n> Line of the token
type <s> Type of the token
Type can be <code>"comment"</code>, <code>"constant"</code>,
<code>"identifier"</code>, <code>"keyword"</code>,
<code>"linebreak"</code>, <code>"method"</code>,
<code>"number"</code>, <code>"object"</code>,
<code>"operator"</code>, <code>"property"</code>,
<code>"regexp"</code>, <code>"string"</code>
or <code>"whitespace"</code>
Type can be <code>"comment"</code>, <code>"identifier"</code>,
<code>"linebreak"</code>, <code>"number"</code>,
<code>"operator"</code>, <code>"regexp"</code>,
<code>"string"</code> or <code>"whitespace"</code>
value <s> Value of the token
source <s> JavaScript source code
@*/
// FIXME: constant/method/object/property is of interest
// for syntax highlighting, but may not belong here
// FIXME: backport python version
// FIXME: numbers (hex, exp, etc.)
Ox.tokenize = (function() {
// see https://github.com/mozilla/narcissus/blob/master/lib/jslex.js
// and https://developer.mozilla.org/en/JavaScript/Reference
// see https://github.com/mozilla/narcissus/blob/master/lib/lexer.js
var identifier = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ$_',
var comment = ['//', '/*'],
identifier = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ$_',
linebreak = '\n\r',
number = '0123456789',
operator = [
@ -488,286 +616,108 @@ Ox.tokenize = (function() {
],
regexp = 'abcdefghijklmnopqrstuvwxyz',
string = '\'"',
whitespace = ' \t',
word = {
constant: [
// Math
'E', 'LN2', 'LN10', 'LOG2E', 'LOG10E', 'PI', 'SQRT1_2', 'SQRT2',
// Number
'MAX_VALUE', 'MIN_VALUE', 'NEGATIVE_INFINITY', 'POSITIVE_INFINITY'
],
keyword: [
'break',
'case', 'catch', 'class', 'const', 'continue',
'debugger', 'default', 'delete', 'do',
'else', 'enum', 'export', 'extends',
'false', 'finally', 'for', 'function',
'if', 'implements', 'import', 'in', 'instanceof', 'interface',
'let', 'module',
'new', 'null',
'package', 'private', 'protected', 'public',
'return',
'super', 'switch', 'static',
'this', 'throw', 'true', 'try', 'typeof',
'var', 'void',
'yield',
'while', 'with',
],
method: [
// Array
'concat',
'every',
'filter', 'forEach',
'join',
'lastIndexOf',
'indexOf', 'isArray',
'map',
'pop', 'push',
'reduce', 'reduceRight', 'reverse',
'shift', 'slice', 'some', 'sort', 'splice',
'unshift',
// Date
'getDate', 'getDay', 'getFullYear', 'getHours',
'getMilliseconds', 'getMinutes', 'getMonth', 'getSeconds',
'getTime', 'getTimezoneOffset',
'getUTCDate', 'getUTCDay', 'getUTCFullYear', 'getUTCHours',
'getUTCMilliseconds', 'getUTCMinutes', 'getUTCMonth', 'getUTCSeconds',
'now',
'parse',
'setDate', 'setFullYear', 'setHours', 'setMilliseconds',
'setMinutes', 'setMonth', 'setSeconds', 'setTime',
'setUTCDate', 'setUTCFullYear', 'setUTCHours', 'setUTCMilliseconds',
'setUTCMinutes', 'setUTCMonth', 'setUTCSeconds',
'toDateString', 'toJSON', 'toLocaleDateString', 'toLocaleString',
'toLocaleTimeString', 'toTimeString', 'toUTCString',
'UTC',
// Function
'apply', 'bind', 'call', 'isGenerator',
// JSON
'parse', 'stringify',
// Math
'abs', 'acos', 'asin', 'atan', 'atan2',
'ceil', 'cos',
'exp',
'floor',
'log',
'max', 'min',
'pow',
'random', 'round',
'sin', 'sqrt',
'tan',
// Number
'toExponential', 'toFixed', 'toLocaleString', 'toPrecision',
// Object
'create',
'defineProperty', 'defineProperties',
'freeze',
'getOwnPropertyDescriptor', 'getOwnPropertyNames', 'getPrototypeOf',
'hasOwnProperty',
'isExtensible', 'isFrozen', 'isPrototypeOf', 'isSealed',
'keys',
'preventExtensions', 'propertyIsEnumerable',
'seal',
'toLocaleString', 'toString',
'valueOf',
// RegExp
'exec', 'test',
// String
'charAt', 'charCodeAt', 'concat',
'fromCharCode',
'indexOf',
'lastIndexOf', 'localeCompare',
'match',
'replace',
'search', 'slice', 'split', 'substr', 'substring',
'toLocaleLowerCase', 'toLocaleUpperCase', 'toLowerCase', 'toUpperCase', 'trim',
// Window
'addEventListener', 'alert', 'atob',
'blur', 'btoa',
'clearInterval', 'clearTimeout', 'close', 'confirm',
'dispatchEvent',
'escape',
'find', 'focus',
'getComputedStyle', 'getSelection',
'moveBy', 'moveTo',
'open',
'postMessage', 'print', 'prompt',
'removeEventListener', 'resizeBy', 'resizeTo',
'scroll', 'scrollBy', 'scrollTo',
'setCursor', 'setInterval', 'setTimeout', 'stop',
'unescape'
],
object: [
'Array',
'Boolean',
'Date', 'decodeURI', 'decodeURIComponent',
'encodeURI', 'encodeURIComponent', 'Error', 'eval', 'EvalError',
'Function',
'Infinity', 'isFinite', 'isNaN',
'JSON',
'Math',
'NaN', 'Number',
'Object',
'parseFloat', 'parseInt',
'RangeError', 'ReferenceError', 'RegExp',
'String', 'SyntaxError',
'TypeError',
'undefined', 'URIError',
'window'
],
property: [
// Function
'constructor', 'length', 'prototype',
// RegExp
'global', 'ignoreCase', 'lastIndex', 'multiline', 'source',
// Window
'applicationCache',
'closed', 'console', 'content', 'crypto',
'defaultStatus', 'document',
'frameElement', 'frames',
'history',
'innerHeight', 'innerWidth',
'length', 'location', 'locationbar', 'localStorage',
'menubar',
'name', 'navigator',
'opener', 'outerHeight', 'outerWidth',
'pageXOffset', 'pageYOffset', 'parent', 'personalbar',
'screen', 'screenX', 'screenY', 'scrollbars', 'scrollX', 'scrollY',
'self', 'sessionStorage', 'status', 'statusbar',
'toolbar', 'top'
]
};
whitespace = ' \t';
function isRegExp(tokens) {
    // Decides whether a forward slash that follows the given tokens starts a
    // RegExp literal (true) or is an operator (false).
    // tokens: the tokens collected so far, each with `type` and `value`.
    var insignificant = ['comment', 'linebreak', 'whitespace'],
        index = tokens.length,
        previous = null;
    // Walk backwards to the closest token that is neither a comment, a
    // linebreak nor whitespace
    while (index-- > 0) {
        if (insignificant.indexOf(tokens[index].type) == -1) {
            previous = tokens[index];
            break;
        }
    }
    if (previous === null) {
        // Nothing significant precedes the slash, i.e. the source begins
        // with it
        return true;
    }
    if (previous.type == 'identifier') {
        // After a keyword a RegExp may follow, unless the keyword is one of
        // the literals false, null or true
        return Ox.identify(previous.value) == 'keyword'
            && ['false', 'null', 'true'].indexOf(previous.value) == -1;
    }
    if (previous.type == 'operator') {
        // After an operator a RegExp may follow, unless the operator is a
        // postfix operator or a closing bracket
        return ['++', '--', ')', ']', '}'].indexOf(previous.value) == -1;
    }
    // Any other token type (number, string, regexp, ...) is followed by an
    // operator
    return false;
}
return function(source) {
source = source.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
var cursor = 0,
tokenize = {
comment: function() {
while (char = source[++cursor]) {
if (next == '/' && char == '\n') {
break;
} else if (next == '*' && char + source[cursor + 1] == '*/') {
cursor += 2;
break;
}
}
},
identifier: function() {
var str;
while ((identifier + number).indexOf(source[++cursor]) > -1) {}
str = source.slice(start, cursor);
Ox.forEach(word, function(value, key) {
if (value.indexOf(str) > -1) {
type = key;
Ox.Break();
}
});
},
linebreak: function() {
while (linebreak.indexOf(source[++cursor]) > -1) {}
},
number: function() {
while ((number + '.').indexOf(source[++cursor]) > -1) {}
},
operator: function() {
while (operator.indexOf(char += source[++cursor]) > -1) {}
},
regexp: function() {
while ((char = source[++cursor]) != '/') {
char == '\\' && ++cursor;
if (cursor == source.length) {
break;
}
}
while (regexp.indexOf(source[++cursor]) > -1) {}
},
string: function() {
var delimiter = char;
while ((char = source[++cursor]) != delimiter) {
char == '\\' && ++cursor;
if (cursor == source.length) {
break;
}
}
++cursor;
},
whitespace: function() {
while (whitespace.indexOf(source[++cursor]) > -1) {}
}
},
var char,
column = 0,
cursor = 0,
delimiter,
length = source.length,
line = 0,
lines,
next,
tokens = [],
type;
while (cursor < source.length) {
var char = source[cursor],
next = source[cursor + 1],
start = cursor;
if (char == '/' && (next == '/' || next == '*')) {
start,
type,
value;
source = source.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
while (cursor < length) {
start = cursor;
char = source[cursor];
if (comment.indexOf(delimiter = char + source[cursor + 1]) > -1) {
type = 'comment';
++cursor;
while (char = source[++cursor]) {
if (delimiter == '//' && char == '\n') {
break;
} else if (delimiter == '/*' && char + source[cursor + 1] == '*/') {
cursor += 2;
break;
}
}
} else if (identifier.indexOf(char) > -1) {
type = 'identifier';
while ((identifier + number).indexOf(source[++cursor]) > -1) {}
} else if (linebreak.indexOf(char) > -1) {
type = 'linebreak';
while (linebreak.indexOf(source[++cursor]) > -1) {}
} else if (number.indexOf(char) > -1) {
type = 'number';
} else if (string.indexOf(char) > -1) {
type = 'string';
} else if (whitespace.indexOf(char) > -1) {
type = 'whitespace';
} else if (char == '/') {
type = isRegExp() ? 'regexp' : 'operator';
while ((number + '.').indexOf(source[++cursor]) > -1) {}
} else if (char == '/' && isRegExp(tokens)) {
type = 'regexp';
while ((char = source[++cursor]) != '/' && cursor < length) {
char == '\\' && ++cursor;
}
while (regexp.indexOf(source[++cursor]) > -1) {}
} else if (operator.indexOf(char) > -1) {
type = 'operator';
}
tokenize[type]();
tokens.push({
length: cursor - start,
offset: start,
type: type
});
}
function isRegExp() {
// checks if a forward slash is the beginning of a regexp,
// as opposed to the beginning of an operator
// see http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html#regular-expressions
var index = tokens.length,
isRegExp = false,
offset = 0,
prevToken,
prevString;
// scan back to the previous significant token,
// or the beginning of the source
while (
tokens[--index] !== void 0 && [
'comment', 'linebreak', 'whitespace'
].indexOf(tokens[index].type) > -1
) {
offset += tokens[index].length;
}
if (typeof tokens[index] == 'undefined') {
// source begins with forward slash
isRegExp = true;
while (operator.indexOf(char += source[++cursor]) > -1 && cursor < length) {}
} else if (string.indexOf(delimiter = char) > -1) {
type = 'string';
while ((char = source[++cursor]) != delimiter && cursor < length) {
char == '\\' && ++cursor;
}
++cursor;
} else if (whitespace.indexOf(char) > -1) {
type = 'whitespace';
while (whitespace.indexOf(source[++cursor]) > -1) {}
} else {
prevToken = tokens[index];
prevString = source.substr(
cursor - prevToken.length - offset, prevToken.length
);
isRegExp = (
prevToken.type == 'keyword'
&& ['false', 'null', 'true'].indexOf(prevString) == -1
) || (
prevToken.type == 'operator'
&& ['++', '--', ')', ']', '}'].indexOf(prevString) == -1
);
break;
}
value = source.slice(start, cursor);
tokens.push({column: column, line: line, type: type, value: value});
if (type == 'comment') {
lines = value.split('\n');
column = lines[lines.length - 1].length;
line += lines.length - 1;
} else if (type == 'linebreak') {
column = 0;
line += value.length;
} else {
column += value.length;
}
return isRegExp;
}
return tokens;
};
}());