python-ox/ox/js.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4

from ox.utils import json

def minify(source, comment=''):
    # see https://github.com/douglascrockford/JSMin/blob/master/README
    def get_next_non_whitespace_token():
        pass
    tokens = tokenize(source)
    length = len(tokens)
    minified = '/*' + comment + '*/' if comment else ''
    for i, token in enumerate(tokens):
        if token['type'] in ['linebreak', 'whitespace']:
            prevToken = None if i == 0 else tokens[i - 1]
            next = i + 1
            while next < length and tokens[next]['type'] in ['comment', 'linebreak', 'whitespace']:
                next += 1
            nextToken = None if next == length else tokens[next]
        if token['type'] == 'linebreak':
            # replace a linebreak between two tokens that are identifiers or
            # numbers or strings or unary operators or grouping operators
            # with a single newline, otherwise remove it
            if prevToken and nextToken\
                    and (prevToken['type'] in ['identifier', 'number', 'string']\
                        or prevToken['value'] in ['++', '--', ')', ']', '}'])\
                    and (nextToken['type'] in ['identifier', 'number', 'string']\
                        or nextToken['value'] in ['+', '-', '++', '--', '~', '!', '(', '[', '{']):
                minified += '\n'
        elif token['type'] == 'whitespace':
            # replace whitespace between two tokens that are identifiers or
            # numbers, or between a token that ends with "+" or "-" and one that
            # begins with "+" or "-", with a single space, otherwise remove it
            if prevToken and nextToken\
                    and ((prevToken['type'] in ['identifier', 'number']\
                        and nextToken['type'] in ['identifier', 'number'])
                    or (prevToken['value'] in ['+', '-', '++', '--']
                        and nextToken['value'] in ['+', '-', '++', '--'])):
                minified += ' '
        elif token['type'] != 'comment':
            # remove comments and leave all other tokens untouched
            minified += token['value']
    return minified

def parse_JSONC(source):
    return json.loads(minify(source))

def tokenize(source):
    # see https://github.com/mozilla/narcissus/blob/master/lib/jslex.js
    IDENTIFIER = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ$_'
    KEYWORD = [
        'break',
        'case', 'catch', 'class', 'const', 'continue',
        'debugger', 'default', 'delete', 'do',
        'else', 'enum', 'export', 'extends',
        'finally', 'for', 'function',
        'if', 'implements', 'import', 'in', 'instanceof', 'interface',
        'let', 'module',
        'new',
        'package', 'private', 'protected', 'public',
        'return',
        'super', 'switch', 'static',
        'this', 'throw', 'try', 'typeof',
        'var', 'void',
        'yield',
        'while', 'with'
    ]
    LINEBREAK = '\n\r'
    NUMBER = '01234567890'
    OPERATOR = [
        # arithmetic
        '+', '-', '*', '/', '%', '++', '--',
        # assignment
        '=', '+=', '-=', '*=', '/=', '%=',
        '&=', '|=', '^=', '<<=', '>>=', '>>>=',
        # bitwise
        '&', '|', '^', '~', '<<', '>>', '>>>',
        # comparison
        '==', '!=', '===', '!==', '>', '>=', '<', '<=',
        # conditional
        '?', ':',
        # grouping
        '(', ')', '[', ']', '{', '}',
        # logical
        '&&', '||', '!',
        # other
        '.', ',', ';'
    ]
    REGEXP = 'abcdefghijklmnopqrstuvwxyz'
    STRING = '\'"'
    WHITESPACE = ' \t'
    def is_regexp():
        # checks if a forward slash is the beginning of a regexp,
        # as opposed to the beginning of an operator
        i = len(tokens) - 1
        # scan back to the previous significant token,
        # or to the beginnig of the source
        while i >= 0 and tokens[i]['type'] in ['comment', 'linebreak', 'whitespace']:
            i -= 1
        if i == -1:
            # source begins with forward slash
            is_regexp = True
        else:
            token = tokens[i]
            is_regexp = (
                token['type'] == 'identifier' and token['value'] in KEYWORD
            ) or (
                token['type'] == 'operator' and not token['value'] in ['++', '--', ')', ']', '}']
            )
        return is_regexp
    cursor = 0
    length = len(source)
    tokens = []
    while cursor < length:
        char = source[cursor]
        start = cursor
        cursor += 1
        if char == '/' and cursor < length - 1 and source[cursor] in '/*':
            type = 'comment'
            cursor += 1
            while cursor < length:
                cursor += 1
                if source[start + 1] == '/' and source[cursor] == '\n':
                    break
                elif source[start + 1] == '*' and source[cursor:cursor + 2] == '*/':
                    cursor += 2
                    break
        elif char in IDENTIFIER:
            type = 'identifier'
            while cursor < length and source[cursor] in IDENTIFIER + NUMBER:
                cursor += 1
        elif char in LINEBREAK:
            type = 'linebreak'
            while cursor < length and source[cursor] in LINEBREAK:
                cursor += 1
        elif char in NUMBER:
            type = 'number'
            while cursor < length and source[cursor] in NUMBER + '.':
                cursor += 1
        elif char == '/' and is_regexp():
            type = 'regexp'
            while cursor < length and source[cursor] != '/':
                cursor += (2 if source[cursor] == '\\' else 1)
            cursor += 1
            while cursor < length and source[cursor] in REGEXP:
                cursor += 1
        elif char in OPERATOR:
            type = 'operator'
            if cursor < length:
                string = char + source[cursor]
                while cursor < length and string in OPERATOR:
                    cursor += 1
                    string += source[cursor]
        elif char in STRING:
            type = 'string'
            while cursor < length and source[cursor] != source[start]:
                cursor += (2 if source[cursor] == '\\' else 1)
            cursor += 1
        elif char in WHITESPACE:
            type = 'whitespace'
            while cursor < length and source[cursor] in WHITESPACE:
                cursor += 1
        tokens.append({
            'type': type,
            'value': source[start:cursor]
        })
    return tokens