add js module (tokenize, minify)

rolux 2011-10-06 22:05:01 +02:00
parent 0050c403ab
commit 409d5da90d
2 changed files with 157 additions and 11 deletions

ox/__init__.py
@@ -1,17 +1,18 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
-# GPL 2008
-__version__ = '2.0.0'
+# GPL 2011
+__version__ = '2.0.1'
 
-from file import *
-from format import *
-from html import *
-from iso import *
-from text import *
-from form import *
-
 import cache
 import net
 
+from file import *
+from form import *
+from format import *
+from html import *
+from image import *
+from js import *
+from location import *
+from normalize import *
+from text import *
 from torrent import *
-import location
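Since the updated __init__.py star-imports the new module, minify and tokenize should also be reachable at the package level; a minimal sketch, assuming the package is importable as ox under Python 2 (the implicit relative imports above require it):

    import ox

    print(ox.minify("var x = 1;  /* answer */"))
    # var x=1;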

ox/js.py (new file, 145 additions)
@@ -0,0 +1,145 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4

def minify(source, comment=''):
    # see https://github.com/douglascrockford/JSMin/blob/master/README
    tokens = tokenize(source)
    length = len(tokens)
    minified = '/*' + comment + '*/' if comment else ''
    for i, token in enumerate(tokens):
        if token['type'] in ['linebreak', 'whitespace']:
            prevToken = None if i == 0 else tokens[i - 1]
            # skip ahead to the next significant token
            next = i + 1
            while next < length and tokens[next]['type'] in ['comment', 'linebreak', 'whitespace']:
                next += 1
            nextToken = None if next == length else tokens[next]
            if token['type'] == 'linebreak':
                # replace a linebreak between two tokens that are identifiers or
                # numbers or strings or unary operators or grouping operators
                # with a single newline, otherwise remove it
                if prevToken and nextToken\
                        and (prevToken['type'] in ['identifier', 'number', 'string']\
                        or prevToken['value'] in ['++', '--', ')', ']', '}'])\
                        and (nextToken['type'] in ['identifier', 'number', 'string']\
                        or nextToken['value'] in ['+', '-', '++', '--', '~', '!', '(', '[', '{']):
                    minified += '\n'
            elif token['type'] == 'whitespace':
                # replace whitespace between two tokens that are identifiers or
                # numbers, or between a token that ends with "+" or "-" and one that
                # begins with "+" or "-", with a single space, otherwise remove it
                if prevToken and nextToken\
                        and ((prevToken['type'] in ['identifier', 'number']\
                        and nextToken['type'] in ['identifier', 'number'])
                        or (prevToken['value'] in ['+', '-', '++', '--']
                        and nextToken['value'] in ['+', '-', '++', '--'])):
                    minified += ' '
        elif token['type'] != 'comment':
            # remove comments and leave all other tokens untouched
            minified += token['value']
    return minified

def tokenize(source):
    # see https://github.com/mozilla/narcissus/blob/master/lib/jslex.js
    IDENTIFIER = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ$_'
    KEYWORD = [
        # keywords after which a forward slash starts a regexp rather than
        # a division; value-like keywords ('false', 'null', 'true') are
        # deliberately absent
        'case', 'delete', 'do', 'else', 'in', 'instanceof', 'new', 'return',
        'throw', 'typeof', 'void'
    ]
    LINEBREAK = '\n\r'
    NUMBER = '0123456789'
    OPERATOR = [
        # arithmetic
        '+', '-', '*', '/', '%', '++', '--',
        # assignment
        '=', '+=', '-=', '*=', '/=', '%=',
        '&=', '|=', '^=', '<<=', '>>=', '>>>=',
        # bitwise
        '&', '|', '^', '~', '<<', '>>', '>>>',
        # comparison
        '==', '!=', '===', '!==', '>', '>=', '<', '<=',
        # conditional
        '?', ':',
        # grouping
        '(', ')', '[', ']', '{', '}',
        # logical
        '&&', '||', '!',
        # other
        '.', ',', ';'
    ]
    REGEXP = 'abcdefghijklmnopqrstuvwxyz'  # regexp flags, like the 'gi' in /foo/gi
    STRING = '\'"'
    WHITESPACE = ' \t'
    def is_regexp():
        # checks if a forward slash is the beginning of a regexp,
        # as opposed to the beginning of an operator
        i = len(tokens) - 1
        # scan back to the previous significant token,
        # or to the beginning of the source
        while i >= 0 and tokens[i]['type'] in ['comment', 'linebreak', 'whitespace']:
            i -= 1
        if i == -1:
            # source begins with forward slash
            is_regexp = True
        else:
            token = tokens[i]
            # a regexp can follow a keyword or a non-postfix, non-closing
            # operator, but not a value; keywords are tokenized as
            # identifiers, so they are matched against KEYWORD by value
            is_regexp = (
                token['type'] == 'identifier' and token['value'] in KEYWORD
            ) or (
                token['type'] == 'operator' and not token['value'] in ['++', '--', ')', ']', '}']
            )
        return is_regexp
    cursor = 0
    length = len(source)
    tokens = []
    while cursor < length:
        char = source[cursor]
        start = cursor
        cursor += 1
        if char == '/' and cursor < length and source[cursor] in '/*':
            # a line comment ends at the next linebreak,
            # a block comment at the closing '*/'
            type = 'comment'
            cursor += 1
            while cursor < length:
                if source[start + 1] == '/' and source[cursor] == '\n':
                    break
                elif source[start + 1] == '*' and source[cursor:cursor + 2] == '*/':
                    cursor += 2
                    break
                cursor += 1
        elif char in IDENTIFIER:
            # note that keywords are tokenized as identifiers
            type = 'identifier'
            while cursor < length and source[cursor] in IDENTIFIER + NUMBER:
                cursor += 1
        elif char in LINEBREAK:
            type = 'linebreak'
            while cursor < length and source[cursor] in LINEBREAK:
                cursor += 1
        elif char in NUMBER:
            type = 'number'
            while cursor < length and source[cursor] in NUMBER + '.':
                cursor += 1
        elif char == '/' and is_regexp():
            type = 'regexp'
            # scan to the closing slash, skipping escaped characters,
            # then consume any trailing flags
            while cursor < length and source[cursor] != '/':
                cursor += (2 if source[cursor] == '\\' else 1)
            cursor += 1
            while cursor < length and source[cursor] in REGEXP:
                cursor += 1
        elif char in OPERATOR:
            type = 'operator'
            # greedily match the longest possible operator
            while cursor < length and source[start:cursor + 1] in OPERATOR:
                cursor += 1
        elif char in STRING:
            type = 'string'
            # scan to the matching quote, skipping escaped characters
            while cursor < length and source[cursor] != source[start]:
                cursor += (2 if source[cursor] == '\\' else 1)
            cursor += 1
        elif char in WHITESPACE:
            type = 'whitespace'
            while cursor < length and source[cursor] in WHITESPACE:
                cursor += 1
        else:
            # any other character becomes a single-character token
            type = 'unknown'
        tokens.append({
            'type': type,
            'value': source[start:cursor]
        })
    return tokens
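
Not part of the commit, just a quick usage sketch of the new module, assuming it is importable as ox.js; the expected output in the comments is traced by hand from the rules above rather than copied from a test run:

    from ox.js import minify, tokenize

    source = "// strip me\nvar foo = 'bar';\nvar baz = foo\n    + 1;\n"

    for token in tokenize(source):
        print('%-10s %r' % (token['type'], token['value']))
    # comment    '// strip me'
    # linebreak  '\n'
    # identifier 'var'
    # whitespace ' '
    # identifier 'foo'
    # ...

    print(minify(source, comment='ox'))
    # /*ox*/var foo='bar';var baz=foo
    # +1;

The linebreak before '+ 1' survives minification because, with automatic semicolon insertion, removing it could change the meaning of the program; the line comment and the whitespace around operators are simply dropped.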