add js module (tokenize, minify)
parent 0050c403ab
commit 409d5da90d

2 changed files with 157 additions and 11 deletions

ox/__init__.py
@@ -1,17 +1,18 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
-# GPL 2008
-__version__ = '2.0.0'
+# GPL 2011
+__version__ = '2.0.1'
 
-from file import *
-from format import *
-from html import *
-from iso import *
-from text import *
-from form import *
 import cache
 import net
 
-from torrent import *
-
-import location
+from file import *
+from form import *
+from format import *
+from html import *
+from image import *
+from js import *
+from location import *
+from normalize import *
+from text import *
+from torrent import *
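
With js re-exported from the package __init__, the new helpers become available at package level. A minimal usage sketch (assuming the ox package is on the Python path):

    import ox

    print ox.minify('var x = 1;  // set x\n')  # prints 'var x=1;'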

ox/js.py (new file)
@@ -0,0 +1,145 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4

def minify(source, comment=''):
    # see https://github.com/douglascrockford/JSMin/blob/master/README
    tokens = tokenize(source)
    length = len(tokens)
    minified = '/*' + comment + '*/' if comment else ''
    for i, token in enumerate(tokens):
        if token['type'] in ['linebreak', 'whitespace']:
            prevToken = None if i == 0 else tokens[i - 1]
            # skip ahead to the next significant token
            j = i + 1
            while j < length and tokens[j]['type'] in ['comment', 'linebreak', 'whitespace']:
                j += 1
            nextToken = None if j == length else tokens[j]
            if token['type'] == 'linebreak':
                # replace a linebreak between two tokens that are identifiers or
                # numbers or strings or unary operators or grouping operators
                # with a single newline, otherwise remove it
                # (so the linebreak in 'x\n++y' survives, the one in 'x;\ny' does not)
                if (prevToken and nextToken
                        and (prevToken['type'] in ['identifier', 'number', 'string']
                             or prevToken['value'] in ['++', '--', ')', ']', '}'])
                        and (nextToken['type'] in ['identifier', 'number', 'string']
                             or nextToken['value'] in ['+', '-', '++', '--', '~', '!', '(', '[', '{'])):
                    minified += '\n'
            elif token['type'] == 'whitespace':
                # replace whitespace between two tokens that are identifiers or
                # numbers, or between a token that ends with "+" or "-" and one that
                # begins with "+" or "-", with a single space, otherwise remove it
                # (so 'var x' keeps its space, 'a + +b' keeps one space so that
                # '+ +' cannot merge into '++', and 'a + b' becomes 'a+b')
                if (prevToken and nextToken
                        and ((prevToken['type'] in ['identifier', 'number']
                              and nextToken['type'] in ['identifier', 'number'])
                             or (prevToken['value'] in ['+', '-', '++', '--']
                                 and nextToken['value'] in ['+', '-', '++', '--']))):
                    minified += ' '
        elif token['type'] != 'comment':
            # comments are dropped, all other tokens are kept untouched
            minified += token['value']
    return minified


def tokenize(source):
    # see https://github.com/mozilla/narcissus/blob/master/lib/jslex.js
    IDENTIFIER = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ$_'
    # keywords after which a forward slash starts a regexp, not a division
    KEYWORD = [
        'break', 'case', 'catch', 'continue', 'default', 'delete', 'do',
        'else', 'false', 'finally', 'for', 'function', 'if', 'in',
        'instanceof', 'new', 'null', 'return', 'switch', 'this', 'throw',
        'true', 'try', 'typeof', 'var', 'void', 'while', 'with'
    ]
    LINEBREAK = '\n\r'
    NUMBER = '0123456789'
    OPERATOR = [
        # arithmetic
        '+', '-', '*', '/', '%', '++', '--',
        # assignment
        '=', '+=', '-=', '*=', '/=', '%=',
        '&=', '|=', '^=', '<<=', '>>=', '>>>=',
        # bitwise
        '&', '|', '^', '~', '<<', '>>', '>>>',
        # comparison
        '==', '!=', '===', '!==', '>', '>=', '<', '<=',
        # conditional
        '?', ':',
        # grouping
        '(', ')', '[', ']', '{', '}',
        # logical
        '&&', '||', '!',
        # other
        '.', ',', ';'
    ]
    REGEXP = 'abcdefghijklmnopqrstuvwxyz'  # characters allowed as regexp flags
    STRING = '\'"'
    WHITESPACE = ' \t'
    def is_regexp():
        # checks if a forward slash is the beginning of a regexp,
        # as opposed to the beginning of an operator
        i = len(tokens) - 1
        # scan back to the previous significant token,
        # or to the beginning of the source
        while i >= 0 and tokens[i]['type'] in ['comment', 'linebreak', 'whitespace']:
            i -= 1
        if i == -1:
            # source begins with forward slash
            return True
        token = tokens[i]
        # a slash after a keyword (other than the literals 'false', 'null'
        # and 'true'), or after an operator that cannot end an expression,
        # starts a regexp; after an identifier, number, string, '++', '--',
        # ')', ']' or '}' it is a division operator
        return (
            token['type'] == 'identifier' and token['value'] in KEYWORD
            and token['value'] not in ['false', 'null', 'true']
        ) or (
            token['type'] == 'operator' and token['value'] not in ['++', '--', ')', ']', '}']
        )
    cursor = 0
    length = len(source)
    tokens = []
    while cursor < length:
        char = source[cursor]
        start = cursor
        cursor += 1
        if char == '/' and cursor < length and source[cursor] in '/*':
            token_type = 'comment'
            cursor += 1
            while cursor < length:
                if source[start + 1] == '/' and source[cursor] == '\n':
                    # a line comment ends before the next linebreak
                    break
                elif source[start + 1] == '*' and source[cursor:cursor + 2] == '*/':
                    # a block comment ends after the closing '*/'
                    cursor += 2
                    break
                cursor += 1
        elif char in IDENTIFIER:
            token_type = 'identifier'
            while cursor < length and source[cursor] in IDENTIFIER + NUMBER:
                cursor += 1
        elif char in LINEBREAK:
            token_type = 'linebreak'
            while cursor < length and source[cursor] in LINEBREAK:
                cursor += 1
        elif char in NUMBER:
            token_type = 'number'
            while cursor < length and source[cursor] in NUMBER + '.':
                cursor += 1
        elif char == '/' and is_regexp():
            token_type = 'regexp'
            # skip to the closing slash, stepping over escaped characters
            # (note: a '/' inside a character class will end the regexp early)
            while cursor < length and source[cursor] != '/':
                cursor += (2 if source[cursor] == '\\' else 1)
            cursor += 1
            # consume any trailing flags, like the 'g' in /foo/g
            while cursor < length and source[cursor] in REGEXP:
                cursor += 1
        elif char in OPERATOR:
            token_type = 'operator'
            # greedily match the longest operator, so '==' wins over '='
            while cursor < length and source[start:cursor + 1] in OPERATOR:
                cursor += 1
        elif char in STRING:
            token_type = 'string'
            # skip to the closing quote, stepping over escaped characters
            while cursor < length and source[cursor] != source[start]:
                cursor += (2 if source[cursor] == '\\' else 1)
            cursor += 1
        elif char in WHITESPACE:
            token_type = 'whitespace'
            while cursor < length and source[cursor] in WHITESPACE:
                cursor += 1
        else:
            # any other character becomes a single-character token
            token_type = 'unknown'
        tokens.append({
            'type': token_type,
            'value': source[start:cursor]
        })
    return tokens
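
The tokenizer can be exercised on its own. A quick sketch (assuming js.py is importable as js):

    from js import tokenize

    for token in tokenize('var x = 10 / 2;'):
        print token['type'], repr(token['value'])
    # identifier 'var', whitespace ' ', identifier 'x', whitespace ' ',
    # operator '=', whitespace ' ', number '10', whitespace ' ',
    # operator '/', whitespace ' ', number '2', operator ';'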
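
And minify, following the rules above, strips the comments and collapses the whitespace between statements (same assumption as above):

    from js import minify

    source = 'var x = 1;\nvar y = 2;  // set y\n'
    print minify(source)              # prints 'var x=1;var y=2;'
    print minify(source, 'GPL 2011')  # prints '/*GPL 2011*/var x=1;var y=2;'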