replace unicode.py, again

2011-10-29 19:17:14 +02:00 · 2011-10-29 19:17:14 +02:00 · 17b9d0ff73
commit 17b9d0ff73
parent b6c872b5a4
2 changed files with 140 additions and 40 deletions
--- a/source/Ox.UI/themes/classic/_index.html
+++ b/source/Ox.UI/themes/classic/_index.html
@ -40,8 +40,8 @@
                height: 256px;
            }
        </style>
-        <script src="../../../../build/jquery/jquery.js"></script>
-        <script src="../../../Ox.js"></script>
+        <script src="../../../../build/Ox.UI/jquery/jquery.js"></script>
+        <script src="../../../../build/Ox.js"></script>
        <script>
            $(function() {
                var $body = $('body')
--- a/tools/unicode/unicode.py
+++ b/tools/unicode/unicode.py
@ -2,69 +2,169 @@ import json
 import ox
 import re

-ascii = {}
-
-chars = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
+remove = [  # qualifier words stripped from a name before its last word is taken as the ASCII form
+    'ABOVE', 'BAR', 'BELOW', 'CEDILLA', 'DIGRAPH', 'LONGA', 'ROTUNDA'
+]
 special = {
-    'ACCOUNT OF': 'A/O',
+    'ACCOUNT OF': 'a/o',
+    'ADDRESSED TO THE SUBJECT': 'a/s',
+    'AM': 'a.m.',
    'ANGSTROM SIGN': 'A',
-    'CARE OF': 'C/O',
+    'C OVER KG': 'c/kg',
+    'CADA UNA': 'c/u',
+    'CARE OF': 'c/o',
+    'CM CUBED': 'cm3',
+    'CM SQUARED': 'cm2',
+    'CO': 'Co.',
+    'DM CUBED': 'dm3',
+    'DM SQUARED': 'dm2',
    'EIGHT': '8',
    'ELEVEN': '11',
    'EULER CONSTANT': 'E',
+    'FACSIMILE SIGN': 'FAX',
+    'FEMININE ORDINAL INDICATOR': 'a',
    'FIFTY': '50',
-    'FIVE HUNDRED': '500',
    'FIVE': '5',
+    'FIVE HUNDRED': '500',
    'FOUR': '4',
-    'GHZ': 'GHZ',
+    'INFORMATION SOURCE': 'I',
+    'KCAL': 'kcal',
    'KELVIN SIGN': 'K',
-    'KM CUBED': 'KM3',
-    'KM SQUARED': 'KM2',
-    'M CUBED': 'M3',
-    'M SQUARED': 'M2',
-    'MHZ': 'MHZ',
-    'MM CUBED': 'MM3',
-    'MM SQUARED': 'MM2',
-    'MOL': 'MOL',
-    'MPA': 'MPA',
+    'KK': 'K.K.',
+    'KM CAPITAL': 'KM',
+    'KM CUBED': 'km3',
+    'KM SQUARED': 'km2',
+    'LATIN SMALL LETTER N PRECEDED BY APOSTROPHE': '\'n',
+    'LIMITED LIABILITY SIGN': 'LTD',
+    'M CUBED': 'm3',
+    'M OVER S SQUARED': 'm/s2',
+    'M SQUARED': 'm2',
+    'MASCULINE ORDINAL INDICATOR': 'o',
+    'MB SMALL': 'mb',
+    'MM CUBED': 'mm3',
+    'MM SQUARED': 'mm2',
+    'MV MEGA': 'MV',
+    'MW MEGA': 'MW',
    'NINE': '9',
-    'ONE THOUSAND': '1000',
-    'ONE HUNDRED': '100',
+    'NUMERO SIGN': 'No',
    'ONE': '1',
+    'ONE HUNDRED': '100',
+    'ONE THOUSAND': '1000',
+    'PA AMPS': 'pA',
+    'PARTNERSHIP SIGN': 'PTE',
+    'PLANCK CONSTANT': 'h',
+    'PLANCK CONSTANT OVER PI': 'h',
+    'PLANCK CONSTANT OVER TWO PI': 'h',
+    'PM': 'p.m.',
+    'RAD OVER S SQUARED': 'rad/s2',
+    'RUPEE SIGN': 'Rs',
+    'S T': 'st',
+    'SERVICE MARK': 'SM',
    'SEVEN': '7',
    'SIX': '6',
    'TELEPHONE SIGN': 'TEL',
    'TEN': '10',
    'THREE': '3',
-    'THZ': 'THZ',
    'TRADE MARK SIGN': 'TM',
    'TWELVE': '12',
    'TWO': '2'
 }
+special_keys = sorted(special, key=len, reverse=True)  # longest names first, so the first suffix match is the most specific
+units = [  # unit abbreviations in their proper case; compared upper-cased against a name's last word
+    'bar', 'Bq',
+    'cal', 'cd', 'cm',
+    'da', 'dB', 'dl',
+    'ffi', 'ffl', 'fm',
+    'GHz', 'GPa', 'Gy',
+    'ha', 'hPa', 'Hz',
+    'in',
+    'kA', 'kcal', 'kg', 'kHz', 'kl', 'km', 'kPa', 'kt', 'kV', 'kW',
+    'log', 'lm', 'ln', 'lx',
+    'mA', 'mg', 'MHz', 'mil', 'ml', 'mm', 'mol', 'MPa', 'ms', 'mV', 'mW',
+    'nA', 'nF', 'nm', 'ns', 'nV', 'nW',
+    'oV',
+    'Pa', 'pc', 'pH', 'PPM', 'ps', 'pV', 'pW',
+    'rad',
+    'sr', 'Sv',
+    'THz',
+    'Wb'
+]
+
+txt = ox.cache.readUrlUnicode('http://unicode.org/Public/UNIDATA/NamesList.txt')
+lines = txt.split('\n')
+length = len(lines)
+chars = {}
+sections = []
+types = []
+for i, line in enumerate(lines):
+    results = re.compile('^@@\t[0-9A-Z]{4}\t(.+)\t[0-9A-Z]{4}').findall(line)
+    if results:
+        # section header line: remember its name for the chars that follow
+        section = results[0].upper()
+        sections.append(section)
+    else:
+        results = re.compile('^@\t\t(.+)').findall(line)
+        if results:
+            # subheader line: remember it as the type of the chars that follow
+            type = results[0].upper()
+            types.append(type)
+        else:
+            results = re.compile('^([0-9A-Z]{4})\t(.+)').findall(line)
+            if results:
+                # character line: four-hex-digit code point, tab, name ('<...>' names are not kept)
+                char = unichr(int(results[0][0], 16))
+                name = results[0][1]
+                chars[char] = {
+                    'names': [] if name[0] == '<' else [name],
+                    'section': section,
+                    'type': type
+                }
+                if char == u'\uFFFF':  # must be a unicode literal: a plain '\uFFFF' is six ASCII chars in Python 2
+                    break  # stop here so alias lines of later (five-digit) code points are not attached to U+FFFF
+            else:
+                results = re.compile('^\t= (.+)').findall(line)
+                if results:
+                    # alias line: add each comma-separated alias to the most recent char
+                    for name in results[0].upper().split(', '):
+                        chars[char]['names'].append(name)
        
 html = ox.cache.readUrlUnicode('http://unicode.org/charts/uca/chart_Latin.html')
-
-results = re.compile("title='(.+):.+<tt>(.+)</tt>").findall(html)
+results = re.compile("title='(.+):.+<tt>([0-9A-Z]{4})</tt>").findall(html)
+no_ascii = []
 for result in results:
-    text = result[0].replace(' DIGRAPH', '')
    code = result[1]
-    char = ''
-    if ' WITH' in text:
-        text = text.split(' WITH')[0]
-    last = text.split(' ')[-1]
-    if len(last) <= 2:
-        char = last
-    else:
-        for string in special:
-            if text.endswith(string):
-                char = special[string]
+    if int(code, 16) > 127:
+        char = unichr(int(code, 16))
+        name = result[0]
+        words = name.split(' ')
+        ascii = ''
+        for key in special_keys:
+            if name == key or name.endswith(' ' + key):
+                ascii = special[key]
                break
-    if char:
-        if not char in ascii:
-            ascii[result[1]] = char
+        if not ascii:
+            for unit in units:  # no special-case match: try the unit abbreviations
+                if words[-1] == unit.upper():
+                    ascii = unit
+                    break
+            if not ascii:
+                name = re.sub(' WITH .+', '', name)
+                for word in remove:
+                    name = re.sub(' ' + word, '', name)
+                words = name.split(' ')
+                if len(words[-1]) <= 2:
+                    ascii = words[-1]
                else:
-        print result[0]
+                    no_ascii.append(name)
+        if ascii:
+            if 'SMALL' in words and 'CAPITAL' not in words:
+                ascii = ascii.lower()  # a name marked SMALL but not CAPITAL gets the lower-case form
+            chars[char]['ascii'] = ascii

 f = open('../../source/Ox.Unicode/json/Ox.Unicode.json', 'w')
-f.write(json.dumps(ascii, indent=4, sort_keys=True))
+f.write(json.dumps(chars, indent=4, sort_keys=True))
+f.close()
+
+f = open('json/no_ascii.json', 'w')
+f.write(json.dumps(sorted(no_ascii), indent=4))
 f.close()