oxjs/tools/unicode/unicode.py

import json
import ox
import re

# Maps the text captured from each chart entry's <tt> element to its ASCII
# replacement. Filled in by main() and then dumped to OUTPUT_PATH.
ascii = {}

# NOTE(review): nothing in this script reads `chars`; it is kept only so the
# module's top-level names stay the same.
chars = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

# Character-name endings whose final word is longer than two characters,
# together with the ASCII text to emit for them.
special = {
    'ACCOUNT OF': 'A/O',
    'ANGSTROM SIGN': 'A',
    'CARE OF': 'C/O',
    'EIGHT': '8',
    'ELEVEN': '11',
    'EULER CONSTANT': 'E',
    'FIFTY': '50',
    'FIVE HUNDRED': '500',
    'FIVE': '5',
    'FOUR': '4',
    'GHZ': 'GHZ',
    'KELVIN SIGN': 'K',
    'KM CUBED': 'KM3',
    'KM SQUARED': 'KM2',
    'M CUBED': 'M3',
    'M SQUARED': 'M2',
    'MHZ': 'MHZ',
    'MM CUBED': 'MM3',
    'MM SQUARED': 'MM2',
    'MOL': 'MOL',
    'MPA': 'MPA',
    'NINE': '9',
    'ONE THOUSAND': '1000',
    'ONE HUNDRED': '100',
    'ONE': '1',
    'SEVEN': '7',
    'SIX': '6',
    'TELEPHONE SIGN': 'TEL',
    'TEN': '10',
    'THREE': '3',
    'THZ': 'THZ',
    'TRADE MARK SIGN': 'TM',
    'TWELVE': '12',
    'TWO': '2',
}

# Several keys are suffixes of other keys ('M CUBED' of 'KM CUBED' and
# 'MM CUBED', 'M SQUARED' of 'KM SQUARED' and 'MM SQUARED'). Trying the
# longest suffix first makes the lookup independent of dict ordering.
_SUFFIXES_LONGEST_FIRST = sorted(special, key=len, reverse=True)

CHART_URL = 'http://unicode.org/charts/uca/chart_Latin.html'
OUTPUT_PATH = '../../source/Ox.Unicode/json/Ox.Unicode.json'

# Group 1: the character name (text between "title='" and a ':').
# Group 2: the content of the <tt> element that follows on the same line.
ENTRY_PATTERN = re.compile("title='(.+):.+<tt>(.+)</tt>")


def transliterate(name):
    """Return the ASCII replacement for a Unicode character name.

    ' DIGRAPH' is removed and everything from the first ' WITH' onwards is
    dropped. If the last remaining word has at most two characters it is the
    result; otherwise the longest key of `special` that the text ends with
    decides. Returns '' when nothing matches.
    """
    text = name.replace(' DIGRAPH', '')
    if ' WITH' in text:
        text = text.split(' WITH')[0]
    last = text.split(' ')[-1]
    if len(last) <= 2:
        return last
    for suffix in _SUFFIXES_LONGEST_FIRST:
        if text.endswith(suffix):
            return special[suffix]
    return ''


def build_mapping(html):
    """Extract a {tt text: ASCII replacement} dict from the chart markup.

    Returns a tuple `(mapping, unmatched)`; `unmatched` lists, in document
    order, the character names for which transliterate() returned ''.
    """
    mapping = {}
    unmatched = []
    for name, code in ENTRY_PATTERN.findall(html):
        char = transliterate(name)
        if not char:
            unmatched.append(name)
        # NOTE(review): the original guard tested `char in ascii`, i.e. the
        # replacement text against a dict keyed by `code`. The guard is
        # assumed to mean "keep the first entry seen for each key".
        elif code not in mapping:
            mapping[code] = char
    return mapping, unmatched


def main():
    """Fetch the chart, report unmatched names and write the JSON file."""
    # Presumably returns the page as decoded text - verify against python-ox.
    html = ox.cache.readUrlUnicode(CHART_URL)
    mapping, unmatched = build_mapping(html)
    for name in unmatched:
        print(name)
    ascii.update(mapping)
    # The context manager closes the file even if serialising fails.
    with open(OUTPUT_PATH, 'w') as f:
        f.write(json.dumps(ascii, indent=4, sort_keys=True))


if __name__ == '__main__':
    main()
remove build tree 2011-10-27 18:54:34 +00:00			`import json`
			`import ox`
			`import re`

			`ascii = {}`

			`chars = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')`
			`special = {`
			`'ACCOUNT OF': 'A/O',`
			`'ANGSTROM SIGN': 'A',`
			`'CARE OF': 'C/O',`
			`'EIGHT': '8',`
			`'ELEVEN': '11',`
			`'EULER CONSTANT': 'E',`
			`'FIFTY': '50',`
			`'FIVE HUNDRED': '500',`
			`'FIVE': '5',`
			`'FOUR': '4',`
			`'GHZ': 'GHZ',`
			`'KELVIN SIGN': 'K',`
			`'KM CUBED': 'KM3',`
			`'KM SQUARED': 'KM2',`
			`'M CUBED': 'M3',`
			`'M SQUARED': 'M2',`
			`'MHZ': 'MHZ',`
			`'MM CUBED': 'MM3',`
			`'MM SQUARED': 'MM2',`
			`'MOL': 'MOL',`
			`'MPA': 'MPA',`
			`'NINE': '9',`
			`'ONE THOUSAND': '1000',`
			`'ONE HUNDRED': '100',`
			`'ONE': '1',`
			`'SEVEN': '7',`
			`'SIX': '6',`
			`'TELEPHONE SIGN': 'TEL',`
			`'TEN': '10',`
			`'THREE': '3',`
			`'THZ': 'THZ',`
			`'TRADE MARK SIGN': 'TM',`
			`'TWELVE': '12',`
			`'TWO': '2'`
			`}`

			`html = ox.cache.readUrlUnicode('http://unicode.org/charts/uca/chart_Latin.html')`

			`results = re.compile("title='(.+):.+<tt>(.+)</tt>").findall(html)`
			`for result in results:`
			`text = result[0].replace(' DIGRAPH', '')`
			`code = result[1]`
			`char = ''`
			`if ' WITH' in text:`
			`text = text.split(' WITH')[0]`
			`last = text.split(' ')[-1]`
			`if len(last) <= 2:`
			`char = last`
			`else:`
			`for string in special:`
			`if text.endswith(string):`
			`char = special[string]`
			`break`
			`if char:`
			`if not char in ascii:`
			`ascii[result[1]] = char`
			`else:`
			`print result[0]`

			`f = open('../../source/Ox.Unicode/json/Ox.Unicode.json', 'w')`
			`f.write(json.dumps(ascii, indent=4, sort_keys=True))`
			`f.close()`