# oxjs/tools/unicode/unicode.py

import json
import ox
import re

# Result table filled in by the chart-parsing loop further down: the key is
# the string captured from the chart's <tt> element, the value is the ASCII
# replacement text. It is dumped to JSON at the end of the script.
# NOTE: the name shadows the Python 3 builtin ascii().
ascii = {}

# The 26 uppercase ASCII letters as a list of one-character strings.
# NOTE(review): not referenced anywhere else in the visible script.
chars = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
# Replacement text for character names whose last word is longer than two
# characters; the lookup tests each key as a suffix of the trimmed name.
# NOTE: some keys are suffixes of other keys ('M CUBED' of 'KM CUBED' and
# 'MM CUBED', 'M SQUARED' of 'KM SQUARED' and 'MM SQUARED'), so which entry
# wins for such names depends on the order in which the keys are tried.
special = {
    'ACCOUNT OF': 'A/O',
    'ANGSTROM SIGN': 'A',
    'CARE OF': 'C/O',
    'EIGHT': '8',
    'ELEVEN': '11',
    'EULER CONSTANT': 'E',
    'FIFTY': '50',
    'FIVE HUNDRED': '500',
    'FIVE': '5',
    'FOUR': '4',
    'GHZ': 'GHZ',
    'KELVIN SIGN': 'K',
    'KM CUBED': 'KM3',
    'KM SQUARED': 'KM2',
    'M CUBED': 'M3',
    'M SQUARED': 'M2',
    'MHZ': 'MHZ',
    'MM CUBED': 'MM3',
    'MM SQUARED': 'MM2',
    'MOL': 'MOL',
    'MPA': 'MPA',
    'NINE': '9',
    'ONE THOUSAND': '1000',
    'ONE HUNDRED': '100',
    'ONE': '1',
    'SEVEN': '7',
    'SIX': '6',
    'TELEPHONE SIGN': 'TEL',
    'TEN': '10',
    'THREE': '3',
    'THZ': 'THZ',
    'TRADE MARK SIGN': 'TM',
    'TWELVE': '12',
    'TWO': '2'
}

# Fetch the UCA Latin collation chart through ox's URL cache.
html = ox.cache.readUrlUnicode('http://unicode.org/charts/uca/chart_Latin.html')

# Try the longest keys of `special` first. Some keys are suffixes of others
# ('M CUBED' of 'KM CUBED' / 'MM CUBED', same for the SQUARED variants), and
# plain dict iteration would let the shorter key win: dict order is arbitrary
# on older Pythons and insertion order on 3.7+, where 'M CUBED' is listed
# before 'MM CUBED'. Keys of equal length cannot be suffixes of one another,
# so ties need no further ordering.
suffixes = sorted(special, key=len, reverse=True)

# Each match is a pair: (text between "title='" and the last ':' before the
# <tt> element, contents of that <tt> element). The first is treated as the
# character name, the second is used as the key of `ascii`.
results = re.compile(r"title='(.+):.+<tt>(.+)</tt>").findall(html)
for name, code in results:
    # Reduce the name to its base form: drop ' DIGRAPH' and cut everything
    # from the first ' WITH' on (split() returns the whole string unchanged
    # when ' WITH' does not occur).
    text = name.replace(' DIGRAPH', '').split(' WITH')[0]
    last = text.split(' ')[-1]
    char = ''
    if len(last) <= 2:
        # A final word of one or two characters is taken as the replacement
        # itself (e.g. a name ending in 'LETTER A' yields 'A').
        char = last
    else:
        for suffix in suffixes:
            if text.endswith(suffix):
                char = special[suffix]
                break
    if char:
        # NOTE(review): this tests the replacement text against the keys of
        # `ascii`, which are the <tt> code strings, so it compares two
        # different kinds of values. The intent is unclear (possibly meant
        # `code not in ascii`); kept as in the original so the generated
        # table does not change.
        if char not in ascii:
            ascii[code] = char
    else:
        # No replacement found: report the full name so it can be reviewed.
        # Parenthesised single-argument print works on Python 2 and 3 alike.
        print(name)

# Serialise first, so that a failure in json.dumps cannot leave the target
# file truncated, then write inside a `with` block so the handle is closed
# even if the write raises. The output is pretty-printed (indent=4) with
# sorted keys. The path is relative to the current working directory.
output = json.dumps(ascii, indent=4, sort_keys=True)
with open('../../source/Ox.Unicode/json/Ox.Unicode.json', 'w') as f:
    f.write(output)