"""Generate Ox.Unicode.json, a table of ASCII replacements for Latin characters.

The script downloads the Unicode collation chart for the Latin script,
derives an ASCII replacement from each character name found in the chart
(for example the trailing letter of the name, or an entry of ``special``),
and writes the resulting table as JSON.  Names for which no replacement
can be derived are printed so that ``special`` can be extended.
"""
import json
import ox
import re

# Result table: captured code -> ASCII replacement.
ascii = {}

# Replacements for names whose last word is longer than two characters.
# A name is matched against these keys as suffixes.
special = {
    'ACCOUNT OF': 'A/O',
    'ANGSTROM SIGN': 'A',
    'CARE OF': 'C/O',
    'EIGHT': '8',
    'ELEVEN': '11',
    'EULER CONSTANT': 'E',
    'FIFTY': '50',
    'FIVE HUNDRED': '500',
    'FIVE': '5',
    'FOUR': '4',
    'GHZ': 'GHZ',
    'KELVIN SIGN': 'K',
    'KM CUBED': 'KM3',
    'KM SQUARED': 'KM2',
    'M CUBED': 'M3',
    'M SQUARED': 'M2',
    'MHZ': 'MHZ',
    'MM CUBED': 'MM3',
    'MM SQUARED': 'MM2',
    'MOL': 'MOL',
    'MPA': 'MPA',
    'NINE': '9',
    'ONE THOUSAND': '1000',
    'ONE HUNDRED': '100',
    'ONE': '1',
    'SEVEN': '7',
    'SIX': '6',
    'TELEPHONE SIGN': 'TEL',
    'TEN': '10',
    'THREE': '3',
    'THZ': 'THZ',
    'TRADE MARK SIGN': 'TM',
    'TWELVE': '12',
    'TWO': '2',
}

# Longest suffixes first: 'M CUBED' is itself a suffix of 'KM CUBED' and
# 'MM CUBED' (likewise for the SQUARED keys), so taking the first hit in
# plain dict order could pick the shorter, wrong entry.
_suffixes = sorted(special, key=len, reverse=True)


def _replacement(name):
    """Return the ASCII replacement for a character name, or '' if none.

    ' DIGRAPH' is removed and everything from the first ' WITH' onwards is
    dropped.  If the last remaining word has at most two characters it is
    the replacement; otherwise the longest key of ``special`` that the
    remaining text ends with decides.
    """
    text = name.replace(' DIGRAPH', '')
    if ' WITH' in text:
        text = text.split(' WITH')[0]
    last = text.split(' ')[-1]
    if len(last) <= 2:
        return last
    for suffix in _suffixes:
        if text.endswith(suffix):
            return special[suffix]
    return ''


html = ox.cache.readUrlUnicode('http://unicode.org/charts/uca/chart_Latin.html')

# NOTE(review): both '.+' before the second group are greedy, so that group
# can only ever capture the final character of the matched line -- confirm
# this is the intended "code" and that the pattern was not truncated.
results = re.compile("title='(.+):.+(.+)").findall(html)

for result in results:
    code = result[1]
    char = _replacement(result[0])
    if not char:
        # No replacement could be derived: report the full original name.
        print(result[0])
        continue
    # NOTE(review): this tests the replacement string against a table that
    # is keyed by code; 'code not in ascii' may have been intended.  The
    # original condition is kept unchanged.
    if char not in ascii:
        ascii[code] = char

# 'with' closes the file even if serialising or writing fails.
with open('../../source/Ox.Unicode/json/Ox.Unicode.json', 'w') as f:
    f.write(json.dumps(ascii, indent=4, sort_keys=True))