oxjs/tools/unicode/unicode.py

70 lines
1.6 KiB
Python
Raw Normal View History

2011-10-27 18:54:34 +00:00
import json
import ox
import re
# Result table: filled in further down, keyed by the code captured from the
# chart, with the ASCII replacement string as the value.
ascii = dict()

# The 26 uppercase ASCII letters, one list element per letter.
chars = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

# Character-name suffixes whose ASCII replacement cannot be read off the
# last word of the name (numerals, unit symbols, letter-like signs).
# Some keys are suffixes of other keys ('M CUBED' of 'KM CUBED' and
# 'MM CUBED', 'M SQUARED' of 'KM SQUARED' and 'MM SQUARED'), so code that
# matches names by suffix has to take care which key it tries first.
# Entry order is kept exactly as it was.
special = {
    'ACCOUNT OF': 'A/O',
    'ANGSTROM SIGN': 'A',
    'CARE OF': 'C/O',
    'EIGHT': '8',
    'ELEVEN': '11',
    'EULER CONSTANT': 'E',
    'FIFTY': '50',
    'FIVE HUNDRED': '500',
    'FIVE': '5',
    'FOUR': '4',
    'GHZ': 'GHZ',
    'KELVIN SIGN': 'K',
    'KM CUBED': 'KM3',
    'KM SQUARED': 'KM2',
    'M CUBED': 'M3',
    'M SQUARED': 'M2',
    'MHZ': 'MHZ',
    'MM CUBED': 'MM3',
    'MM SQUARED': 'MM2',
    'MOL': 'MOL',
    'MPA': 'MPA',
    'NINE': '9',
    'ONE THOUSAND': '1000',
    'ONE HUNDRED': '100',
    'ONE': '1',
    'SEVEN': '7',
    'SIX': '6',
    'TELEPHONE SIGN': 'TEL',
    'TEN': '10',
    'THREE': '3',
    'THZ': 'THZ',
    'TRADE MARK SIGN': 'TM',
    'TWELVE': '12',
    'TWO': '2',
}
def _ascii_replacement(name, suffix_table):
    """Return the ASCII stand-in for a Unicode character name.

    name         -- character name as captured from the chart's title
                    attribute, e.g. 'LATIN CAPITAL LETTER A WITH GRAVE'.
    suffix_table -- sequence of (suffix, replacement) pairs, ordered with
                    the longest suffix first.

    Returns the replacement string, or '' when none can be derived.
    """
    # Only the base name matters: drop the digraph marker and everything
    # from ' WITH' onwards (split() returns the whole string unchanged when
    # ' WITH' is absent).
    base = name.replace(' DIGRAPH', '').split(' WITH')[0]
    last = base.split(' ')[-1]
    # A final word of at most two characters is used as the replacement
    # directly.
    if len(last) <= 2:
        return last
    # Otherwise fall back to the suffix table. Because the table is ordered
    # longest-first, a name ending in 'KM CUBED' yields 'KM3' and can never
    # be claimed by the shorter suffix 'M CUBED'.
    for suffix, replacement in suffix_table:
        if base.endswith(suffix):
            return replacement
    return ''


# Longest suffixes first, computed once: iterating the dict directly would
# make the winner between overlapping suffixes depend on dict ordering.
_suffix_table = sorted(
    special.items(), key=lambda item: len(item[0]), reverse=True
)

# Fetch the chart and pull out (character name, code) pairs: the name comes
# from the title attribute, the code from the <tt> element on the same line.
html = ox.cache.readUrlUnicode('http://unicode.org/charts/uca/chart_Latin.html')
results = re.compile("title='(.+):.+<tt>(.+)</tt>").findall(html)
for name, code in results:
    char = _ascii_replacement(name, _suffix_table)
    if char:
        # NOTE(review): ascii is keyed by code, so this test compares a
        # replacement string against code keys; kept unchanged to preserve
        # the generated output -- confirm what was intended.
        if char not in ascii:
            ascii[code] = char
    else:
        # No replacement could be derived: report the name on stdout.
        print(name)

# Write the table as sorted, indented JSON; the with-block closes the file
# even if serialising or writing raises.
with open('../../source/Ox.Unicode/json/Ox.Unicode.json', 'w') as f:
    f.write(json.dumps(ascii, indent=4, sort_keys=True))