70 lines
No EOL
1.6 KiB
Python
70 lines
No EOL
1.6 KiB
Python
import json
|
|
import ox
|
|
import re
|
|
|
|
# Result table, filled in by the parsing loop later in this script and then
# dumped as JSON. Keys are the second regex capture of each chart entry
# (held in `code`), values are the ASCII replacement strings.
# NOTE: this name shadows the Python 3 builtin `ascii`; it is kept because
# the rest of the script refers to it.
ascii = dict()

# The 26 uppercase ASCII letters, 'A' through 'Z', one per list element.
chars = list(map(chr, range(ord('A'), ord('Z') + 1)))

# Name suffix -> ASCII replacement, for character names whose last word is
# longer than two characters. The lookup loop tests these keys with
# str.endswith() against the (trimmed) character name.
# Key order is left exactly as authored because that loop iterates this dict.
special = {
    'ACCOUNT OF':      'A/O',
    'ANGSTROM SIGN':   'A',
    'CARE OF':         'C/O',
    'EIGHT':           '8',
    'ELEVEN':          '11',
    'EULER CONSTANT':  'E',
    'FIFTY':           '50',
    'FIVE HUNDRED':    '500',
    'FIVE':            '5',
    'FOUR':            '4',
    'GHZ':             'GHZ',
    'KELVIN SIGN':     'K',
    'KM CUBED':        'KM3',
    'KM SQUARED':      'KM2',
    'M CUBED':         'M3',
    'M SQUARED':       'M2',
    'MHZ':             'MHZ',
    'MM CUBED':        'MM3',
    'MM SQUARED':      'MM2',
    'MOL':             'MOL',
    'MPA':             'MPA',
    'NINE':            '9',
    'ONE THOUSAND':    '1000',
    'ONE HUNDRED':     '100',
    'ONE':             '1',
    'SEVEN':           '7',
    'SIX':             '6',
    'TELEPHONE SIGN':  'TEL',
    'TEN':             '10',
    'THREE':           '3',
    'THZ':             'THZ',
    'TRADE MARK SIGN': 'TM',
    'TWELVE':          '12',
    'TWO':             '2',
}
|
|
|
|
# Fetch the unicode.org UCA chart page for Latin (through ox's cache layer).
html = ox.cache.readUrlUnicode('http://unicode.org/charts/uca/chart_Latin.html')

# Each match is a pair: the text before the colon inside a title='...'
# attribute (used below as the character name) and the content of the
# following <tt> element (presumably the code point -- confirm against the
# chart markup).
results = re.compile("title='(.+):.+<tt>(.+)</tt>").findall(html)

for result in results:
    name = result[0]
    code = result[1]

    # Trim the name down to its base description: remove the ' DIGRAPH'
    # marker and everything from the first ' WITH' onwards.
    text = name.replace(' DIGRAPH', '')
    if ' WITH' in text:
        text = text.split(' WITH')[0]
    last = text.split(' ')[-1]

    char = ''
    if len(last) <= 2:
        # A final word of one or two characters is taken as the replacement
        # itself.
        char = last
    else:
        # Several keys of `special` can be suffixes of the same name
        # (e.g. 'M CUBED' and 'MM CUBED'). Taking the first hit made the
        # result depend on dict iteration order, so pick the longest
        # matching key instead; two distinct suffixes of one string never
        # have the same length, so the choice is unambiguous.
        matches = [string for string in special if text.endswith(string)]
        if matches:
            char = special[max(matches, key=len)]

    if not char:
        # No replacement found: report the full name so it can be handled.
        # Parenthesised single-argument form works as both the Python 2
        # statement and the Python 3 function.
        print(name)
    elif char not in ascii:
        # NOTE(review): `ascii` is keyed by `code`, so testing `char`
        # against its keys looks unintended (perhaps `code not in ascii`
        # was meant). The original condition is preserved here.
        ascii[code] = char
|
|
|
|
# Serialise the collected table as pretty-printed JSON with sorted keys.
# The context manager closes the file even if serialisation or the write
# raises, which the manual open()/close() pair did not guarantee.
with open('../../source/Ox.Unicode/json/Ox.Unicode.json', 'w') as f:
    f.write(json.dumps(ascii, indent=4, sort_keys=True))