replace unicode.py, again

This commit is contained in:
rolux 2011-10-29 19:17:14 +02:00
parent b6c872b5a4
commit 17b9d0ff73
2 changed files with 140 additions and 40 deletions

View file

@ -40,8 +40,8 @@
height: 256px; height: 256px;
} }
</style> </style>
<script src="../../../../build/jquery/jquery.js"></script> <script src="../../../../build/Ox.UI/jquery/jquery.js"></script>
<script src="../../../Ox.js"></script> <script src="../../../../build/Ox.js"></script>
<script> <script>
$(function() { $(function() {
var $body = $('body') var $body = $('body')

View file

@ -2,69 +2,169 @@ import json
import ox import ox
import re import re
ascii = {} remove = [
'ABOVE', 'BAR', 'BELOW', 'CEDILLA', 'DIGRAPH', 'LONGA', 'ROTUNDA'
chars = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ') ]
special = { special = {
'ACCOUNT OF': 'A/O', 'ACCOUNT OF': 'a/o',
'ADDRESSED TO THE SUBJECT': 'a/s',
'AM': 'a.m.',
'ANGSTROM SIGN': 'A', 'ANGSTROM SIGN': 'A',
'CARE OF': 'C/O', 'C OVER KG': 'c/kg',
'CADA UNA': 'c/u',
'CARE OF': 'c/o',
'CM CUBED': 'cm3',
'CM SQUARED': 'cm2',
'CO': 'Co.',
'DM CUBED': 'dm3',
'DM SQUARED': 'dm2',
'EIGHT': '8', 'EIGHT': '8',
'ELEVEN': '11', 'ELEVEN': '11',
'EULER CONSTANT': 'E', 'EULER CONSTANT': 'E',
'FACSIMILE SIGN': 'FAX',
'FEMININE ORDINAL INDICATOR': 'a',
'FIFTY': '50', 'FIFTY': '50',
'FIVE HUNDRED': '500',
'FIVE': '5', 'FIVE': '5',
'FIVE HUNDRED': '500',
'FOUR': '4', 'FOUR': '4',
'GHZ': 'GHZ', 'INFORMATION SOURCE': 'I',
'KCAL': 'kcal',
'KELVIN SIGN': 'K', 'KELVIN SIGN': 'K',
'KM CUBED': 'KM3', 'KK': 'K.K.',
'KM SQUARED': 'KM2', 'KM CAPITAL': 'KM',
'M CUBED': 'M3', 'KM CUBED': 'km3',
'M SQUARED': 'M2', 'KM SQUARED': 'km2',
'MHZ': 'MHZ', 'LATIN SMALL LETTER N PRECEDED BY APOSTROPHE': '\'n',
'MM CUBED': 'MM3', 'LIMITED LIABILITY SIGN': 'LTD',
'MM SQUARED': 'MM2', 'M CUBED': 'm3',
'MOL': 'MOL', 'M OVER S SQUARED': 'm/s2',
'MPA': 'MPA', 'M SQUARED': 'm2',
'MASCULINE ORDINAL INDICATOR': 'o',
'MB SMALL': 'mb',
'MM CUBED': 'mm3',
'MM SQUARED': 'mm2',
'MV MEGA': 'MV',
'MW MEGA': 'MW',
'NINE': '9', 'NINE': '9',
'ONE THOUSAND': '1000', 'NUMERO SIGN': 'No',
'ONE HUNDRED': '100',
'ONE': '1', 'ONE': '1',
'ONE HUNDRED': '100',
'ONE THOUSAND': '1000',
'PA AMPS': 'pA',
'PARTNERSHIP SIGN': 'PTE',
'PLANCK CONSTANT': 'h',
'PLANCK CONSTANT OVER PI': 'h',
'PLANCK CONSTANT OVER TWO PI': 'h',
'PM': 'p.m.',
'RAD OVER S SQUARED': 'rad/s2',
'RUPEE SIGN': 'Rs',
'S T': 'st',
'SERVICE MARK': 'SM',
'SEVEN': '7', 'SEVEN': '7',
'SIX': '6', 'SIX': '6',
'TELEPHONE SIGN': 'TEL', 'TELEPHONE SIGN': 'TEL',
'TEN': '10', 'TEN': '10',
'THREE': '3', 'THREE': '3',
'THZ': 'THZ',
'TRADE MARK SIGN': 'TM', 'TRADE MARK SIGN': 'TM',
'TWELVE': '12', 'TWELVE': '12',
'TWO': '2' 'TWO': '2'
} }
special_keys = sorted(special.keys(), key=lambda x: -len(x))
units = [
'bar', 'Bq',
'cal', 'cd', 'cm',
'da', 'dB', 'dl',
'ffi', 'ffl', 'fm',
'GHz', 'GPa', 'Gy',
'ha', 'hPa', 'Hz',
'in',
'kA', 'kcal', 'kg', 'KHz', 'kl', 'km', 'KPa', 'kt', 'kV', 'kW',
'log', 'lm', 'ln', 'lx',
'mA', 'mg', 'MHz', 'mil', 'ml', 'mm', 'mol', 'MPa', 'ms', 'mV', 'mW',
'nA', 'nF', 'nm', 'ns', 'nV', 'nW',
'oV',
'Pa', 'pc', 'pH', 'PPM', 'ps', 'pV', 'pW',
'rad',
'sr', 'Sv',
'THz',
'wb'
]
txt = ox.cache.readUrlUnicode('http://unicode.org/Public/UNIDATA/NamesList.txt')
lines = txt.split('\n')
length = len(lines)
chars = {}
sections = []
types = []
for i, line in enumerate(lines):
results = re.compile('^@@\t[0-9A-Z]{4}\t(.+)\t[0-9A-Z]{4}').findall(line)
if results:
# section
section = results[0].upper()
sections.append(section)
else:
results = re.compile('^@\t\t(.+)').findall(line)
if results:
# type
type = results[0].upper()
types.append(type)
else:
results = re.compile('^([0-9A-Z]{4})\t(.+)').findall(line)
if results:
# char + name
char = unichr(int(results[0][0], 16))
name = results[0][1]
chars[char] = {
'names': [] if name[0] == '<' else [name],
'section': section,
'type': type
}
if char == '\uFFFF':
break
else:
results = re.compile('^\t= (.+)').findall(line)
if results:
# name
for name in results[0].upper().split(', '):
chars[char]['names'].append(name)
html = ox.cache.readUrlUnicode('http://unicode.org/charts/uca/chart_Latin.html') html = ox.cache.readUrlUnicode('http://unicode.org/charts/uca/chart_Latin.html')
results = re.compile("title='(.+):.+<tt>([0-9A-Z]{4})</tt>").findall(html)
results = re.compile("title='(.+):.+<tt>(.+)</tt>").findall(html) no_ascii = []
for result in results: for result in results:
text = result[0].replace(' DIGRAPH', '')
code = result[1] code = result[1]
char = '' if int(code, 16) > 127:
if ' WITH' in text: char = unichr(int(code, 16))
text = text.split(' WITH')[0] name = result[0]
last = text.split(' ')[-1] words = name.split(' ')
if len(last) <= 2: ascii = ''
char = last for key in special_keys:
else: if name == key or name.endswith(' ' + key):
for string in special: ascii = special[key]
if text.endswith(string):
char = special[string]
break break
if char: if not ascii:
if not char in ascii: for unit in units:
ascii[result[1]] = char if words[-1] == unit.upper():
else: ascii = unit
print result[0] break;
if not ascii:
name = re.sub(' WITH .+', '', name)
for word in remove:
name = re.sub(' ' + word, '', name)
words = name.split(' ')
if len(words[-1]) <= 2:
ascii = words[-1]
else:
no_ascii.append(name)
if ascii:
if 'SMALL' in words and not 'CAPITAL' in words:
ascii = ascii.lower()
chars[char]['ascii'] = ascii
f = open('../../source/Ox.Unicode/json/Ox.Unicode.json', 'w') f = open('../../source/Ox.Unicode/json/Ox.Unicode.json', 'w')
f.write(json.dumps(ascii, indent=4, sort_keys=True)) f.write(json.dumps(chars, indent=4, sort_keys=True))
f.close()
f = open('json/no_ascii.json', 'w')
f.write(json.dumps(sorted(no_ascii), indent=4))
f.close() f.close()