155 lines
6.1 KiB
Python
155 lines
6.1 KiB
Python
# util.py - common utility functions
|
||
# coding: utf-8
|
||
#
|
||
# Copyright (C) 2012, 2013 Arthur de Jong
|
||
#
|
||
# This library is free software; you can redistribute it and/or
|
||
# modify it under the terms of the GNU Lesser General Public
|
||
# License as published by the Free Software Foundation; either
|
||
# version 2.1 of the License, or (at your option) any later version.
|
||
#
|
||
# This library is distributed in the hope that it will be useful,
|
||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
# Lesser General Public License for more details.
|
||
#
|
||
# You should have received a copy of the GNU Lesser General Public
|
||
# License along with this library; if not, write to the Free Software
|
||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
||
# 02110-1301 USA
|
||
|
||
"""Common utility functions for other stdnum modules.
|
||
|
||
This module is meant for internal use by stdnum modules and is not
|
||
guaranteed to remain stable and as such not part of the public API of
|
||
stdnum.
|
||
"""
|
||
|
||
import pkgutil
|
||
import pydoc
|
||
import re
|
||
import sys
|
||
import unicodedata
|
||
|
||
from stdnum.exceptions import *
|
||
|
||
|
||
# Matches everything from the first line that starts with a doctest prompt
# ('>>> ') up to the very end of the text.
_strip_doctest_re = re.compile(r'^>>> .*\Z', re.DOTALL | re.MULTILINE)
|
||
|
||
|
||
def _mk_char_map(mapping):
    """Transform a dictionary with comma separated Unicode character names
    as keys into (character, value) tuples, one per named character.

    Names that unicodedata does not know are silently skipped."""
    for names, value in mapping.items():
        for name in names.split(','):
            # keep the try body limited to the lookup that can fail
            try:
                char = unicodedata.lookup(name)
            except KeyError:  # pragma: no cover (does not happen on Python3)
                continue
            yield (char, value)
|
||
|
||
|
||
# Build a mapping of Unicode characters to equivalent ASCII characters. Each
# key is a comma separated list of Unicode character names and each value is
# the ASCII character that those characters are replaced with.
_char_map = dict(_mk_char_map({
    'HYPHEN-MINUS,ARMENIAN HYPHEN,HEBREW PUNCTUATION MAQAF,HYPHEN,'
    'NON-BREAKING HYPHEN,FIGURE DASH,EN DASH,EM DASH,HORIZONTAL BAR,'
    'SMALL HYPHEN-MINUS,FULLWIDTH HYPHEN-MINUS,MONGOLIAN NIRUGU,OVERLINE,'
    'HYPHEN BULLET,MACRON,MODIFIER LETTER MINUS SIGN,FULLWIDTH MACRON,'
    'OGHAM SPACE MARK,SUPERSCRIPT MINUS,SUBSCRIPT MINUS,MINUS SIGN,'
    'HORIZONTAL LINE EXTENSION,HORIZONTAL SCAN LINE-1,HORIZONTAL SCAN LINE-3,'
    'HORIZONTAL SCAN LINE-7,HORIZONTAL SCAN LINE-9,STRAIGHTNESS': '-',
    'ASTERISK,ARABIC FIVE POINTED STAR,SYRIAC HARKLEAN ASTERISCUS,'
    'FLOWER PUNCTUATION MARK,VAI FULL STOP,SMALL ASTERISK,FULLWIDTH ASTERISK,'
    'ASTERISK OPERATOR,STAR OPERATOR,HEAVY ASTERISK,LOW ASTERISK,'
    'OPEN CENTRE ASTERISK,EIGHT SPOKED ASTERISK,SIXTEEN POINTED ASTERISK,'
    'TEARDROP-SPOKED ASTERISK,OPEN CENTRE TEARDROP-SPOKED ASTERISK,'
    'HEAVY TEARDROP-SPOKED ASTERISK,EIGHT TEARDROP-SPOKED PROPELLER ASTERISK,'
    'HEAVY EIGHT TEARDROP-SPOKED PROPELLER ASTERISK': '*',
    'COMMA,ARABIC COMMA,SINGLE LOW-9 QUOTATION MARK,IDEOGRAPHIC COMMA,'
    'ARABIC DECIMAL SEPARATOR,ARABIC THOUSANDS SEPARATOR,PRIME,RAISED COMMA,'
    'PRESENTATION FORM FOR VERTICAL COMMA,SMALL COMMA,'
    'SMALL IDEOGRAPHIC COMMA,FULLWIDTH COMMA,CEDILLA': ',',
    'FULL STOP,MIDDLE DOT,GREEK ANO TELEIA,ARABIC FULL STOP,'
    'IDEOGRAPHIC FULL STOP,SYRIAC SUPRALINEAR FULL STOP,'
    'SYRIAC SUBLINEAR FULL STOP,SAMARITAN PUNCTUATION NEQUDAA,'
    'TIBETAN MARK INTERSYLLABIC TSHEG,TIBETAN MARK DELIMITER TSHEG BSTAR,'
    'RUNIC SINGLE PUNCTUATION,BULLET,ONE DOT LEADER,HYPHENATION POINT,'
    'WORD SEPARATOR MIDDLE DOT,RAISED DOT,KATAKANA MIDDLE DOT,'
    'SMALL FULL STOP,FULLWIDTH FULL STOP,HALFWIDTH KATAKANA MIDDLE DOT,'
    'AEGEAN WORD SEPARATOR DOT,PHOENICIAN WORD SEPARATOR,'
    'KHAROSHTHI PUNCTUATION DOT,DOT ABOVE,ARABIC SYMBOL DOT ABOVE,'
    'ARABIC SYMBOL DOT BELOW,BULLET OPERATOR,DOT OPERATOR': '.',
    'SOLIDUS,SAMARITAN PUNCTUATION ARKAANU,FULLWIDTH SOLIDUS,DIVISION SLASH,'
    'MATHEMATICAL RISING DIAGONAL,BIG SOLIDUS,FRACTION SLASH': '/',
    'COLON,ETHIOPIC WORDSPACE,RUNIC MULTIPLE PUNCTUATION,MONGOLIAN COLON,'
    'PRESENTATION FORM FOR VERTICAL COLON,FULLWIDTH COLON,'
    'PRESENTATION FORM FOR VERTICAL TWO DOT LEADER,SMALL COLON': ':',
    'SPACE,NO-BREAK SPACE,EN QUAD,EM QUAD,EN SPACE,EM SPACE,'
    'THREE-PER-EM SPACE,FOUR-PER-EM SPACE,SIX-PER-EM SPACE,FIGURE SPACE,'
    'PUNCTUATION SPACE,THIN SPACE,HAIR SPACE,NARROW NO-BREAK SPACE,'
    'MEDIUM MATHEMATICAL SPACE,IDEOGRAPHIC SPACE': ' ',
}))
|
||
|
||
|
||
def _clean_chars(number):
    """Replace various Unicode characters with their ASCII counterpart.

    Characters that have no entry in _char_map are kept unchanged."""
    translated = [_char_map.get(char, char) for char in number]
    return ''.join(translated)
|
||
|
||
|
||
def clean(number, deletechars=''):
    """Remove the specified characters from the supplied number.

    The number is first joined into a single string and various Unicode
    characters are replaced by their ASCII counterpart (see _clean_chars())
    before every character that occurs in deletechars is dropped. An
    InvalidFormat exception is raised if the number cannot be joined into
    a string (for instance because it is not iterable).

    >>> clean('123-456:78 9', ' -:')
    '123456789'
    >>> clean('1–2—3―4')
    '1-2-3-4'
    """
    try:
        number = ''.join(x for x in number)
    except Exception:
        # only translate real errors; KeyboardInterrupt and SystemExit
        # must keep propagating
        raise InvalidFormat()
    # compare the numeric major version; comparing sys.version as a string
    # would misclassify a major version of 10 or higher
    if sys.version_info[0] < 3 and isinstance(number, str):  # pragma: no cover (Python 2/3 specific code)
        # byte strings: try ASCII first and fall back to UTF-8, leaving the
        # value untouched if neither decoding works
        try:
            number = _clean_chars(number.decode()).encode()
        except UnicodeError:
            try:
                number = _clean_chars(number.decode('utf-8')).encode('utf-8')
            except UnicodeError:
                pass
    else:  # pragma: no cover (Python 2/3 specific code)
        number = _clean_chars(number)
    return ''.join(x for x in number if x not in deletechars)
|
||
|
||
|
||
def get_number_modules(base='stdnum'):
    """Yield all the modules under the specified package that can validate.

    The package named by base is imported, all modules and packages below
    it are imported in turn and every one that has a validate attribute is
    yielded as a module object. Errors reported by pkgutil.walk_packages()
    while it walks the packages are ignored."""
    __import__(base)
    package = sys.modules[base]
    for _loader, name, _is_pkg in pkgutil.walk_packages(
            package.__path__, package.__name__ + '.',
            onerror=lambda x: None):
        __import__(name)
        # __import__() returns the top-level package, so fetch the module
        # that was actually loaded from sys.modules
        candidate = sys.modules[name]
        if hasattr(candidate, 'validate'):
            yield candidate
|
||
|
||
|
||
def get_module_name(module):
    """Return the short description of the number.

    This is the summary line of the module docstring with any leading and
    trailing full stops removed."""
    summary, _ = pydoc.splitdoc(pydoc.getdoc(module))
    return summary.strip('.')
|
||
|
||
|
||
def get_module_description(module):
    """Return a description of the number.

    The description is the part of the module docstring that follows the
    summary line, with the doctests removed and surrounding whitespace
    stripped. The result is a single string."""
    doc = pydoc.splitdoc(pydoc.getdoc(module))[1]
    # remove the doctests (pass the whole description, not one character)
    return _strip_doctest_re.sub('', doc).strip()
|
||
|
||
|
||
def get_module_list():
    """Yield one formatted list line for each number module.

    Every line has the form ' * <name>: <short description>' where the name
    is the module name with 'stdnum.' removed."""
    for module in get_number_modules():
        short_name = module.__name__.replace('stdnum.', '')
        title = get_module_name(module)
        yield ' * %s: %s' % (short_name, title)
|