From 9b4cb6fe86f174f31fa69ad9c75ce6ad1f6f9c2c Mon Sep 17 00:00:00 2001 From: rlx Date: Wed, 2 Aug 2017 19:46:40 +0200 Subject: [PATCH] better is_asian_name --- ox/text.py | 48 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/ox/text.py b/ox/text.py index 5299607..a411f92 100644 --- a/ox/text.py +++ b/ox/text.py @@ -65,7 +65,7 @@ ASIAN_FIRST_NAMES = [ ] # see http://en.wikipedia.org/wiki/List_of_common_Chinese_surnames # and http://en.wikipedia.org/wiki/List_of_Korean_family_names -ASIAN_NAMES = [ +ASIAN_LAST_NAMES = [ 'chan', 'chang', 'chao', 'chen', 'cheong', 'cheung', 'chong', 'choo', @@ -306,11 +306,34 @@ def get_sort_name(name): return len(first_names) and first_names[-1].lower() in names if is_asian_name(name): + hyphen = -1 names = name.split(' ') - if '-' in names[0]: - firstnames = names[0].split('-') - names[0] = '-'.join([firstnames[0], firstnames[1].lower()]) - return names[-1] + ' ' + ' '.join(names[:-1]) + if len(names) == 2: + + for i, name in enumerate(names): + if '-' in name: + return + names = name.replace('-').split(' ') + if len(names) == 2: + if names[0].lower() in ASIAN_LAST_NAMES: + lastname, firstname = names + else: + firstname, lastname = names + else: + names_ = name.split(' ') + if '-' in names_[0]: + lastname, firstname = [names[2], names[0] + '-' + names[1].lower()] + elif '-' in names_[1]: + lastname, firstname = [names[0], names[1] = '-' + names[2].lower()] + elif names[0] in ASIAN_FIRST_NAMES and names[2] not in ASIAN_FIRST_NAMES: + lastname, firstname = [names[2], names[0] + ' ' + names[1]] + elif names[0] not in ASIAN_FIRST_NAMES and names[2] in ASIAN_FIRST_NAMES: + lastname, firstname = [names[0], names[1] + ' ' + names[2]] + elif names[0] in ASIAN_LAST_NAMES: + lastname, firstname = [names[0], names[1] + ' ' + names[2]] + else: + lastname, firstname = [names[2], names[0] + ' ' + names[1]] + return '{} {}'.format(lastname, firstname) first_names = name.split(' ') last_names = [] @@ -328,7 +351,7 @@ def get_sort_name(name): add_name() name = ' '.join(last_names) if len(first_names): - separator = ' ' if last_names[0].lower() in ASIAN_NAMES else ', ' + separator = ' ' if last_names[0].lower() in ASIAN_LAST_NAMES else ', ' name += separator + ' '.join(first_names) return name @@ -385,11 +408,14 @@ def find_string(string, string0='', string1=''): def is_asian_name(name): names = name.replace('-', ' ').lower().split(' ') - if len(names) != 3: - return False - if names[0] in ASIAN_FIRST_NAMES and names[1] in ASIAN_FIRST_NAMES: - return True - return False + return (len(names) == 2 and not '-' in name and ( + (names[0] in ASIAN_FIRST_NAMES and names[1] in ASIAN_LAST_NAMES) or + (names[0] in ASIAN_LAST_NAMES and names[1] in ASIAN_FIRST_NAMES) + ) or ( + len(names) == 3 and names[1] in ASIAN_FIRST_NAMES and ( + names[0] in ASIAN_FIRST_NAMES or names[2] in ASIAN_FIRST_NAMES + ) + ) def parse_useragent(useragent): data = {}