better is_asian_name

This commit is contained in:
rlx 2017-08-02 19:46:40 +02:00
parent fdc68a85e2
commit 9b4cb6fe86

View file

@ -65,7 +65,7 @@ ASIAN_FIRST_NAMES = [
] ]
# see http://en.wikipedia.org/wiki/List_of_common_Chinese_surnames # see http://en.wikipedia.org/wiki/List_of_common_Chinese_surnames
# and http://en.wikipedia.org/wiki/List_of_Korean_family_names # and http://en.wikipedia.org/wiki/List_of_Korean_family_names
ASIAN_NAMES = [ ASIAN_LAST_NAMES = [
'chan', 'chang', 'chao', 'chan', 'chang', 'chao',
'chen', 'cheong', 'cheung', 'chen', 'cheong', 'cheung',
'chong', 'choo', 'chong', 'choo',
@ -306,11 +306,34 @@ def get_sort_name(name):
return len(first_names) and first_names[-1].lower() in names return len(first_names) and first_names[-1].lower() in names
if is_asian_name(name): if is_asian_name(name):
hyphen = -1
names = name.split(' ') names = name.split(' ')
if '-' in names[0]: if len(names) == 2:
firstnames = names[0].split('-')
names[0] = '-'.join([firstnames[0], firstnames[1].lower()]) for i, name in enumerate(names):
return names[-1] + ' ' + ' '.join(names[:-1]) if '-' in name:
return
names = name.replace('-').split(' ')
if len(names) == 2:
if names[0].lower() in ASIAN_LAST_NAMES:
lastname, firstname = names
else:
firstname, lastname = names
else:
names_ = name.split(' ')
if '-' in names_[0]:
lastname, firstname = [names[2], names[0] + '-' + names[1].lower()]
elif '-' in names_[1]:
lastname, firstname = [names[0], names[1] = '-' + names[2].lower()]
elif names[0] in ASIAN_FIRST_NAMES and names[2] not in ASIAN_FIRST_NAMES:
lastname, firstname = [names[2], names[0] + ' ' + names[1]]
elif names[0] not in ASIAN_FIRST_NAMES and names[2] in ASIAN_FIRST_NAMES:
lastname, firstname = [names[0], names[1] + ' ' + names[2]]
elif names[0] in ASIAN_LAST_NAMES:
lastname, firstname = [names[0], names[1] + ' ' + names[2]]
else:
lastname, firstname = [names[2], names[0] + ' ' + names[1]]
return '{} {}'.format(lastname, firstname)
first_names = name.split(' ') first_names = name.split(' ')
last_names = [] last_names = []
@ -328,7 +351,7 @@ def get_sort_name(name):
add_name() add_name()
name = ' '.join(last_names) name = ' '.join(last_names)
if len(first_names): if len(first_names):
separator = ' ' if last_names[0].lower() in ASIAN_NAMES else ', ' separator = ' ' if last_names[0].lower() in ASIAN_LAST_NAMES else ', '
name += separator + ' '.join(first_names) name += separator + ' '.join(first_names)
return name return name
@ -385,11 +408,14 @@ def find_string(string, string0='', string1=''):
def is_asian_name(name):
    """Return True if ``name`` matches one of the known Asian name patterns.

    The name is lowercased and split on spaces and hyphens, and the parts
    are looked up in the module-level ``ASIAN_FIRST_NAMES`` and
    ``ASIAN_LAST_NAMES`` lists:

    * two parts and no hyphen in ``name``: one part is a listed first name
      and the other a listed last name, in either order;
    * three parts: the middle part is a listed first name and at least one
      of the outer parts is a listed first name as well.

    Any other number of parts yields False.
    """
    parts = name.replace('-', ' ').lower().split(' ')
    if len(parts) == 2:
        # Two-part names are only accepted when they were separated by a
        # space; a hyphenated pair is rejected.
        if '-' in name:
            return False
        first, second = parts
        return (
            (first in ASIAN_FIRST_NAMES and second in ASIAN_LAST_NAMES)
            or (first in ASIAN_LAST_NAMES and second in ASIAN_FIRST_NAMES)
        )
    if len(parts) == 3:
        # The middle part must be a first name, paired with a first name on
        # either side (given name before or after the family name).
        return parts[1] in ASIAN_FIRST_NAMES and (
            parts[0] in ASIAN_FIRST_NAMES or parts[2] in ASIAN_FIRST_NAMES
        )
    return False
def parse_useragent(useragent): def parse_useragent(useragent):
data = {} data = {}