handle some more corner cases when sorting names

This commit is contained in:
rolux 2011-10-11 17:19:31 +02:00
parent e93484d110
commit 8b756b888a

View file

@ -18,21 +18,21 @@ ARTICLES = list(set([
# see http://en.wikipedia.org/wiki/List_of_common_Chinese_surnames # see http://en.wikipedia.org/wiki/List_of_common_Chinese_surnames
# and http://en.wikipedia.org/wiki/List_of_Korean_family_names # and http://en.wikipedia.org/wiki/List_of_Korean_family_names
ASIAN_NAMES = [ ASIAN_NAMES = [
'Chan', 'Chang', 'Chao', 'chan', 'chang', 'chao',
'Chen', 'Cheong', 'Cheung', 'chen', 'cheong', 'cheung',
'Chong', 'Choo', 'chong', 'choo',
'Chu', 'Chun', 'chu', 'chun',
'Hou', 'Hsieh', 'Hsu', 'Hu', 'Huang', 'hou', 'hsieh', 'hsu', 'hu', 'huang',
'Kuo', 'kuo',
'Li', 'Liang', 'Lin', 'Liu', 'li', 'liang', 'lin', 'liu',
'_Park', '_park',
'Sun', 'Sung', 'sun', 'sung',
'Tsao', 'tsao',
'Wang', 'Wong', 'wang', 'wong',
'Yang', 'Yeong', 'Yeung' 'yang', 'yeong', 'yeung'
] ]
PREFIXES = [ PREFIXES = [
'al', 'da', 'de', 'del', 'dem', 'den', 'der', 'di', 'du', 'al', 'da', 'de', 'del', 'dem', 'den', 'der', 'di', 'dos', 'du',
'e', 'el', 'la', 'the', 'van', 'vom', 'von', 'y', 'zu' 'e', 'el', 'la', 'the', 'van', 'vom', 'von', 'y', 'zu'
] ]
MIDFIXES = ['und'] MIDFIXES = ['und']
@ -62,12 +62,20 @@ def get_sort_name(name):
>>> get_sort_name('Bing Wang') >>> get_sort_name('Bing Wang')
'Wang Bing' 'Wang Bing'
>>> get_sort_name('The Queen of England')
'Queen of England, The'
>>> get_sort_name('Sham 69')
'Sham 69'
>>> get_sort_name('Scorsese, Martin') >>> get_sort_name('Scorsese, Martin')
'Scorsese, Martin' 'Scorsese, Martin'
""" """
if ', ' in name: if not ' ' in name or ', ' in name:
return name return name
if name.lower().startswith('the '):
return get_sort_title(name)
def add_name(): def add_name():
if len(first_names): if len(first_names):
last_names.insert(0, first_names.pop()) last_names.insert(0, first_names.pop())
@ -75,6 +83,8 @@ def get_sort_name(name):
return len(first_names) and first_names[-1].lower() in names return len(first_names) and first_names[-1].lower() in names
first_names = name.split(' ') first_names = name.split(' ')
last_names = [] last_names = []
if re.search('^[0-9]+$', first_names[-1]):
add_name()
if find_name(SUFFIXES): if find_name(SUFFIXES):
add_name() add_name()
add_name() add_name()
@ -83,8 +93,11 @@ def get_sort_name(name):
add_name() add_name()
while find_name(PREFIXES): while find_name(PREFIXES):
add_name() add_name()
separator = ' ' if last_names[0] in ASIAN_NAMES else ', ' name = ' '.join(last_names)
return separator.join([' '.join(last_names), ' '.join(first_names)]) if len(first_names):
separator = ' ' if last_names[0].lower() in ASIAN_NAMES else ', '
name += separator + ' '.join(first_names)
return name
def get_sort_title(title): def get_sort_title(title):
""" """