canonicalName and normalizeName
This commit is contained in:
parent
ea43810f2a
commit
4cfc111aef
1 changed files with 84 additions and 0 deletions
|
@ -75,3 +75,87 @@ def normalizeImdbId(imdbId):
|
|||
imdbId = "%07d" % imdbId
|
||||
return imdbId
|
||||
|
||||
|
||||
# Common surname particles ('van', 'de', 'von', ...).  Despite the
# historical "suffixes" in the identifier, canonicalName() treats each of
# these as the word that *starts* a composed surname ("Van Sant").
_sname_suffixes = ('de', 'la', 'der', 'den', 'del', 'y', 'da', 'van',
                   'e', 'von', 'the', 'di', 'du', 'el', 'al')


def canonicalName(name):
    """Return the given name in canonical "Surname, Name" format.

    It assumes that name is in the 'Name Surname' format.  A name that
    already contains ', ' is considered canonical and returned unchanged,
    and so is a name made of a single word (or an empty string).

    For names of three or more words, the first word and the words near
    the end are checked against the particles in _sname_suffixes; a
    matching particle and the word after it form the surname, together
    with a following word starting with 'jr' if there is one.  If no
    particle is found, the last word is taken as the surname.

    If the surname ends up using every word (nothing is left for the
    given name), only the surname is returned, without a trailing ', '.

    >>> canonicalName('Jean Luc Godard')
    'Godard, Jean Luc'

    >>> canonicalName('Ivan Ivanov-Vano')
    'Ivanov-Vano, Ivan'

    >>> canonicalName('Gus Van Sant')
    'Van Sant, Gus'

    >>> canonicalName('Brian De Palma')
    'De Palma, Brian'

    >>> canonicalName('Van Damme Jr.')
    'Van Damme Jr.'
    """
    # XXX: some statistics (over 1852406 names):
    #      - just a surname:                 51921
    #      - single surname, single name:    1792759
    #      - composed surname, composed name: 7726
    #      - composed surname, single name:  55623
    #        (2: 49259, 3: 5502, 4: 551)
    #      - single surname, composed name:  186604
    #        (2: 178315, 3: 6573, 4: 1219, 5: 352)

    # Don't convert names already in the canonical format.
    if ', ' in name:
        return name

    sname = name.split(' ')
    snl = len(sname)
    if snl < 2:
        # Just a surname (or an empty string): nothing to reorder.
        return name
    if snl == 2:
        # Just a name and a surname: how boring...
        return '%s, %s' % (sname[1], sname[0])

    lsname = [x.lower() for x in sname]
    if snl == 3:
        indexes = (0, snl - 2)
    else:
        indexes = (0, snl - 2, snl - 3)

    # Check for common surname prefixes at the beginning and near the end;
    # the first matching position (in the order above) wins.
    for index in indexes:
        if lsname[index] not in _sname_suffixes:
            continue
        # The surname is the particle plus the word that follows it
        # (index + 1 is always valid, since index <= snl - 2).
        end = index + 2
        # Handle the "Jr." after the surname.
        if end < snl and lsname[end].startswith('jr'):
            end += 1
        surn = ' '.join(sname[index:end])
        given = ' '.join(sname[:index] + sname[end:])
        if not given:
            # Every word belongs to the surname: don't emit a dangling
            # ', ' that normalizeName() would turn into a leading space.
            return surn
        return '%s, %s' % (surn, given)

    # No particle found: the last word is the surname.
    return '%s, %s' % (sname[-1], ' '.join(sname[:-1]))
|
||||
|
||||
def normalizeName(name):
    """Convert a canonical "Surname, Name" string to "Name Surname".

    The input is split on ', '.  Only when that yields exactly two pieces
    are they swapped and joined with a single space; any other input
    (no separator, or more than one) is returned as it was given.

    >>> normalizeName('Godard, Jean Luc')
    'Jean Luc Godard'

    >>> normalizeName('Ivanov-Vano, Ivan')
    'Ivan Ivanov-Vano'

    >>> normalizeName('Van Sant, Gus')
    'Gus Van Sant'

    >>> normalizeName('De Palma, Brian')
    'Brian De Palma'
    """
    pieces = name.split(', ')
    if len(pieces) != 2:
        # Not in "Surname, Name" form: leave it untouched.
        return name
    surname, given = pieces
    return '%s %s' % (given, surname)
|
||||
|
||||
|
|
Loading…
Reference in a new issue