python-ox/ox/normalize.py

213 lines
6.2 KiB
Python
Raw Normal View History

2010-05-17 09:09:40 +00:00
# -*- coding: utf-8 -*-
2008-07-06 13:00:06 +00:00
# vi:si:et:sw=4:sts=4:ts=4
2010-05-17 09:09:40 +00:00
# GPL 2008
2008-04-29 16:08:15 +00:00
import re
2010-09-01 12:55:52 +00:00
import unicodedata
2008-04-28 09:35:20 +00:00
2015-12-25 15:08:55 +00:00
2008-04-28 09:35:20 +00:00
# Known leading articles, lower-cased.  Order matters: canonical_title()
# tries them front to back and stops at the first match.
_articles = (
    'the', 'la', 'a', 'die', 'der', 'le', 'el', "l'", 'il', 'das', 'les',
    'o', 'ein', 'i', 'un', 'los', 'de', 'an', 'una', 'las', 'eine', 'den',
    'gli', 'het', 'os', 'lo', 'az', 'det', 'ha-', 'een', 'ang', 'oi', 'ta',
    'al-', 'dem', 'mga', 'uno', "un'", 'ett', u'\xcf', 'eines', u'\xc7',
    'els', u'\xd4\xef', u'\xcf\xe9',
)
# Articles in a dictionary (each article maps to itself) for fast lookup.
_articlesDict = {entry: entry for entry in _articles}
# Articles as they appear at the start of a title: followed by a space,
# unless the article itself ends with an apostrophe or a hyphen.
_spArticles = []
for article in _articles:
    if not article.endswith(("'", '-')):
        article = article + ' '
    _spArticles.append(article)
# Lower-cased title prefixes that start with an article-looking word but
# must not be treated as "article + title".
_noarticles = (
    'los angeles',
    'i am ',
    'i be area',
    'i call ',
    'i come ',
    'i confess',
    'i hired ',
    'i killed ',
    'i know ',
    'i live ',
    'i love',
    'i married',
    'i never',
    'i shot',
    'i start',
    'i was',
)
def canonical_title(title):
    """Return the title in the canonic format 'Movie Title, The'.

    The title is returned unchanged when its last ', '-separated part is
    already a known article (e.g. 'Movie Title, The'), when it starts with
    one of the ``_noarticles`` prefixes, or when it does not start with a
    known article at all.

    >>> canonical_title('The Movie Title')
    'Movie Title, The'

    >>> canonical_title('Los Angeles Plays Itself')
    'Los Angeles Plays Itself'
    """
    # str.split() always yields at least one element, so [-1] cannot fail.
    if title.split(', ')[-1].lower() in _articlesDict:
        return title
    ltitle = title.lower()
    # Prefixes that merely look like a leading article ("I Am ...").
    if ltitle.startswith(_noarticles):
        return title
    for article in _spArticles:
        if ltitle.startswith(article):
            lart = len(article)
            moved = '%s, %s' % (title[lart:], title[:lart])
            if article[-1] == ' ':
                # Drop the space that separated the article from the title.
                moved = moved[:-1]
            return moved
    return title
2008-04-28 09:35:20 +00:00
def normalize_title(title):
    """Return the title in the normal "The Title" format.

    Only a title whose last ', '-separated part is a known article is
    rewritten; anything else is returned unchanged.

    >>> normalize_title('Movie Title, The')
    'The Movie Title'
    """
    parts = title.split(', ')
    if len(parts) < 2:
        return title
    article = parts[-1]
    if article.lower() not in _articlesDict:
        return title
    # Articles ending in an apostrophe or hyphen attach directly to the title.
    glue = '' if article[-1] in ("'", '-') else ' '
    return '%s%s%s' % (article, glue, ', '.join(parts[:-1]))
2008-04-28 09:35:20 +00:00
def normalize_imdbid(imdbId):
    """Return 7 digit imdbId.

    A string is reduced to a run of seven digits found in it (and left
    unchanged when it contains none); an int is zero-padded to seven
    digits; any other value is returned as is.

    >>> normalize_imdbid('http://www.imdb.com/title/tt0159206/')
    '0159206'

    >>> normalize_imdbid(159206)
    '0159206'

    >>> normalize_imdbid('tt0159206')
    '0159206'
    """
    if isinstance(imdbId, str):
        # Raw strings: '\d' in a plain literal is an invalid escape sequence.
        # NOTE: the leading '.*' is greedy, so when more than seven digits
        # are present the right-most seven-digit window is the one kept.
        imdbId = re.sub(r'.*(\d{7}).*', r'\1', imdbId)
    elif isinstance(imdbId, int):
        imdbId = "%07d" % imdbId
    return imdbId
2008-04-29 16:08:15 +00:00
2008-05-06 11:16:33 +00:00
# Common particles in surnames.  canonical_name() treats a word from this
# list as the first word of a two-word surname ("Van Sant", "De Palma").
_sname_suffixes = (
    'al', 'ben', 'da', 'de', 'del', 'den', 'der', 'des', 'di', 'dos', 'du',
    'e', 'el', 'la', 'le', 'the', 'vom', 'von', 'van', 'y'
)


def canonical_name(name):
    """Return the given name in canonical "Surname, Name" format.

    It assumes that name is in the 'Name Surname' format.  Names that
    already contain ', ', the placeholder 'Unknown Director' and
    single-word names are returned unchanged.

    >>> canonical_name('Jean Luc Godard')
    'Godard, Jean Luc'

    >>> canonical_name('Ivan Ivanov-Vano')
    'Ivanov-Vano, Ivan'

    >>> canonical_name('Gus Van Sant')
    'Van Sant, Gus'

    >>> canonical_name('Brian De Palma')
    'De Palma, Brian'
    """
    # XXX: some statistics (over 1852406 names):
    #      - just a surname:                 51921
    #      - single surname, single name:    1792759
    #      - composed surname, composed name: 7726
    #      - composed surname, single name:  55623
    #        (2: 49259, 3: 5502, 4: 551)
    #      - single surname, composed name:  186604
    #        (2: 178315, 3: 6573, 4: 1219, 5: 352)
    if name in ('Unknown Director', ):
        return name
    # Don't convert names already in the canonical format.
    if ', ' in name:
        return name
    words = name.split(' ')
    count = len(words)
    if count < 2:
        return name
    if count == 2:
        # Just a name and a surname: how boring...
        return '%s, %s' % (words[1], words[0])
    lowered = [word.lower() for word in words]
    # Positions where a surname particle is looked for: the first word,
    # the next-to-last word and (for four or more words) the one before it.
    if count == 3:
        candidates = (0, count - 2)
    else:
        candidates = (0, count - 2, count - 3)
    for pos in candidates:
        if lowered[pos] in _sname_suffixes:
            # The surname is the particle plus the word right after it.
            surname = words[pos:pos + 2]
            remaining = words[:pos] + words[pos + 2:]
            # Handle the "Jr." after the name: it stays with the surname.
            if pos + 2 < count and lowered[pos + 2].startswith('jr'):
                surname.append(remaining.pop(pos))
            return '%s, %s' % (' '.join(surname), ' '.join(remaining))
    # No particle found: the last word alone is the surname.
    return '%s, %s' % (words[-1], ' '.join(words[:-1]))
def normalize_name(name):
    """Return a name in the normal "Name Surname" format.

    Only a name made of exactly two ', '-separated parts is rewritten;
    anything else is returned unchanged.

    >>> normalize_name('Godard, Jean Luc')
    'Jean Luc Godard'

    >>> normalize_name('Ivanov-Vano, Ivan')
    'Ivan Ivanov-Vano'

    >>> normalize_name('Van Sant, Gus')
    'Gus Van Sant'

    >>> normalize_name('De Palma, Brian')
    'Brian De Palma'
    """
    parts = name.split(', ')
    if len(parts) != 2:
        return name
    surname, given = parts
    return '%s %s' % (given, surname)
def normalize_path(path):
    """Return path with ':' and '/' replaced by '_'.

    A trailing '.' is replaced by '_' as well.
    """
    for forbidden in (':', '/'):
        path = path.replace(forbidden, '_')
    return path[:-1] + '_' if path.endswith('.') else path
def strip_accents(s):
    """Return s as text with combining accent marks removed.

    Byte strings are decoded as UTF-8 first.  The text is decomposed with
    Unicode NFD normalization and every character of category 'Mn'
    (nonspacing mark) is dropped.

    >>> strip_accents('caf\\xe9')
    'cafe'
    """
    # Only byte strings have decode(); checking for ``str`` here made every
    # text input raise AttributeError on Python 3.
    if isinstance(s, bytes):
        s = s.decode('utf-8')
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')