python-ox/ox/normalize.py

200 lines
6.1 KiB
Python
Raw Normal View History

2010-05-17 09:09:40 +00:00
# -*- coding: utf-8 -*-
2008-07-06 13:00:06 +00:00
# vi:si:et:sw=4:sts=4:ts=4
2010-05-17 09:09:40 +00:00
# GPL 2008
2008-04-29 16:08:15 +00:00
import re
2010-09-01 12:55:52 +00:00
import unicodedata
2008-04-28 09:35:20 +00:00
# Lowercase articles, in several languages, that may open a title.
# Entries ending in "'" or '-' attach directly to the following word.
# NOTE(review): the four u'\x..' entries contain uppercase characters, while
# every lookup in this module lowercases the title first, so they can never
# match.  They look like Greek articles mis-decoded from a single-byte
# encoding -- confirm the intended characters before changing them.
_articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el',
             "l'", 'il', 'das', 'les', 'o', 'ein', 'i', 'un', 'los', 'de',
             'an', 'una', 'las', 'eine', 'den', 'gli', 'het', 'os', 'lo',
             'az', 'det', 'ha-', 'een', 'ang', 'oi', 'ta', 'al-', 'dem',
             'mga', 'uno', "un'", 'ett', u'\xcf', 'eines', u'\xc7', 'els',
             u'\xd4\xef', u'\xcf\xe9')

# Articles in a dictionary (each article maps to itself) for fast membership
# tests.
_articlesDict = dict(zip(_articles, _articles))

# Articles exactly as they appear at the start of a title: followed by a
# space unless the article already ends with an apostrophe or a hyphen.
# Same order as _articles.
_spArticles = [
    art if art[-1] in ("'", '-') else art + ' '
    for art in _articles
]
2008-04-28 09:35:20 +00:00
# Lowercase title prefixes whose first word is in the article list ('los',
# 'i') but is not acting as an article there, e.g. 'Los Angeles ...' or
# 'I Am ...'.  canonicalTitle() returns titles starting with any of these
# unchanged.  Entries with a trailing space only match a whole word.
_noarticles = (
'los angeles',
'i am ',
'i be area',
'i call ',
'i come ',
'i confess',
'i hired ',
'i killed ',
'i know ',
'i live ',
'i love',
'i married',
'i never',
'i shot',
'i start',
'i was',
)
2008-04-28 09:35:20 +00:00
def canonicalTitle(title):
    """Return the title in the canonical format 'Movie Title, The'.

    The title is returned unchanged when the text after its last ', ' is a
    known article (it is considered canonical already), when it starts with
    one of the ``_noarticles`` prefixes, or when it does not start with a
    known article at all.

    >>> canonicalTitle('The Movie Title')
    'Movie Title, The'
    >>> canonicalTitle('Los Angeles Plays Itself')
    'Los Angeles Plays Itself'
    """
    # str.split always yields at least one element, so [-1] cannot fail.
    if title.split(', ')[-1].lower() in _articlesDict:
        return title
    ltitle = title.lower()
    # Prefixes such as 'los angeles' only look like a leading article.
    if ltitle.startswith(_noarticles):
        return title
    for article in _spArticles:
        if ltitle.startswith(article):
            lart = len(article)
            # Slice the original title so the article keeps its casing.
            title = '%s, %s' % (title[lart:], title[:lart])
            # Drop the space that separated the article from the title.
            if article[-1] == ' ':
                title = title[:-1]
            break
    return title
2008-04-28 09:35:20 +00:00
def normalizeTitle(title):
    """Return the title in the normal "The Title" format.

    Only titles whose last ', '-separated part is a known article are
    changed; anything else is returned as given.

    >>> normalizeTitle('Movie Title, The')
    'The Movie Title'
    """
    stitle = title.split(', ')
    if len(stitle) > 1 and stitle[-1].lower() in _articlesDict:
        article = stitle[-1]
        # Articles ending in an apostrophe or hyphen attach directly to the
        # following word; all others are separated by a space.
        sep = '' if article[-1] in ("'", '-') else ' '
        title = '%s%s%s' % (article, sep, ', '.join(stitle[:-1]))
    return title
2008-04-28 09:35:20 +00:00
2008-04-29 16:08:15 +00:00
def normalizeImdbId(imdbId):
    """Return 7 digit imdbId.

    A string that contains a run of seven digits is reduced to those digits;
    because the leading ``.*`` is greedy, the last seven-digit window wins.
    A string without such a run is returned unchanged.  An int is
    zero-padded to seven digits.  Any other value is returned as is.

    >>> normalizeImdbId('http://www.imdb.com/title/tt0159206/')
    '0159206'
    >>> normalizeImdbId(159206)
    '0159206'
    >>> normalizeImdbId('tt0159206')
    '0159206'
    """
    # (str, type(u'')) is (str, unicode) on Python 2 and (str, str) on
    # Python 3, so this works without the Python 2-only ``basestring``.
    if isinstance(imdbId, (str, type(u''))):
        imdbId = re.sub(r'.*(\d{7}).*', r'\1', imdbId)
    elif isinstance(imdbId, int):
        imdbId = '%07d' % imdbId
    return imdbId
2008-04-29 16:08:15 +00:00
2008-05-06 11:16:33 +00:00
# Lowercase particles that commonly introduce a compound surname
# ('Van Sant', 'De Palma').
_sname_suffixes = ('de', 'la', 'der', 'den', 'del', 'y', 'da', 'van',
                   'e', 'von', 'vom', 'the', 'di', 'du', 'el', 'al')


def canonicalName(name):
    """Return the given name in canonical "Surname, Name" format.

    It assumes that name is in the 'Name Surname' format.  Names that
    already contain ', ', single-word names and the placeholder
    'Unknown Director' are returned unchanged.  If the surname heuristic
    consumes every word, the name is returned without a dangling ', '.

    >>> canonicalName('Jean Luc Godard')
    'Godard, Jean Luc'
    >>> canonicalName('Ivan Ivanov-Vano')
    'Ivanov-Vano, Ivan'
    >>> canonicalName('Gus Van Sant')
    'Van Sant, Gus'
    >>> canonicalName('Brian De Palma')
    'De Palma, Brian'
    """
    # XXX: some statistics (over 1852406 names):
    #      - just a surname:                 51921
    #      - single surname, single name:  1792759
    #      - composed surname, composed name: 7726
    #      - composed surname, single name:  55623
    #        (2: 49259, 3: 5502, 4: 551)
    #      - single surname, composed name: 186604
    #        (2: 178315, 3: 6573, 4: 1219, 5: 352)
    if name == 'Unknown Director':
        return name
    # Don't convert names already in the canonical format.
    if ', ' in name:
        return name
    sname = name.split(' ')
    snl = len(sname)
    if snl < 2:
        return name
    if snl == 2:
        # Just a name and a surname: how boring...
        return '%s, %s' % (sname[1], sname[0])
    lsname = [x.lower() for x in sname]
    # Check for common surname particles at the beginning and near the end.
    if snl == 3:
        indexes = (0, snl - 2)
    else:
        indexes = (0, snl - 2, snl - 3)
    for index in indexes:
        if lsname[index] not in _sname_suffixes:
            continue
        # The surname is the particle plus the following word
        # (index + 1 always exists because index <= snl - 2) ...
        end = index + 2
        # ... plus a "Jr." directly after it, when present.
        if end < snl and lsname[end].startswith('jr'):
            end += 1
        surname = ' '.join(sname[index:end])
        given = sname[:index] + sname[end:]
        if not given:
            # Every word went into the surname: avoid a dangling ', '.
            return surname
        return '%s, %s' % (surname, ' '.join(given))
    # No particle found: the last word is the surname.
    return '%s, %s' % (sname[-1], ' '.join(sname[:-1]))
def normalizeName(name):
    """Return a name in the normal "Name Surname" format.

    Only names made of exactly two ', '-separated parts are swapped; any
    other name is handed back untouched.

    >>> normalizeName('Godard, Jean Luc')
    'Jean Luc Godard'
    >>> normalizeName('Ivanov-Vano, Ivan')
    'Ivan Ivanov-Vano'
    >>> normalizeName('Van Sant, Gus')
    'Gus Van Sant'
    >>> normalizeName('De Palma, Brian')
    'Brian De Palma'
    """
    parts = name.split(', ')
    if len(parts) != 2:
        return name
    surname, given = parts
    return '%s %s' % (given, surname)
2010-09-01 11:31:18 +00:00
def normalizePath(path):
    """Return path with ':' and '/' replaced by '_'.

    A single trailing '.' is replaced by '_' as well.
    NOTE(review): presumably this makes the string usable as one path
    component on common filesystems -- confirm against the callers.
    """
    for forbidden in (':', '/'):
        path = path.replace(forbidden, '_')
    if path[-1:] == '.':
        path = path[:-1] + '_'
    return path
2010-09-01 12:55:52 +00:00
def stripAccents(s):
    """Return s with combining marks (accents) removed.

    The text is decomposed with Unicode NFD normalization and every
    character in category 'Mn' (nonspacing mark) is dropped.  Byte strings
    are decoded as ASCII first, so non-ASCII bytes raise UnicodeDecodeError.

    >>> stripAccents(u'caf\\xe9') == u'cafe'
    True
    """
    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3, which
    # avoids the Python 2-only ``unicode`` builtin.
    text_type = type(u'')
    if not isinstance(s, text_type):
        s = text_type(s, 'ascii')
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')