python-ox/ox/normalize.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import re

# Leading articles, in several languages, recognised at the start of a title.
# Entries ending in "'" or '-' attach directly to the following word.
# NOTE(review): the non-ASCII entries look like Greek articles stored as
# single-byte code points (ISO-8859-7?) -- confirm the intended encoding.
_articles = (
    'the', 'la', 'a', 'die', 'der', 'le', 'el', "l'", 'il', 'das', 'les',
    'o', 'ein', 'i', 'un', 'los', 'de', 'an', 'una', 'las', 'eine', 'den',
    'gli', 'het', 'os', 'lo', 'az', 'det', 'ha-', 'een', 'ang', 'oi', 'ta',
    'al-', 'dem', 'mga', 'uno', "un'", 'ett', u'\xcf', 'eines', u'\xc7',
    'els', u'\xd4\xef', u'\xcf\xe9',
)

# Identity mapping of every article, used to test a bare word for membership.
_articlesDict = dict(zip(_articles, _articles))

# The articles as they appear in front of a title: a trailing space is
# appended unless the article already ends with an apostrophe or a hyphen.
_spArticles = []
for article in _articles:
    if not article.endswith(("'", '-')):
        article = article + ' '
    _spArticles.append(article)

def canonicalTitle(title):
    """Return the title in the canonic format 'Movie Title, The'.

    If the title starts with one of the articles in ``_spArticles``
    (compared case-insensitively), that article is moved to the end,
    separated by ', '.  The title is returned unchanged when its last
    ', '-separated component is already a known article (this includes a
    title that consists of nothing but an article) or when it does not
    start with a known article.

    >>> canonicalTitle('The Movie Title')
    'Movie Title, The'
    """
    # Already canonical: the part after the last ', ' is a known article.
    # str.split() always returns at least one element, so [-1] is safe.
    if title.split(', ')[-1].lower() in _articlesDict:
        return title
    ltitle = title.lower()
    for article in _spArticles:
        if ltitle.startswith(article):
            lart = len(article)
            # Slice the original title so the article keeps its casing.
            title = '%s, %s' % (title[lart:], title[:lart])
            # Space-terminated articles carry that space to the end of the
            # result; drop it.  "l'" / "ha-" style articles have none.
            if article[-1] == ' ':
                title = title[:-1]
            break
    return title

def normalizeTitle(title):
    """Return the title in the normal "The Title" format.

    This is the inverse of the 'Movie Title, The' form: when the last
    ', '-separated component of the title is a known article (looked up
    case-insensitively in ``_articlesDict``), it is moved to the front.
    Any other title is returned unchanged.

    >>> normalizeTitle('Movie Title, The')
    'The Movie Title'
    """
    stitle = title.split(', ')
    article = stitle[-1]
    # An empty last component is never in the dictionary, so indexing
    # article[-1] below cannot fail.
    if len(stitle) > 1 and article.lower() in _articlesDict:
        # Articles ending in an apostrophe or hyphen attach directly to the
        # next word; every other article is followed by a space.
        sep = '' if article[-1] in ("'", '-') else ' '
        title = '%s%s%s' % (article, sep, ', '.join(stitle[:-1]))
    return title

try:
    _string_types = basestring  # Python 2: matches both str and unicode
except NameError:
    _string_types = str  # Python 3: basestring no longer exists


def normalizeImdbId(imdbId):
    """Return 7 digit imdbId.

    A string containing a run of seven digits (a bare id, a 'tt'-prefixed
    id or a URL) is reduced to those seven digits; a string without such a
    run is returned unchanged.  An integer is zero-padded to seven digits.
    Values of any other type are returned as they are.

    NOTE(review): because the leading '.*' is greedy, a longer digit run
    is reduced to its last seven digits -- confirm whether ids with more
    than seven digits need to be supported.

    >>> normalizeImdbId('http://www.imdb.com/title/tt0159206/')
    '0159206'
    >>> normalizeImdbId(159206)
    '0159206'
    >>> normalizeImdbId('tt0159206')
    '0159206'
    """
    if isinstance(imdbId, _string_types):
        # Raw strings avoid the invalid-escape warning for '\d'.
        imdbId = re.sub(r'.*(\d{7}).*', r'\1', imdbId)
    elif isinstance(imdbId, int):
        imdbId = "%07d" % imdbId
    return imdbId


# Common particles of composed surnames ("Van Sant", "De Palma").  Despite
# the historical name of this tuple, canonicalName() treats each entry as
# the word that *starts* the surname.
_sname_suffixes = ('de', 'la', 'der', 'den', 'del', 'y', 'da', 'van',
                    'e', 'von', 'vom', 'the', 'di', 'du', 'el', 'al')

def canonicalName(name):
    """Return the given name in canonical "Surname, Name" format.
    It assumes that name is in the 'Name Surname' format.

    Names that already contain ', ' and names made of a single word (or
    the empty string) are returned unchanged.  For names of three or more
    words, a surname particle from ``_sname_suffixes`` found at the first,
    the second-to-last or (for four or more words) the third-to-last
    position starts a composed surname made of the particle and the word
    after it, plus a following word starting with "jr" if there is one.
    If no particle is found, the last word is taken as the surname.  When
    the composed surname uses up every word, it is returned on its own,
    without a trailing ', '.

    >>> canonicalName('Jean Luc Godard')
    'Godard, Jean Luc'

    >>> canonicalName('Ivan Ivanov-Vano')
    'Ivanov-Vano, Ivan'

    >>> canonicalName('Gus Van Sant')
    'Van Sant, Gus'

    >>> canonicalName('Brian De Palma')
    'De Palma, Brian'
    """

    # XXX: some statistics (over 1852406 names):
    #      - just a surname:                 51921
    #      - single surname, single name:  1792759
    #      - composed surname, composed name: 7726
    #      - composed surname, single name:  55623
    #        (2: 49259, 3: 5502, 4: 551)
    #      - single surname, composed name: 186604
    #        (2: 178315, 3: 6573, 4: 1219, 5: 352)
    # Don't convert names already in the canonical format.
    if ', ' in name:
        return name
    sname = name.split(' ')
    snl = len(sname)
    if snl < 2:
        # Empty string or a lone surname: nothing to reorder.
        return name
    if snl == 2:
        # Just a name and a surname.
        return '%s, %s' % (sname[1], sname[0])
    lsname = [x.lower() for x in sname]
    if snl == 3:
        indexes = (0, snl - 2)
    else:
        indexes = (0, snl - 2, snl - 3)
    # Check for common surname prefixes at the beginning and near the end.
    for index in indexes:
        if lsname[index] not in _sname_suffixes:
            continue
        # The surname is the particle plus the word after it; index is at
        # most snl - 2, so that word always exists.
        end = index + 2
        # Handle the "Jr." after the name.
        if end < snl and lsname[end].startswith('jr'):
            end += 1
        surn = ' '.join(sname[index:end])
        given = sname[:index] + sname[end:]
        if not given:
            # Every word went into the surname: avoid a dangling ', '.
            return surn
        return '%s, %s' % (surn, ' '.join(given))
    # No particle found: the last word is the surname.
    return '%s, %s' % (sname[-1], ' '.join(sname[:-1]))

def normalizeName(name):
    """Return a name in the normal "Name Surname" format.

    The name is split on ', '.  When that yields exactly two parts
    ("Surname, Name"), they are swapped and joined with a single space;
    any other input is returned untouched.

    >>> normalizeName('Godard, Jean Luc')
    'Jean Luc Godard'

    >>> normalizeName('Ivanov-Vano, Ivan')
    'Ivan Ivanov-Vano'

    >>> normalizeName('Van Sant, Gus')
    'Gus Van Sant'

    >>> normalizeName('De Palma, Brian')
    'Brian De Palma'
    """
    parts = name.split(', ')
    if len(parts) != 2:
        # Not in "Surname, Name" form: leave as is.
        return name
    surname, given = parts
    return given + ' ' + surname
add normalize title 2008-04-28 09:35:20 +00:00			`# -*- coding: utf-8 -*-`
move and rename some 2008-07-06 13:00:06 +00:00			`# vi:si:et:sw=4:sts=4:ts=4`
			`# GPL 2008`
add normalizeImdbId 2008-04-29 16:08:15 +00:00			`import re`
add normalize title 2008-04-28 09:35:20 +00:00
			`_articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el',`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`"l'", 'il', 'das', 'les', 'o', 'ein', 'i', 'un', 'los', 'de',`
			`'an', 'una', 'las', 'eine', 'den', 'gli', 'het', 'os', 'lo',`
			`'az', 'det', 'ha-', 'een', 'ang', 'oi', 'ta', 'al-', 'dem',`
			`'mga', 'uno', "un'", 'ett', u'\xcf', 'eines', u'\xc7', 'els',`
			`u'\xd4\xef', u'\xcf\xe9')`
add normalize title 2008-04-28 09:35:20 +00:00
			`# Articles in a dictionary.`
			`_articlesDict = dict([(x, x) for x in _articles])`
			`_spArticles = []`
			`for article in _articles:`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`if article[-1] not in ("'", '-'): article += ' '`
			`_spArticles.append(article)`
add normalize title 2008-04-28 09:35:20 +00:00
			`def canonicalTitle(title):`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`"""Return the title in the canonic format 'Movie Title, The'.`

			`>>> canonicalTitle('The Movie Title')`
			`'Movie Title, The'`
			`"""`
			`try:`
			`if _articlesDict.has_key(title.split(', ')[-1].lower()): return title`
			`except IndexError: pass`
			`ltitle = title.lower()`
			`for article in _spArticles:`
			`if ltitle.startswith(article):`
			`lart = len(article)`
			`title = '%s, %s' % (title[lart:], title[:lart])`
			`if article[-1] == ' ': title = title[:-1]`
			`break`
			`## XXX: an attempt using a dictionary lookup.`
			`##for artSeparator in (' ', "'", '-'):`
			`## article = _articlesDict.get(ltitle.split(artSeparator)[0])`
			`## if article is not None:`
			`## lart = len(article)`
			`## # check titles like "una", "I'm Mad" and "L'abbacchio".`
			`## if title[lart:] == '' or (artSeparator != ' ' and`
			`## title[lart:][1] != artSeparator): continue`
			`## title = '%s, %s' % (title[lart:], title[:lart])`
			`## if artSeparator == ' ': title = title[1:]`
			`## break`
			`return title`
add normalize title 2008-04-28 09:35:20 +00:00
			`def normalizeTitle(title):`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`"""Return the title in the normal "The Title" format.`
add test and cleanup some errors found while doing so 2008-05-05 18:12:27 +00:00
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`>>> normalizeTitle('Movie Title, The')`
			`'The Movie Title'`
			`"""`
			`stitle = title.split(', ')`
			`if len(stitle) > 1 and _articlesDict.has_key(stitle[-1].lower()):`
			`sep = ' '`
			`if stitle[-1][-1] in ("'", '-'): sep = ''`
			`title = '%s%s%s' % (stitle[-1], sep, ', '.join(stitle[:-1]))`
			`return title`
add normalize title 2008-04-28 09:35:20 +00:00
add normalizeImdbId 2008-04-29 16:08:15 +00:00			`def normalizeImdbId(imdbId):`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`"""Return 7 digit imdbId.`
add test and cleanup some errors found while doing so 2008-05-05 18:12:27 +00:00
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`>>> normalizeImdbId('http://www.imdb.com/title/tt0159206/')`
			`'0159206'`
			`>>> normalizeImdbId(159206)`
			`'0159206'`
			`>>> normalizeImdbId('tt0159206')`
			`'0159206'`
			`"""`
			`if isinstance(imdbId, basestring):`
			`imdbId = re.sub('.*(\d{7}).*', '\\1', imdbId)`
			`elif isinstance(imdbId, int):`
			`imdbId = "%07d" % imdbId`
			`return imdbId`
add normalizeImdbId 2008-04-29 16:08:15 +00:00
canonicalName and normalizeName 2008-05-06 11:16:33 +00:00
			`# Common suffixes in surnames.`
			`_sname_suffixes = ('de', 'la', 'der', 'den', 'del', 'y', 'da', 'van',`
vom possible name suffix 2009-08-02 18:25:03 +00:00			`'e', 'von', 'vom', 'the', 'di', 'du', 'el', 'al')`
canonicalName and normalizeName 2008-05-06 11:16:33 +00:00
			`def canonicalName(name):`
			`"""Return the given name in canonical "Surname, Name" format.`
			`It assumes that name is in the 'Name Surname' format.`

			`>>> canonicalName('Jean Luc Godard')`
			`'Godard, Jean Luc'`

			`>>> canonicalName('Ivan Ivanov-Vano')`
			`'Ivanov-Vano, Ivan'`

			`>>> canonicalName('Gus Van Sant')`
			`'Van Sant, Gus'`

			`>>> canonicalName('Brian De Palma')`
			`'De Palma, Brian'`
			`"""`

			`# XXX: some statistics (over 1852406 names):`
			`# - just a surname: 51921`
			`# - single surname, single name: 1792759`
			`# - composed surname, composed name: 7726`
			`# - composed surname, single name: 55623`
			`# (2: 49259, 3: 5502, 4: 551)`
			`# - single surname, composed name: 186604`
			`# (2: 178315, 3: 6573, 4: 1219, 5: 352)`
			`# Don't convert names already in the canonical format.`
			`if name.find(', ') != -1: return name`
			`sname = name.split(' ')`
			`snl = len(sname)`
			`if snl == 2:`
			`# Just a name and a surname: how boring...`
			`name = '%s, %s' % (sname[1], sname[0])`
			`elif snl > 2:`
			`lsname = [x.lower() for x in sname]`
			`if snl == 3: _indexes = (0, snl-2)`
			`else: _indexes = (0, snl-2, snl-3)`
			`# Check for common surname prefixes at the beginning and near the end.`
			`for index in _indexes:`
			`if lsname[index] not in _sname_suffixes: continue`
			`try:`
			`# Build the surname.`
			`surn = '%s %s' % (sname[index], sname[index+1])`
			`del sname[index]`
			`del sname[index]`
			`try:`
			`# Handle the "Jr." after the name.`
			`if lsname[index+2].startswith('jr'):`
			`surn += ' %s' % sname[index]`
			`del sname[index]`
			`except (IndexError, ValueError):`
			`pass`
			`name = '%s, %s' % (surn, ' '.join(sname))`
			`break`
			`except ValueError:`
			`continue`
			`else:`
			`name = '%s, %s' % (sname[-1], ' '.join(sname[:-1]))`
			`return name`

			`def normalizeName(name):`
			`"""Return a name in the normal "Name Surname" format.`

			`>>> normalizeName('Godard, Jean Luc')`
			`'Jean Luc Godard'`

			`>>> normalizeName('Ivanov-Vano, Ivan')`
			`'Ivan Ivanov-Vano'`

			`>>> normalizeName('Van Sant, Gus')`
			`'Gus Van Sant'`

			`>>> normalizeName('De Palma, Brian')`
			`'Brian De Palma'`
			`"""`
			`sname = name.split(', ')`
			`if len(sname) == 2:`
			`name = '%s %s' % (sname[1], sname[0])`
			`return name`