python-ox/ox/normalize.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import re
import unicodedata


# Leading articles recognised by canonical_title() / normalize_title().
# NOTE(review): the non-ASCII entries are upper-case code points while titles
# are lower-cased before matching, so unicode titles never start with them --
# confirm whether they should be stored lower-cased.
_articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el',
             "l'", 'il', 'das', 'les', 'o', 'ein', 'i', 'un', 'los', 'de',
             'an', 'una', 'las', 'eine', 'den', 'gli', 'het', 'os', 'lo',
             'az', 'det', 'ha-', 'een', 'ang', 'oi', 'ta', 'al-', 'dem',
             'mga', 'uno', "un'", 'ett', u'\xcf', 'eines', u'\xc7', 'els',
             u'\xd4\xef', u'\xcf\xe9')

# Articles in a dictionary (article -> article), used for membership tests.
_articlesDict = {x: x for x in _articles}

# Articles as they appear at the start of a title: followed by a blank,
# unless the article itself ends with an apostrophe or a hyphen.
_spArticles = [
    art if art[-1] in ("'", '-') else art + ' '
    for art in _articles
]

# Lower-case title prefixes that look like an article but are not one.
_noarticles = (
    'los angeles',
    'i am ',
    'i be area',
    'i call ',
    'i come ',
    'i confess',
    'i hired ',
    'i killed ',
    'i know ',
    'i live ',
    'i love',
    'i married',
    'i never',
    'i shot',
    'i start',
    'i was',
)


def canonical_title(title):
    """Return the title in the canonic format 'Movie Title, The'.

    The title is returned unchanged when it already ends in ', <article>',
    when it consists of an article only, when it starts with one of the
    ``_noarticles`` prefixes, or when it does not start with a known article.

    >>> canonical_title('The Movie Title')
    'Movie Title, The'

    >>> canonical_title('Los Angeles Plays Itself')
    'Los Angeles Plays Itself'

    >>> canonical_title("L'avventura")
    "avventura, L'"
    """
    # Already canonical, or the whole title is just an article ('The', 'O').
    if title.split(', ')[-1].lower() in _articlesDict:
        return title
    ltitle = title.lower()
    # str.startswith accepts a tuple: one call covers every exception.
    if ltitle.startswith(_noarticles):
        return title
    for article in _spArticles:
        if not ltitle.startswith(article):
            continue
        lart = len(article)
        rest = title[lart:]
        if not rest.strip():
            # Nothing follows the article (e.g. 'The '): moving it would
            # produce the degenerate ', The', so leave the title alone.
            return title
        # Slice the original title so the article keeps its capitalisation.
        head = title[:lart]
        if article[-1] == ' ':
            # Drop the blank that separated the article from the title.
            head = head[:-1]
        return '%s, %s' % (rest, head)
    return title

def normalize_title(title):
    """Return the title in the normal "The Title" format.

    A trailing ', <article>' is moved back to the front of the title.  The
    article is joined with a blank unless it ends with an apostrophe or a
    hyphen.  Titles without a trailing article are returned unchanged.

    >>> normalize_title('Movie Title, The')
    'The Movie Title'
    """
    # Split on the last ', ' only: everything before it is the bare title.
    body, comma, candidate = title.rpartition(', ')
    if not comma:
        return title
    if candidate.lower() not in _articlesDict:
        return title
    glue = '' if candidate.endswith(("'", '-')) else ' '
    return candidate + glue + body

def normalize_imdbid(imdbId):
    """Return the numeric part of an IMDb id, zero-padded to 7 digits.

    For strings (ids such as 'tt0159206' or complete URLs) the last run of
    seven or more digits is returned; a string without such a run is returned
    unchanged.  Integers are zero-padded to at least seven digits.  Values of
    any other type are returned untouched.

    >>> normalize_imdbid('http://www.imdb.com/title/tt0159206/')
    '0159206'
    >>> normalize_imdbid(159206)
    '0159206'
    >>> normalize_imdbid('tt0159206')
    '0159206'
    >>> normalize_imdbid('tt10872600')
    '10872600'
    """
    if isinstance(imdbId, str):
        # The look-behind pins the capture to the start of a digit run, so
        # ids longer than seven digits are kept whole instead of being cut
        # down to their last seven digits by the greedy '.*'.
        imdbId = re.sub(r'.*(?<!\d)(\d{7,}).*', r'\1', imdbId)
    elif isinstance(imdbId, int):
        imdbId = "%07d" % imdbId
    return imdbId


# Common particles that introduce a compound surname ("Van Sant",
# "De Palma").  The historical "_suffixes" name is kept for compatibility.
_sname_suffixes = (
    'al', 'ben', 'da', 'de', 'del', 'den', 'der', 'des', 'di', 'dos', 'du',
    'e', 'el', 'la', 'le', 'the', 'vom', 'von', 'van', 'y'
)


def canonical_name(name):
    """Return the given name in canonical "Surname, Name" format.
    It assumes that name is in the 'Name Surname' format.

    Names that already contain ', ', the placeholder 'Unknown Director' and
    single-word names are returned unchanged.  When a surname particle from
    ``_sname_suffixes`` is found, the particle and the following word form
    the surname, together with a directly following word starting with 'jr'.
    Otherwise the last word is taken as the surname.

    >>> canonical_name('Jean Luc Godard')
    'Godard, Jean Luc'

    >>> canonical_name('Ivan Ivanov-Vano')
    'Ivanov-Vano, Ivan'

    >>> canonical_name('Gus Van Sant')
    'Van Sant, Gus'

    >>> canonical_name('Brian De Palma')
    'De Palma, Brian'
    """

    # XXX: some statistics (over 1852406 names):
    #      - just a surname:                 51921
    #      - single surname, single name:  1792759
    #      - composed surname, composed name: 7726
    #      - composed surname, single name:  55623
    #        (2: 49259, 3: 5502, 4: 551)
    #      - single surname, composed name: 186604
    #        (2: 178315, 3: 6573, 4: 1219, 5: 352)
    if name in ('Unknown Director', ):
        return name
    # Don't convert names already in the canonical format.
    if ', ' in name:
        return name
    words = name.split(' ')
    total = len(words)
    if total < 2:
        # Just a surname (or an empty string): nothing to reorder.
        return name
    if total == 2:
        # Just a name and a surname: how boring...
        return '%s, %s' % (words[1], words[0])
    lowered = [word.lower() for word in words]
    # Look for a particle at the beginning and near the end of the name.
    # With three words total - 3 equals 0, which is already covered.
    if total == 3:
        positions = (0, total - 2)
    else:
        positions = (0, total - 2, total - 3)
    for start in positions:
        if lowered[start] not in _sname_suffixes:
            continue
        # The surname is the particle plus the word that follows it...
        stop = start + 2
        # ...plus a directly following "Jr." marker, if there is one.
        if stop < total and lowered[stop].startswith('jr'):
            stop += 1
        surname = ' '.join(words[start:stop])
        given = ' '.join(words[:start] + words[stop:])
        if not given:
            # Every word belongs to the surname ('De Niro Jr.'): do not
            # emit a dangling ', ' separator.
            return surname
        return '%s, %s' % (surname, given)
    # No particle found: the last word is the surname.
    return '%s, %s' % (words[-1], ' '.join(words[:-1]))

def normalize_name(name):
    """Return a name in the normal "Name Surname" format.

    Only names containing exactly one ', ' separator are converted; anything
    else is returned unchanged.

    >>> normalize_name('Godard, Jean Luc')
    'Jean Luc Godard'

    >>> normalize_name('Ivanov-Vano, Ivan')
    'Ivan Ivanov-Vano'

    >>> normalize_name('Van Sant, Gus')
    'Gus Van Sant'

    >>> normalize_name('De Palma, Brian')
    'Brian De Palma'
    """
    surname, comma, given = name.partition(', ')
    if not comma:
        # No separator at all: not in "Surname, Name" form.
        return name
    if ', ' in given:
        # More than one separator: ambiguous, leave untouched.
        return name
    return ' '.join((given, surname))

def normalize_path(path):
    """Return path with ':' and '/' replaced by '_'.

    A trailing '.' is replaced by '_' as well.
    """
    for forbidden in (':', '/'):
        path = path.replace(forbidden, '_')
    return path[:-1] + '_' if path.endswith('.') else path

def strip_accents(s):
    """Return s with all accents removed.

    Byte strings are decoded as UTF-8 first.  The text is decomposed with
    NFD normalisation and every character of Unicode category 'Mn' is
    dropped.
    """
    # Only byte strings need decoding.  On Python 3 `str` is already text and
    # has no .decode(); on Python 2 `bytes` is an alias of `str`, so the
    # behaviour there is unchanged.
    if isinstance(s, bytes):
        s = s.decode('utf-8')
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )