python-ox/ox/normalize.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import re
import unicodedata

# Lowercase leading articles.  canonicalTitle looks for them at the start of
# a title and normalizeTitle looks for them after the last ', ' of a title.
# Entries ending in "'" or '-' are written directly against the following
# word; every other entry is followed by a space.
# NOTE(review): the u'\x..' entries contain uppercase code points, while both
# functions compare against lowercased text -- TODO confirm they can match.
_articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el',
             "l'", 'il', 'das', 'les', 'o', 'ein', 'i', 'un', 'los', 'de',
             'an', 'una', 'las', 'eine', 'den', 'gli', 'het', 'os', 'lo',
             'az', 'det', 'ha-', 'een', 'ang', 'oi', 'ta', 'al-', 'dem',
             'mga', 'uno', "un'", 'ett', u'\xcf', 'eines', u'\xc7', 'els',
             u'\xd4\xef', u'\xcf\xe9')

# Articles in a dictionary (each article maps to itself); used for
# membership tests on a bare article.
_articlesDict = dict([(x, x) for x in _articles])
# Articles as they appear at the start of a title: a trailing space is
# appended unless the article already ends in "'" or '-'.
_spArticles = []
for article in _articles:
    if article[-1] not in ("'", '-'): article += ' '
    _spArticles.append(article)

# Lowercase title prefixes that canonicalTitle must leave alone: a title
# starting with one of these is returned unchanged even though its first
# word ('los', 'i') is also listed in _articles.
_noarticles = (
    'los angeles',
    'i am ',
    'i be area',
    'i call ',
    'i come ',
    'i confess',
    'i hired ',
    'i killed ',
    'i know ',
    'i live ',
    'i love',
    'i married',
    'i never',
    'i shot',
    'i start',
    'i was',
)

def canonicalTitle(title):
    """Return the title in the canonical format 'Movie Title, The'.

    The title is returned unchanged when it already ends in ', <article>',
    when it starts with one of the ``_noarticles`` prefixes, or when it does
    not start with a known article.

    >>> canonicalTitle('The Movie Title')
    'Movie Title, The'

    >>> canonicalTitle('Los Angeles Plays Itself')
    'Los Angeles Plays Itself'
    """
    # Already canonical: the text after the last ', ' is a bare article.
    # str.split always yields at least one element, so [-1] cannot fail.
    if title.split(', ')[-1].lower() in _articlesDict:
        return title
    ltitle = title.lower()
    # Known false positives such as "I Am ..." or "Los Angeles ...".
    if ltitle.startswith(_noarticles):
        return title
    for article in _spArticles:
        if not ltitle.startswith(article):
            continue
        lart = len(article)
        leading = title[:lart]
        # Drop the separating space; "l'" and 'ha-' style articles have
        # none and are kept whole.
        if article[-1] == ' ':
            leading = leading[:-1]
        return '%s, %s' % (title[lart:], leading)
    return title

def normalizeTitle(title):
    """Return the title in the normal "The Title" format.

    Only the text after the last ', ' is considered; when it is not a known
    article the title is returned unchanged.

    >>> normalizeTitle('Movie Title, The')
    'The Movie Title'
    """
    rest, comma, article = title.rpartition(', ')
    # No ', ' at all, or the trailing part is not an article: nothing to do.
    if not comma or article.lower() not in _articlesDict:
        return title
    # Articles such as "l'" or 'ha-' are written against the next word.
    joiner = '' if article[-1] in ("'", '-') else ' '
    return '%s%s%s' % (article, joiner, rest)

def normalizeImdbId(imdbId):
    """Return 7 digit imdbId.

    Strings are reduced to the rightmost seven consecutive digits they
    contain; a string without seven consecutive digits is returned as is.
    Integers are zero-padded to seven digits.  Any other value is returned
    unchanged.

    >>> normalizeImdbId('http://www.imdb.com/title/tt0159206/')
    '0159206'
    >>> normalizeImdbId(159206)
    '0159206'
    >>> normalizeImdbId('tt0159206')
    '0159206'
    """
    try:
        string_types = basestring  # Python 2: covers str and unicode
    except NameError:
        string_types = str  # Python 3
    if isinstance(imdbId, string_types):
        imdbId = re.sub(r'.*(\d{7}).*', r'\1', imdbId)
    elif isinstance(imdbId, int):
        imdbId = "%07d" % imdbId
    return imdbId


# Lowercase particles that can begin a compound surname ('van' in
# "Van Sant", 'de' in "De Palma").  Despite the name, canonicalName matches
# them as the first word of the surname and keeps them together with the
# word that follows.
_sname_suffixes = ('de', 'la', 'der', 'den', 'del', 'y', 'da', 'van',
                    'e', 'von', 'vom', 'the', 'di', 'du', 'el', 'al')

def canonicalName(name):
    """Convert a 'Name Surname' string to the canonical "Surname, Name" form.

    Names that already contain ', ', single-word names and the placeholder
    'Unknown Director' are returned unchanged.

    >>> canonicalName('Jean Luc Godard')
    'Godard, Jean Luc'

    >>> canonicalName('Ivan Ivanov-Vano')
    'Ivanov-Vano, Ivan'

    >>> canonicalName('Gus Van Sant')
    'Van Sant, Gus'

    >>> canonicalName('Brian De Palma')
    'De Palma, Brian'
    """
    # XXX: some statistics (over 1852406 names):
    #      - just a surname:                 51921
    #      - single surname, single name:  1792759
    #      - composed surname, composed name: 7726
    #      - composed surname, single name:  55623
    #        (2: 49259, 3: 5502, 4: 551)
    #      - single surname, composed name: 186604
    #        (2: 178315, 3: 6573, 4: 1219, 5: 352)
    if name in ('Unknown Director', ):
        return name
    # Already in canonical format.
    if ', ' in name:
        return name
    words = name.split(' ')
    total = len(words)
    if total < 2:
        return name
    if total == 2:
        # Plain "Name Surname".
        return '%s, %s' % (words[1], words[0])
    lowered = [word.lower() for word in words]
    # Positions where a surname particle is looked for: the first word and
    # the word(s) just before the last one.
    if total == 3:
        candidates = (0, total - 2)
    else:
        candidates = (0, total - 2, total - 3)
    for pos in candidates:
        if lowered[pos] not in _sname_suffixes:
            continue
        # The particle and the following word form the surname.
        surname = '%s %s' % (words[pos], words[pos + 1])
        del words[pos:pos + 2]
        # A "Jr." right after the surname belongs to it as well.
        if pos + 2 < total and lowered[pos + 2].startswith('jr'):
            surname += ' %s' % words[pos]
            del words[pos]
        return '%s, %s' % (surname, ' '.join(words))
    # No particle found: the last word is the surname.
    return '%s, %s' % (words[-1], ' '.join(words[:-1]))

def normalizeName(name):
    """Convert a canonical "Surname, Name" string back to "Name Surname".

    A name that does not consist of exactly two ', '-separated parts is
    returned unchanged.

    >>> normalizeName('Godard, Jean Luc')
    'Jean Luc Godard'

    >>> normalizeName('Ivanov-Vano, Ivan')
    'Ivan Ivanov-Vano'

    >>> normalizeName('Van Sant, Gus')
    'Gus Van Sant'

    >>> normalizeName('De Palma, Brian')
    'Brian De Palma'
    """
    pieces = name.split(', ')
    if len(pieces) != 2:
        return name
    surname, given = pieces
    return '%s %s' % (given, surname)

def normalizePath(path):
    """Return path with every ':' and '/' replaced by '_'.

    A trailing '.' is replaced by '_' as well.
    """
    for char in (':', '/'):
        path = path.replace(char, '_')
    if path[-1:] == '.':
        return path[:-1] + '_'
    return path

def stripAccents(s):
    """Return s as text with all combining marks (accents) removed.

    The text is decomposed with Unicode NFD normalisation and every code
    point of category 'Mn' is dropped.  Byte strings are decoded as UTF-8
    first, so this works on Python 2 ``str`` as well as Python 3 ``bytes``.

    >>> stripAccents('abc') == 'abc'
    True
    """
    if isinstance(s, bytes):
        s = s.decode('utf-8')
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
known unknowns 2010-05-17 09:09:40 +00:00			`# -- coding: utf-8 --`
move and rename some 2008-07-06 13:00:06 +00:00			`# vi:si:et:sw=4:sts=4:ts=4`
known unknowns 2010-05-17 09:09:40 +00:00			`# GPL 2008`
add normalizeImdbId 2008-04-29 16:08:15 +00:00			`import re`
stripAccents 2010-09-01 12:55:52 +00:00			`import unicodedata`
add normalize title 2008-04-28 09:35:20 +00:00
			`_articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el',`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`"l'", 'il', 'das', 'les', 'o', 'ein', 'i', 'un', 'los', 'de',`
			`'an', 'una', 'las', 'eine', 'den', 'gli', 'het', 'os', 'lo',`
			`'az', 'det', 'ha-', 'een', 'ang', 'oi', 'ta', 'al-', 'dem',`
			`'mga', 'uno', "un'", 'ett', u'\xcf', 'eines', u'\xc7', 'els',`
			`u'\xd4\xef', u'\xcf\xe9')`
add normalize title 2008-04-28 09:35:20 +00:00
			`# Articles in a dictionary.`
			`_articlesDict = dict([(x, x) for x in _articles])`
			`_spArticles = []`
			`for article in _articles:`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`if article[-1] not in ("'", '-'): article += ' '`
			`_spArticles.append(article)`
add normalize title 2008-04-28 09:35:20 +00:00
sort title, exclude i verb, and los angeles 2010-07-23 10:10:26 +00:00			`_noarticles = (`
			`'los angeles',`
			`'i am ',`
			`'i be area',`
			`'i call ',`
			`'i come ',`
			`'i confess',`
			`'i hired ',`
			`'i killed ',`
			`'i know ',`
			`'i live ',`
			`'i love',`
			`'i married',`
			`'i never',`
			`'i shot',`
			`'i start',`
			`'i was',`
			`)`

add normalize title 2008-04-28 09:35:20 +00:00			`def canonicalTitle(title):`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`"""Return the title in the canonic format 'Movie Title, The'.`

			`>>> canonicalTitle('The Movie Title')`
			`'Movie Title, The'`
sort title, exclude i verb, and los angeles 2010-07-23 10:10:26 +00:00
			`>>> canonicalTitle('Los Angeles Plays Itself')`
			`'Los Angeles Plays Itself'`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`"""`
			`try:`
			`if _articlesDict.has_key(title.split(', ')[-1].lower()): return title`
			`except IndexError: pass`
			`ltitle = title.lower()`
sort title, exclude i verb, and los angeles 2010-07-23 10:10:26 +00:00			`for start in _noarticles:`
			`if ltitle.startswith(start):`
			`return title`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`for article in _spArticles:`
			`if ltitle.startswith(article):`
			`lart = len(article)`
			`title = '%s, %s' % (title[lart:], title[:lart])`
			`if article[-1] == ' ': title = title[:-1]`
			`break`
			`## XXX: an attempt using a dictionary lookup.`
			`##for artSeparator in (' ', "'", '-'):`
			`## article = _articlesDict.get(ltitle.split(artSeparator)[0])`
			`## if article is not None:`
			`## lart = len(article)`
			`## # check titles like "una", "I'm Mad" and "L'abbacchio".`
			`## if title[lart:] == '' or (artSeparator != ' ' and`
			`## title[lart:][1] != artSeparator): continue`
			`## title = '%s, %s' % (title[lart:], title[:lart])`
			`## if artSeparator == ' ': title = title[1:]`
			`## break`
			`return title`
add normalize title 2008-04-28 09:35:20 +00:00
			`def normalizeTitle(title):`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`"""Return the title in the normal "The Title" format.`
add test and cleanup some errors found while doing so 2008-05-05 18:12:27 +00:00
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`>>> normalizeTitle('Movie Title, The')`
			`'The Movie Title'`
			`"""`
			`stitle = title.split(', ')`
			`if len(stitle) > 1 and _articlesDict.has_key(stitle[-1].lower()):`
			`sep = ' '`
			`if stitle[-1][-1] in ("'", '-'): sep = ''`
			`title = '%s%s%s' % (stitle[-1], sep, ', '.join(stitle[:-1]))`
			`return title`
add normalize title 2008-04-28 09:35:20 +00:00
add normalizeImdbId 2008-04-29 16:08:15 +00:00			`def normalizeImdbId(imdbId):`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`"""Return 7 digit imdbId.`
add test and cleanup some errors found while doing so 2008-05-05 18:12:27 +00:00
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`>>> normalizeImdbId('http://www.imdb.com/title/tt0159206/')`
			`'0159206'`
			`>>> normalizeImdbId(159206)`
			`'0159206'`
			`>>> normalizeImdbId('tt0159206')`
			`'0159206'`
			`"""`
			`if isinstance(imdbId, basestring):`
			`imdbId = re.sub('.(\d{7}).', '\\1', imdbId)`
			`elif isinstance(imdbId, int):`
			`imdbId = "%07d" % imdbId`
			`return imdbId`
add normalizeImdbId 2008-04-29 16:08:15 +00:00
canonicalName and normalizeName 2008-05-06 11:16:33 +00:00
			`# Common suffixes in surnames.`
			`_sname_suffixes = ('de', 'la', 'der', 'den', 'del', 'y', 'da', 'van',`
vom possible name suffix 2009-08-02 18:25:03 +00:00			`'e', 'von', 'vom', 'the', 'di', 'du', 'el', 'al')`
canonicalName and normalizeName 2008-05-06 11:16:33 +00:00
			`def canonicalName(name):`
			`"""Return the given name in canonical "Surname, Name" format.`
			`It assumes that name is in the 'Name Surname' format.`

			`>>> canonicalName('Jean Luc Godard')`
			`'Godard, Jean Luc'`

			`>>> canonicalName('Ivan Ivanov-Vano')`
			`'Ivanov-Vano, Ivan'`

			`>>> canonicalName('Gus Van Sant')`
			`'Van Sant, Gus'`

			`>>> canonicalName('Brian De Palma')`
			`'De Palma, Brian'`
			`"""`

			`# XXX: some statistics (over 1852406 names):`
			`# - just a surname: 51921`
			`# - single surname, single name: 1792759`
			`# - composed surname, composed name: 7726`
			`# - composed surname, single name: 55623`
			`# (2: 49259, 3: 5502, 4: 551)`
			`# - single surname, composed name: 186604`
			`# (2: 178315, 3: 6573, 4: 1219, 5: 352)`
			`# Don't convert names already in the canonical format.`
known unknowns 2010-05-17 09:09:40 +00:00			`if name in ('Unknown Director', ):`
			`return name`
canonicalName and normalizeName 2008-05-06 11:16:33 +00:00			`if name.find(', ') != -1: return name`
			`sname = name.split(' ')`
			`snl = len(sname)`
			`if snl == 2:`
			`# Just a name and a surname: how boring...`
			`name = '%s, %s' % (sname[1], sname[0])`
			`elif snl > 2:`
			`lsname = [x.lower() for x in sname]`
			`if snl == 3: _indexes = (0, snl-2)`
			`else: _indexes = (0, snl-2, snl-3)`
			`# Check for common surname prefixes at the beginning and near the end.`
			`for index in _indexes:`
			`if lsname[index] not in _sname_suffixes: continue`
			`try:`
			`# Build the surname.`
			`surn = '%s %s' % (sname[index], sname[index+1])`
			`del sname[index]`
			`del sname[index]`
			`try:`
			`# Handle the "Jr." after the name.`
			`if lsname[index+2].startswith('jr'):`
			`surn += ' %s' % sname[index]`
			`del sname[index]`
			`except (IndexError, ValueError):`
			`pass`
			`name = '%s, %s' % (surn, ' '.join(sname))`
			`break`
			`except ValueError:`
			`continue`
			`else:`
			`name = '%s, %s' % (sname[-1], ' '.join(sname[:-1]))`
			`return name`

			`def normalizeName(name):`
			`"""Return a name in the normal "Name Surname" format.`

			`>>> normalizeName('Godard, Jean Luc')`
			`'Jean Luc Godard'`

			`>>> normalizeName('Ivanov-Vano, Ivan')`
			`'Ivan Ivanov-Vano'`

			`>>> normalizeName('Van Sant, Gus')`
			`'Gus Van Sant'`

			`>>> normalizeName('De Palma, Brian')`
			`'Brian De Palma'`
			`"""`
			`sname = name.split(', ')`
			`if len(sname) == 2:`
			`name = '%s %s' % (sname[1], sname[0])`
			`return name`

normalizePath 2010-09-01 11:31:18 +00:00			`def normalizePath(path):`
new user agent, fix path 2010-09-01 17:57:29 +00:00			`path = path.replace(':', '_').replace('/', '_')`
normalizePath 2010-09-01 11:31:18 +00:00			`if path.endswith('.'): path = path[:-1] + '_'`
			`return path`

stripAccents 2010-09-01 12:55:52 +00:00			`def stripAccents(s):`
			`if isinstance(s, str):`
			`s = unicode(s)`
			`return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))`