python-ox/ox/normalize.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import re
import unicodedata


# Leading articles, in several languages, that canonical_title() moves to the
# end of a title.  Order matters: the first entry that matches is used.
# NOTE(review): the non-ASCII entries contain uppercase characters while every
# lookup is made against lowercased text, so they appear never to match --
# confirm the intent before changing them.
_articles = (
    'the', 'la', 'a', 'die', 'der', 'le', 'el', "l'", 'il', 'das', 'les',
    'o', 'ein', 'i', 'un', 'los', 'de', 'an', 'una', 'las', 'eine', 'den',
    'gli', 'het', 'os', 'lo', 'az', 'det', 'ha-', 'een', 'ang', 'oi', 'ta',
    'al-', 'dem', 'mga', 'uno', "un'", 'ett', '\xcf', 'eines', '\xc7',
    'els', '\xd4\xef', '\xcf\xe9',
)

# The same articles keyed by themselves, for constant-time membership tests.
_articlesDict = {art: art for art in _articles}

# Articles as they appear at the start of a title: a trailing space is added
# unless the article already ends with an apostrophe or a hyphen.
_spArticles = []
for article in _articles:
    if not article.endswith(("'", '-')):
        article = article + ' '
    _spArticles.append(article)

# Lowercase title beginnings that look like an article but are not one.
_noarticles = (
    'los angeles',
    'i am ', 'i be area', 'i call ', 'i come ', 'i confess',
    'i hired ', 'i killed ', 'i know ', 'i live ', 'i love',
    'i married', 'i never', 'i shot', 'i start', 'i was',
)


def canonical_title(title):
    """Move a leading article to the end: 'The Movie Title' -> 'Movie Title, The'.

    The title is returned unchanged when it already ends in ', <article>',
    when it starts with one of the known exceptions, or when it does not
    start with a known article.

    >>> canonical_title('The Movie Title')
    'Movie Title, The'

    >>> canonical_title('Los Angeles Plays Itself')
    'Los Angeles Plays Itself'
    """
    # Already canonical: the text after the last ', ' is an article.
    if title.split(', ')[-1].lower() in _articlesDict:
        return title
    lowered = title.lower()
    # Known false positives such as 'Los Angeles ...' or 'I Am ...'.
    if lowered.startswith(_noarticles):
        return title
    # Pick the first article (in declaration order) the title starts with.
    found = next((art for art in _spArticles if lowered.startswith(art)), None)
    if found is None:
        return title
    cut = len(found)
    moved = '%s, %s' % (title[cut:], title[:cut])
    # Drop the separating space that was carried along with the article.
    return moved[:-1] if found.endswith(' ') else moved

def normalize_title(title):
    """Move a trailing ', <article>' back to the front of the title.

    Titles whose last ', '-separated part is not a known article are
    returned unchanged.

    >>> normalize_title('Movie Title, The')
    'The Movie Title'
    """
    head, comma, tail = title.rpartition(', ')
    if comma and tail.lower() in _articlesDict:
        # Articles ending in an apostrophe or hyphen attach directly.
        glue = '' if tail[-1] in ("'", '-') else ' '
        title = tail + glue + head
    return title

def normalize_imdbid(imdbId):
    """Return the numeric part of an IMDb id, zero-padded to at least 7 digits.

    Strings are searched for runs of seven or more digits and the last such
    run is returned whole, so ids longer than seven digits are not
    truncated.  Integers are zero-padded to seven digits.  Strings without
    such a run, and values of any other type, are returned unchanged.

    >>> normalize_imdbid('http://www.imdb.com/title/tt0159206/')
    '0159206'
    >>> normalize_imdbid(159206)
    '0159206'
    >>> normalize_imdbid('tt0159206')
    '0159206'
    >>> normalize_imdbid('tt10872600')
    '10872600'
    """
    if isinstance(imdbId, str):
        # Take the last run, as the previous greedy pattern did, but keep
        # every digit of it instead of only the final seven.
        runs = re.findall(r'\d{7,}', imdbId)
        if runs:
            imdbId = runs[-1]
    elif isinstance(imdbId, int):
        imdbId = "%07d" % imdbId
    return imdbId


# Particles that introduce a composed surname ('van', 'de', ...).  They are
# prefixes of the surname; the variable name says "suffixes" for historical
# reasons and is kept as is.
_sname_suffixes = (
    'al', 'ben', 'da', 'de', 'del', 'den', 'der', 'des', 'di', 'dos', 'du',
    'e', 'el', 'la', 'le', 'the', 'vom', 'von', 'van', 'y',
)


def canonical_name(name):
    """Convert a 'Name Surname' string to canonical 'Surname, Name' form.

    Names that already contain ', ' and the placeholder 'Unknown Director'
    are returned unchanged, as are single-word names.

    >>> canonical_name('Jean Luc Godard')
    'Godard, Jean Luc'

    >>> canonical_name('Ivan Ivanov-Vano')
    'Ivanov-Vano, Ivan'

    >>> canonical_name('Gus Van Sant')
    'Van Sant, Gus'

    >>> canonical_name('Brian De Palma')
    'De Palma, Brian'
    """
    if name == 'Unknown Director' or ', ' in name:
        return name
    parts = name.split(' ')
    count = len(parts)
    if count < 2:
        return name
    if count == 2:
        return '%s, %s' % (parts[1], parts[0])

    lowered = [part.lower() for part in parts]
    # Positions where a surname particle is looked for, in priority order:
    # the first word, the second-to-last word and, for names of four or more
    # words, the third-to-last word.
    candidates = [0, count - 2]
    if count > 3:
        candidates.append(count - 3)
    for pos in candidates:
        if lowered[pos] not in _sname_suffixes:
            continue
        # The surname is the particle plus the following word ...
        end = pos + 2
        # ... plus a directly following "Jr"-like word, if there is one.
        if end < count and lowered[end].startswith('jr'):
            end += 1
        surname = ' '.join(parts[pos:end])
        given = ' '.join(parts[:pos] + parts[end:])
        return '%s, %s' % (surname, given)
    # No particle found: the last word is the surname.
    return '%s, %s' % (parts[-1], ' '.join(parts[:-1]))

def normalize_name(name):
    """Convert a canonical 'Surname, Name' string to 'Name Surname' form.

    Only names with exactly one ', ' separator are converted; anything else
    is returned unchanged.

    >>> normalize_name('Godard, Jean Luc')
    'Jean Luc Godard'

    >>> normalize_name('Ivanov-Vano, Ivan')
    'Ivan Ivanov-Vano'

    >>> normalize_name('Van Sant, Gus')
    'Gus Van Sant'

    >>> normalize_name('De Palma, Brian')
    'Brian De Palma'
    """
    pieces = name.split(', ')
    if len(pieces) != 2:
        return name
    surname, given = pieces
    return ' '.join((given, surname))

def normalize_path(path):
    """Replace ':' and '/' with '_' and turn a trailing '.' into '_'."""
    cleaned = path.translate(str.maketrans(':/', '__'))
    if cleaned[-1:] == '.':
        cleaned = cleaned[:-1] + '_'
    return cleaned

def strip_accents(s):
    """Return *s* with combining accent marks removed.

    The text is decomposed with Unicode NFD normalization and every character
    of category 'Mn' (nonspacing mark) is dropped.  Bytes input is decoded as
    UTF-8 first; text input is used as is.

    >>> strip_accents('Am\\xe9lie')
    'Amelie'
    """
    # Only bytes need decoding; calling .decode() on str raises AttributeError.
    if isinstance(s, bytes):
        s = s.decode('utf-8')
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
known unknowns 2010-05-17 09:09:40 +00:00			`# -- coding: utf-8 --`
move and rename some 2008-07-06 13:00:06 +00:00			`# vi:si:et:sw=4:sts=4:ts=4`
known unknowns 2010-05-17 09:09:40 +00:00			`# GPL 2008`
add normalizeImdbId 2008-04-29 16:08:15 +00:00			`import re`
stripAccents 2010-09-01 12:55:52 +00:00			`import unicodedata`
add normalize title 2008-04-28 09:35:20 +00:00
basestring->six.string_types 2015-12-25 15:08:55 +00:00
add normalize title 2008-04-28 09:35:20 +00:00			`_articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el',`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`"l'", 'il', 'das', 'les', 'o', 'ein', 'i', 'un', 'los', 'de',`
			`'an', 'una', 'las', 'eine', 'den', 'gli', 'het', 'os', 'lo',`
			`'az', 'det', 'ha-', 'een', 'ang', 'oi', 'ta', 'al-', 'dem',`
			`'mga', 'uno', "un'", 'ett', u'\xcf', 'eines', u'\xc7', 'els',`
			`u'\xd4\xef', u'\xcf\xe9')`
add normalize title 2008-04-28 09:35:20 +00:00
			`# Articles in a dictionary.`
			`_articlesDict = dict([(x, x) for x in _articles])`
			`_spArticles = []`
			`for article in _articles:`
cleanup pylint errors and py2/3 issues 2016-06-08 13:32:46 +00:00			`if article[-1] not in ("'", '-'):`
			`article += ' '`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`_spArticles.append(article)`
add normalize title 2008-04-28 09:35:20 +00:00
sort title, exclude i verb, and los angeles 2010-07-23 10:10:26 +00:00			`_noarticles = (`
			`'los angeles',`
			`'i am ',`
			`'i be area',`
			`'i call ',`
			`'i come ',`
			`'i confess',`
			`'i hired ',`
			`'i killed ',`
			`'i know ',`
			`'i live ',`
			`'i love',`
			`'i married',`
			`'i never',`
			`'i shot',`
			`'i start',`
			`'i was',`
			`)`

replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`def canonical_title(title):`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`"""Return the title in the canonic format 'Movie Title, The'.`

replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`>>> canonical_title('The Movie Title')`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`'Movie Title, The'`
sort title, exclude i verb, and los angeles 2010-07-23 10:10:26 +00:00
replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`>>> canonical_title('Los Angeles Plays Itself')`
sort title, exclude i verb, and los angeles 2010-07-23 10:10:26 +00:00			`'Los Angeles Plays Itself'`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`"""`
			`try:`
cleanup pylint errors and py2/3 issues 2016-06-08 13:32:46 +00:00			`if title.split(', ')[-1].lower() in _articlesDict:`
			`return title`
			`except IndexError:`
			`pass`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`ltitle = title.lower()`
sort title, exclude i verb, and los angeles 2010-07-23 10:10:26 +00:00			`for start in _noarticles:`
			`if ltitle.startswith(start):`
			`return title`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`for article in _spArticles:`
			`if ltitle.startswith(article):`
			`lart = len(article)`
			`title = '%s, %s' % (title[lart:], title[:lart])`
cleanup pylint errors and py2/3 issues 2016-06-08 13:32:46 +00:00			`if article[-1] == ' ':`
			`title = title[:-1]`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`break`
			`## XXX: an attempt using a dictionary lookup.`
			`##for artSeparator in (' ', "'", '-'):`
			`## article = _articlesDict.get(ltitle.split(artSeparator)[0])`
			`## if article is not None:`
			`## lart = len(article)`
			`## # check titles like "una", "I'm Mad" and "L'abbacchio".`
			`## if title[lart:] == '' or (artSeparator != ' ' and`
			`## title[lart:][1] != artSeparator): continue`
			`## title = '%s, %s' % (title[lart:], title[:lart])`
			`## if artSeparator == ' ': title = title[1:]`
			`## break`
			`return title`
add normalize title 2008-04-28 09:35:20 +00:00
replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`def normalize_title(title):`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`"""Return the title in the normal "The Title" format.`
add test and cleanup some errors found while doing so 2008-05-05 18:12:27 +00:00
replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`>>> normalize_title('Movie Title, The')`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`'The Movie Title'`
			`"""`
			`stitle = title.split(', ')`
cleanup pylint errors and py2/3 issues 2016-06-08 13:32:46 +00:00			`if len(stitle) > 1 and stitle[-1].lower() in _articlesDict:`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`sep = ' '`
cleanup pylint errors and py2/3 issues 2016-06-08 13:32:46 +00:00			`if stitle[-1][-1] in ("'", '-'):`
			`sep = ''`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`title = '%s%s%s' % (stitle[-1], sep, ', '.join(stitle[:-1]))`
			`return title`
add normalize title 2008-04-28 09:35:20 +00:00
replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`def normalize_imdbid(imdbId):`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`"""Return 7 digit imdbId.`
add test and cleanup some errors found while doing so 2008-05-05 18:12:27 +00:00
replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`>>> normalize_imdbid('http://www.imdb.com/title/tt0159206/')`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`'0159206'`
replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`>>> normalize_imdbid(159206)`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`'0159206'`
replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`>>> normalize_imdbid('tt0159206')`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`'0159206'`
			`"""`
drop six and python2 support 2023-07-27 11:07:13 +00:00			`if isinstance(imdbId, str):`
more raw regexp strings 2024-08-30 11:30:47 +00:00			`imdbId = re.sub(r'.(\d{7}).', '\\1', imdbId)`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`elif isinstance(imdbId, int):`
			`imdbId = "%07d" % imdbId`
			`return imdbId`
add normalizeImdbId 2008-04-29 16:08:15 +00:00
canonicalName and normalizeName 2008-05-06 11:16:33 +00:00
			`# Common suffixes in surnames.`
add 'ben' and 'le' to surname suffixes (which are actually prefixes, btw) 2012-09-12 19:45:34 +00:00			`_sname_suffixes = (`
(minor change) 2012-09-23 13:21:38 +00:00			`'al', 'ben', 'da', 'de', 'del', 'den', 'der', 'des', 'di', 'dos', 'du',`
add 'ben' and 'le' to surname suffixes (which are actually prefixes, btw) 2012-09-12 19:45:34 +00:00			`'e', 'el', 'la', 'le', 'the', 'vom', 'von', 'van', 'y'`
			`)`
canonicalName and normalizeName 2008-05-06 11:16:33 +00:00
replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`def canonical_name(name):`
canonicalName and normalizeName 2008-05-06 11:16:33 +00:00			`"""Return the given name in canonical "Surname, Name" format.`
			`It assumes that name is in the 'Name Surname' format.`

replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`>>> canonical_name('Jean Luc Godard')`
canonicalName and normalizeName 2008-05-06 11:16:33 +00:00			`'Godard, Jean Luc'`

replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`>>> canonical_name('Ivan Ivanov-Vano')`
canonicalName and normalizeName 2008-05-06 11:16:33 +00:00			`'Ivanov-Vano, Ivan'`

replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`>>> canonical_name('Gus Van Sant')`
canonicalName and normalizeName 2008-05-06 11:16:33 +00:00			`'Van Sant, Gus'`

replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`>>> canonical_name('Brian De Palma')`
canonicalName and normalizeName 2008-05-06 11:16:33 +00:00			`'De Palma, Brian'`
			`"""`

			`# XXX: some statistics (over 1852406 names):`
			`# - just a surname: 51921`
			`# - single surname, single name: 1792759`
			`# - composed surname, composed name: 7726`
			`# - composed surname, single name: 55623`
			`# (2: 49259, 3: 5502, 4: 551)`
			`# - single surname, composed name: 186604`
			`# (2: 178315, 3: 6573, 4: 1219, 5: 352)`
			`# Don't convert names already in the canonical format.`
known unknowns 2010-05-17 09:09:40 +00:00			`if name in ('Unknown Director', ):`
			`return name`
cleanup pylint errors and py2/3 issues 2016-06-08 13:32:46 +00:00			`if name.find(', ') != -1:`
			`return name`
canonicalName and normalizeName 2008-05-06 11:16:33 +00:00			`sname = name.split(' ')`
			`snl = len(sname)`
			`if snl == 2:`
			`# Just a name and a surname: how boring...`
			`name = '%s, %s' % (sname[1], sname[0])`
			`elif snl > 2:`
			`lsname = [x.lower() for x in sname]`
cleanup pylint errors and py2/3 issues 2016-06-08 13:32:46 +00:00			`if snl == 3:`
			`_indexes = (0, snl-2)`
			`else:`
			`_indexes = (0, snl-2, snl-3)`
canonicalName and normalizeName 2008-05-06 11:16:33 +00:00			`# Check for common surname prefixes at the beginning and near the end.`
			`for index in _indexes:`
cleanup pylint errors and py2/3 issues 2016-06-08 13:32:46 +00:00			`if lsname[index] not in _sname_suffixes:`
			`continue`
canonicalName and normalizeName 2008-05-06 11:16:33 +00:00			`try:`
			`# Build the surname.`
			`surn = '%s %s' % (sname[index], sname[index+1])`
			`del sname[index]`
			`del sname[index]`
			`try:`
			`# Handle the "Jr." after the name.`
			`if lsname[index+2].startswith('jr'):`
			`surn += ' %s' % sname[index]`
			`del sname[index]`
			`except (IndexError, ValueError):`
			`pass`
			`name = '%s, %s' % (surn, ' '.join(sname))`
			`break`
			`except ValueError:`
			`continue`
			`else:`
			`name = '%s, %s' % (sname[-1], ' '.join(sname[:-1]))`
			`return name`

replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`def normalize_name(name):`
canonicalName and normalizeName 2008-05-06 11:16:33 +00:00			`"""Return a name in the normal "Name Surname" format.`

replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`>>> normalize_name('Godard, Jean Luc')`
canonicalName and normalizeName 2008-05-06 11:16:33 +00:00			`'Jean Luc Godard'`

replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`>>> normalize_name('Ivanov-Vano, Ivan')`
canonicalName and normalizeName 2008-05-06 11:16:33 +00:00			`'Ivan Ivanov-Vano'`

replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`>>> normalize_name('Van Sant, Gus')`
canonicalName and normalizeName 2008-05-06 11:16:33 +00:00			`'Gus Van Sant'`

replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`>>> normalize_name('De Palma, Brian')`
canonicalName and normalizeName 2008-05-06 11:16:33 +00:00			`'Brian De Palma'`
			`"""`
			`sname = name.split(', ')`
			`if len(sname) == 2:`
			`name = '%s %s' % (sname[1], sname[0])`
			`return name`

replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`def normalize_path(path):`
new user agent, fix path 2010-09-01 17:57:29 +00:00			`path = path.replace(':', '_').replace('/', '_')`
cleanup pylint errors and py2/3 issues 2016-06-08 13:32:46 +00:00			`if path.endswith('.'):`
			`path = path[:-1] + '_'`
normalizePath 2010-09-01 11:31:18 +00:00			`return path`

replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`def strip_accents(s):`
stripAccents 2010-09-01 12:55:52 +00:00			`if isinstance(s, str):`
cleanup pylint errors and py2/3 issues 2016-06-08 13:32:46 +00:00			`s = s.decode('utf-8')`
stripAccents 2010-09-01 12:55:52 +00:00			`return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))`