# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
"""Helpers converting titles, person names and IMDb ids between their
"normal" and "canonical" textual forms.

  * titles:  'The Movie Title'  <->  'Movie Title, The'
  * names:   'Jean Luc Godard'  <->  'Godard, Jean Luc'
  * ids:     'tt0159206', 159206, URLs  ->  '0159206'
"""
import re

# 'basestring' only exists on Python 2; fall back to 'str' on Python 3 so
# that the module works unchanged on both.
try:
  _string_types = basestring
except NameError:
  _string_types = str

# Leading articles (several languages) recognised in titles.  The order is
# significant: canonicalTitle() uses the first entry that matches.
# NOTE(review): the u'\x..' entries contain upper-case code points, while
# every lookup below is done against a lower-cased title, so they look like
# they can never match -- confirm the intended encoding of these entries.
_articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el',
             "l'", 'il', 'das', 'les', 'o', 'ein', 'i', 'un', 'los', 'de',
             'an', 'una', 'las', 'eine', 'den', 'gli', 'het', 'os', 'lo',
             'az', 'det', 'ha-', 'een', 'ang', 'oi', 'ta', 'al-', 'dem',
             'mga', 'uno', "un'", 'ett', u'\xcf', 'eines', u'\xc7', 'els',
             u'\xd4\xef', u'\xcf\xe9')

# Articles in a dictionary (article -> article), for fast membership tests.
_articlesDict = dict((x, x) for x in _articles)

# Articles as they appear at the beginning of a title: followed by a space,
# unless the article already ends with its own separator (' or -).
_spArticles = []
for article in _articles:
  if article[-1] not in ("'", '-'):
    article += ' '
  _spArticles.append(article)


def canonicalTitle(title):
  """Return the title in the canonic format 'Movie Title, The'.

  A title whose last ', '-separated chunk is already an article is
  returned unchanged, as is a title that does not start with a known
  article.

  >>> canonicalTitle('The Movie Title')
  'Movie Title, The'
  >>> canonicalTitle('Movie Title, The')
  'Movie Title, The'
  """
  # Already canonical (or the whole title is just an article).
  if title.split(', ')[-1].lower() in _articlesDict:
    return title
  ltitle = title.lower()
  for article in _spArticles:
    if not ltitle.startswith(article):
      continue
    lart = len(article)
    leading = title[:lart]
    if article[-1] == ' ':
      # Drop the space that separated the article from the title.
      leading = leading[:-1]
    return '%s, %s' % (title[lart:], leading)
  ## XXX: an attempt using a dictionary lookup.
  ##for artSeparator in (' ', "'", '-'):
  ##  article = _articlesDict.get(ltitle.split(artSeparator)[0])
  ##  if article is not None:
  ##    lart = len(article)
  ##    # check titles like "una", "I'm Mad" and "L'abbacchio".
  ##    if title[lart:] == '' or (artSeparator != ' ' and
  ##                              title[lart:][1] != artSeparator): continue
  ##    title = '%s, %s' % (title[lart:], title[:lart])
  ##    if artSeparator == ' ': title = title[1:]
  ##    break
  return title


def normalizeTitle(title):
  """Return the title in the normal "The Title" format.

  Titles that do not end with ', <article>' are returned unchanged.

  >>> normalizeTitle('Movie Title, The')
  'The Movie Title'
  """
  stitle = title.split(', ')
  if len(stitle) > 1 and stitle[-1].lower() in _articlesDict:
    article = stitle[-1]
    # Articles ending with ' or - are glued directly to the title.
    sep = '' if article[-1] in ("'", '-') else ' '
    title = '%s%s%s' % (article, sep, ', '.join(stitle[:-1]))
  return title


# A run of at least seven digits: the numeric part of an IMDb id.
_imdbIdRe = re.compile(r'\d{7,}')


def normalizeImdbId(imdbId):
  """Return the numeric imdbId as a string, zero-padded to 7 digits.

  Strings are searched for their last run of seven or more digits, so
  ids longer than 7 digits are preserved in full (exactly as happens
  for integers); a string without such a run is returned unchanged.
  Values that are neither strings nor ints are returned unchanged.

  >>> normalizeImdbId('http://www.imdb.com/title/tt0159206/')
  '0159206'
  >>> normalizeImdbId(159206)
  '0159206'
  >>> normalizeImdbId('tt0159206')
  '0159206'
  >>> normalizeImdbId('tt10872600')
  '10872600'
  """
  if isinstance(imdbId, _string_types):
    runs = _imdbIdRe.findall(imdbId)
    if runs:
      imdbId = runs[-1]
  elif isinstance(imdbId, int):
    imdbId = "%07d" % imdbId
  return imdbId


# Common suffixes in surnames.
_sname_suffixes = ('de', 'la', 'der', 'den', 'del', 'y', 'da', 'van',
                   'e', 'von', 'the', 'di', 'du', 'el', 'al')


def canonicalName(name):
  """Return the given name in canonical "Surname, Name" format.
  It assumes that name is in the 'Name Surname' format.

  Names already containing ', ' and single-word names are returned
  unchanged.  A word listed in _sname_suffixes, found at the beginning
  or near the end of the name, starts a two-word surname; a following
  word starting with 'jr' is attached to that surname.

  >>> canonicalName('Jean Luc Godard')
  'Godard, Jean Luc'
  >>> canonicalName('Ivan Ivanov-Vano')
  'Ivanov-Vano, Ivan'
  >>> canonicalName('Gus Van Sant')
  'Van Sant, Gus'
  >>> canonicalName('Brian De Palma')
  'De Palma, Brian'
  """
  # XXX: some statistics (over 1852406 names):
  #      - just a surname:                 51921
  #      - single surname, single name:  1792759
  #      - composed surname, composed name: 7726
  #      - composed surname, single name:  55623
  #        (2: 49259, 3: 5502, 4: 551)
  #      - single surname, composed name: 186604
  #        (2: 178315, 3: 6573, 4: 1219, 5: 352)
  # Don't convert names already in the canonical format.
  if ', ' in name:
    return name
  sname = name.split(' ')
  snl = len(sname)
  if snl < 2:
    # Just a surname (or an empty string): nothing to reorder.
    return name
  if snl == 2:
    # Just a name and a surname: how boring...
    return '%s, %s' % (sname[1], sname[0])
  lsname = [x.lower() for x in sname]
  if snl == 3:
    indexes = (0, snl - 2)
  else:
    indexes = (0, snl - 2, snl - 3)
  # Check for common surname prefixes at the beginning and near the end.
  for index in indexes:
    if lsname[index] not in _sname_suffixes:
      continue
    # The surname is the prefix plus the following word...
    end = index + 2
    # ...plus the "Jr." after the name, if present.
    if end < snl and lsname[end].startswith('jr'):
      end += 1
    surn = ' '.join(sname[index:end])
    rest = sname[:index] + sname[end:]
    if not rest:
      # The whole input is a surname: avoid producing 'Surname, '.
      return surn
    return '%s, %s' % (surn, ' '.join(rest))
  # No known prefix: the last word is the surname.
  return '%s, %s' % (sname[-1], ' '.join(sname[:-1]))


def normalizeName(name):
  """Return a name in the normal "Name Surname" format.

  Only names made of exactly two ', '-separated parts are converted;
  anything else is returned unchanged.

  >>> normalizeName('Godard, Jean Luc')
  'Jean Luc Godard'
  >>> normalizeName('Ivanov-Vano, Ivan')
  'Ivan Ivanov-Vano'
  >>> normalizeName('Van Sant, Gus')
  'Gus Van Sant'
  >>> normalizeName('De Palma, Brian')
  'Brian De Palma'
  """
  sname = name.split(', ')
  if len(sname) == 2:
    name = '%s %s' % (sname[1], sname[0])
  return name