# -*- coding: utf-8 -*- # vi:si:et:sw=4:sts=4:ts=4 # GPL 2008 import re import unicodedata _articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el', "l'", 'il', 'das', 'les', 'o', 'ein', 'i', 'un', 'los', 'de', 'an', 'una', 'las', 'eine', 'den', 'gli', 'het', 'os', 'lo', 'az', 'det', 'ha-', 'een', 'ang', 'oi', 'ta', 'al-', 'dem', 'mga', 'uno', "un'", 'ett', u'\xcf', 'eines', u'\xc7', 'els', u'\xd4\xef', u'\xcf\xe9') # Articles in a dictionary. _articlesDict = dict([(x, x) for x in _articles]) _spArticles = [] for article in _articles: if article[-1] not in ("'", '-'): article += ' ' _spArticles.append(article) _noarticles = ( 'los angeles', 'i am ', 'i be area', 'i call ', 'i come ', 'i confess', 'i hired ', 'i killed ', 'i know ', 'i live ', 'i love', 'i married', 'i never', 'i shot', 'i start', 'i was', ) def canonical_title(title): """Return the title in the canonic format 'Movie Title, The'. >>> canonical_title('The Movie Title') 'Movie Title, The' >>> canonical_title('Los Angeles Plays Itself') 'Los Angeles Plays Itself' """ try: if title.split(', ')[-1].lower() in _articlesDict: return title except IndexError: pass ltitle = title.lower() for start in _noarticles: if ltitle.startswith(start): return title for article in _spArticles: if ltitle.startswith(article): lart = len(article) title = '%s, %s' % (title[lart:], title[:lart]) if article[-1] == ' ': title = title[:-1] break ## XXX: an attempt using a dictionary lookup. ##for artSeparator in (' ', "'", '-'): ## article = _articlesDict.get(ltitle.split(artSeparator)[0]) ## if article is not None: ## lart = len(article) ## # check titles like "una", "I'm Mad" and "L'abbacchio". ## if title[lart:] == '' or (artSeparator != ' ' and ## title[lart:][1] != artSeparator): continue ## title = '%s, %s' % (title[lart:], title[:lart]) ## if artSeparator == ' ': title = title[1:] ## break return title def normalize_title(title): """Return the title in the normal "The Title" format. >>> normalize_title('Movie Title, The') 'The Movie Title' """ stitle = title.split(', ') if len(stitle) > 1 and stitle[-1].lower() in _articlesDict: sep = ' ' if stitle[-1][-1] in ("'", '-'): sep = '' title = '%s%s%s' % (stitle[-1], sep, ', '.join(stitle[:-1])) return title def normalize_imdbid(imdbId): """Return 7 digit imdbId. >>> normalize_imdbid('http://www.imdb.com/title/tt0159206/') '0159206' >>> normalize_imdbid(159206) '0159206' >>> normalize_imdbid('tt0159206') '0159206' """ if isinstance(imdbId, str): imdbId = re.sub(r'.*(\d{7}).*', '\\1', imdbId) elif isinstance(imdbId, int): imdbId = "%07d" % imdbId return imdbId # Common suffixes in surnames. _sname_suffixes = ( 'al', 'ben', 'da', 'de', 'del', 'den', 'der', 'des', 'di', 'dos', 'du', 'e', 'el', 'la', 'le', 'the', 'vom', 'von', 'van', 'y' ) def canonical_name(name): """Return the given name in canonical "Surname, Name" format. It assumes that name is in the 'Name Surname' format. >>> canonical_name('Jean Luc Godard') 'Godard, Jean Luc' >>> canonical_name('Ivan Ivanov-Vano') 'Ivanov-Vano, Ivan' >>> canonical_name('Gus Van Sant') 'Van Sant, Gus' >>> canonical_name('Brian De Palma') 'De Palma, Brian' """ # XXX: some statistics (over 1852406 names): # - just a surname: 51921 # - single surname, single name: 1792759 # - composed surname, composed name: 7726 # - composed surname, single name: 55623 # (2: 49259, 3: 5502, 4: 551) # - single surname, composed name: 186604 # (2: 178315, 3: 6573, 4: 1219, 5: 352) # Don't convert names already in the canonical format. if name in ('Unknown Director', ): return name if name.find(', ') != -1: return name sname = name.split(' ') snl = len(sname) if snl == 2: # Just a name and a surname: how boring... name = '%s, %s' % (sname[1], sname[0]) elif snl > 2: lsname = [x.lower() for x in sname] if snl == 3: _indexes = (0, snl-2) else: _indexes = (0, snl-2, snl-3) # Check for common surname prefixes at the beginning and near the end. for index in _indexes: if lsname[index] not in _sname_suffixes: continue try: # Build the surname. surn = '%s %s' % (sname[index], sname[index+1]) del sname[index] del sname[index] try: # Handle the "Jr." after the name. if lsname[index+2].startswith('jr'): surn += ' %s' % sname[index] del sname[index] except (IndexError, ValueError): pass name = '%s, %s' % (surn, ' '.join(sname)) break except ValueError: continue else: name = '%s, %s' % (sname[-1], ' '.join(sname[:-1])) return name def normalize_name(name): """Return a name in the normal "Name Surname" format. >>> normalize_name('Godard, Jean Luc') 'Jean Luc Godard' >>> normalize_name('Ivanov-Vano, Ivan') 'Ivan Ivanov-Vano' >>> normalize_name('Van Sant, Gus') 'Gus Van Sant' >>> normalize_name('De Palma, Brian') 'Brian De Palma' """ sname = name.split(', ') if len(sname) == 2: name = '%s %s' % (sname[1], sname[0]) return name def normalize_path(path): path = path.replace(':', '_').replace('/', '_') if path.endswith('.'): path = path[:-1] + '_' return path def strip_accents(s): if isinstance(s, str): s = s.decode('utf-8') return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))