From 50e9f3e8191ead815a03ce1e9e9a77df59e3a184 Mon Sep 17 00:00:00 2001
From: j <0x006A@0x2620.org>
Date: Fri, 10 Aug 2007 12:19:11 +0000
Subject: [PATCH] normalizeTitle

---
 scrapeit/imdb.py         |   9 +-
 scrapeit/imdbpy_utils.py | 899 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 904 insertions(+), 4 deletions(-)
 create mode 100644 scrapeit/imdbpy_utils.py

diff --git a/scrapeit/imdb.py b/scrapeit/imdb.py
index fbd023a..9a5a7b0 100644
--- a/scrapeit/imdb.py
+++ b/scrapeit/imdb.py
@@ -15,6 +15,7 @@ from utils import stripTags, htmldecode
 
 import utils
 import chardet
+import imdbpy_utils
 
 cache_base = "/var/cache/scrapeit/cache/"
 
@@ -183,18 +184,18 @@ class IMDb:
         title = title.replace(t, '')
     if title.find(u'\xa0') > -1:
       title = title[:title.find(u'\xa0')]
-    title = title.strip()
+    title = imdbpy_utils.normalizeTitle(title.strip())
     if title.startswith('"') and title.endswith('"'):
-      title = title[1:-1]
+      title = imdbpy_utils.normalizeTitle(title[1:-1])
     elif title.startswith('"') and title.find('"',1) > 0 and \
         title.find('"',1) == title.rfind('"'):
         se = re.compile("Season (\d*), Episode (\d*)\)").findall(data)
         if se:
           se = se[0]
           se = ' (S%02dE%02d)' % (int(se[0]), int(se[1]))
-          title = title[1:title.rfind('"')] + se + title[title.rfind('"')+1:]
+          title = imdbpy_utils.normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:]
         else:
-          title = title[1:title.rfind('"')] + ':' + title[title.rfind('"')+1:]
+          title = imdbpy_utils.normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:]
     return title
     
   def parseYear(self):
diff --git a/scrapeit/imdbpy_utils.py b/scrapeit/imdbpy_utils.py
new file mode 100644
index 0000000..87fe513
--- /dev/null
+++ b/scrapeit/imdbpy_utils.py
@@ -0,0 +1,899 @@
+# -*- coding: utf-8 -*-
+# -*- Mode: Python; -*-
+"""
+utils module (imdb package).
+
+This module provides basic utilities for the imdb package.
+
+Copyright 2004-2006 Davide Alberani <da@erlug.linux.it>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+"""
+
+from __future__ import generators
+import re
+from types import UnicodeType, StringType, ListType, TupleType, DictType
+from copy import copy, deepcopy
+from time import strptime, strftime
+
+# imdb._exceptions is not shipped with scrapeit: define the error locally.
+class IMDbParserError(Exception): """Raised for an invalid name or title."""
+# The regular expression for the "long" year format of IMDb, like
+# "(1998)" and "(1986/II)", where the optional roman number after the
+# slash (that I call "imdbIndex") is used to tell apart movies with the
+# same title and year of release.
+# XXX: probably L, C, D and M are far too much! ;-)
+re_year_index = re.compile(r'\(([0-9\?]{4}(/[IVXLCDM]+)?)\)')
+
+# Match only the imdbIndex (for name strings).
+re_index = re.compile(r'^\(([IVXLCDM]+)\)$')
+
+# Match the number of episodes.
+re_episodes = re.compile('\s?\((\d+) episodes\)', re.I)
+
+re_episode_info = re.compile(r'{(.+?)?\s?(\([0-9\?]{4}-[0-9\?]{1,2}-[0-9\?]{1,2}\))?\s?(\(#[0-9]+\.[0-9]+\))?}')
+
+# Common particles (prefixes) of composed surnames, matched in lowercase.
+_sname_suffixes = ('de', 'la', 'der', 'den', 'del', 'y', 'da', 'van',
+                    'e', 'von', 'the', 'di', 'du', 'el', 'al')
+
+def canonicalName(name):
+    """Convert a 'Name Surname' string to the canonical "Surname, Name"
+    format; strings containing ', ' or made of one word are returned as is."""
+    # XXX: some statistics (over 1852406 names):
+    #      - just a surname:                 51921
+    #      - single surname, single name:  1792759
+    #      - composed surname, composed name: 7726
+    #      - composed surname, single name:  55623
+    #        (2: 49259, 3: 5502, 4: 551)
+    #      - single surname, composed name: 186604
+    #        (2: 178315, 3: 6573, 4: 1219, 5: 352)
+    # A ', ' means the name is already in the canonical format: keep it.
+    if name.find(', ') != -1: return name
+    sname = name.split(' ')
+    snl = len(sname)
+    if snl == 2:
+        # Just a name and a surname: swap them.
+        name = '%s, %s' % (sname[1], sname[0])
+    elif snl > 2:
+        lsname = [x.lower() for x in sname]
+        if snl == 3: _indexes = (0, snl-2)
+        else: _indexes = (0, snl-2, snl-3)
+        # Look for a surname particle at the beginning and near the end.
+        for index in _indexes:
+            if lsname[index] not in _sname_suffixes: continue
+            try:
+                # The surname is the particle plus the word after it.
+                surn = '%s %s' % (sname[index], sname[index+1])
+                del sname[index]  # remove the particle...
+                del sname[index]  # ...and the word that followed it.
+                try:
+                    # A word starting with "jr" right after goes in the surname.
+                    if lsname[index+2].startswith('jr'):
+                        surn += ' %s' % sname[index]
+                        del sname[index]
+                except (IndexError, ValueError):  # nothing after the surname.
+                    pass
+                name = '%s, %s' % (surn, ' '.join(sname))
+                break
+            except ValueError:
+                continue
+        else:  # no particle found: the last word is taken as the surname.
+            name = '%s, %s' % (sname[-1], ' '.join(sname[:-1]))
+    return name
+
+def normalizeName(name):
+    """Turn a "Surname, Name" string into "Name Surname"; others pass as is."""
+    parts = name.split(', ')
+    if len(parts) != 2:
+        return name
+    return ' '.join((parts[1], parts[0]))
+
+def analyze_name(name, canonical=0):
+    """Split a name string into a dictionary with the 'name' key and,
+    when the last parenthesized part of the string is a roman number,
+    the 'imdbIndex' key.
+    If canonical is true, the name is converted to the canonical
+    "Surname, Name" format.
+
+    raise an IMDbParserError exception if the name is not valid.
+    """
+    stripped = name.strip()
+    index = ''
+    # Only the text between the last '(' and the last ')' is checked.
+    open_pos = stripped.rfind('(')
+    close_pos = stripped.rfind(')')
+    if open_pos != -1 and close_pos > open_pos:
+        if re_index.match(stripped[open_pos:close_pos+1]):
+            index = stripped[open_pos+1:close_pos]
+            stripped = stripped[:open_pos].rstrip()
+    if not stripped:
+        raise IMDbParserError, 'invalid name: "%s"' % name
+    if canonical:
+        stripped = canonicalName(stripped)
+    parsed = {'name': stripped}
+    if index:
+        parsed['imdbIndex'] = index
+    return parsed
+
+
+def build_name(name_dict, canonical=0):
+    """Build the "long" IMDb name string for the given dictionary:
+    the 'canonical name' (or, failing that, 'name') value, followed
+    by the 'imdbIndex' in parentheses when there is one; an empty
+    unicode string is returned if no name is found.
+    Unless canonical is true, the name is put in "Name Surname" format."""
+    full_name = name_dict.get('canonical name') or name_dict.get('name', '')
+    if not full_name:
+        return u''
+    if not canonical:
+        full_name = normalizeName(full_name)
+    index = name_dict.get('imdbIndex')
+    if not index: return full_name
+    return full_name + ' (%s)' % index
+
+
+# List of articles.
+# XXX: Managing titles in a lot of different languages, a function to recognize
+# an initial article can't be perfect; sometimes we'll stumble upon a short
+# word that is an article in some language, but it's not in another; in these
+# situations we have to choose if we want to interpret this little word
+# as an article or not (remember that we don't know what the original language
+# of the title was).
+# Example: 'da' is an article in (I think) Dutch and it's used as an article
+# even in some American slangs.  Unfortunately it's also a preposition in
+# Italian, and it's widely used in Mandarin (for whatever it means!).
+# Running a script over the whole list of titles (and aliases), I've found
+# that 'da' is used as an article only 20 times, and as another thing 255
+# times, so I've decided to _always_ consider 'da' as a non article.
+#
+# Here is a list of words that are _never_ considered as articles, complete
+# with the count of times they are used one way or the other:
+# 'en' (314 vs 507), 'to' (236 vs 589), 'as' (183 vs 231), 'et' (67 vs 79),
+# 'des' (69 vs 123), 'al' (57 vs 247), 'egy' (28 vs 32), 'ye' (14 vs 55),
+# 'da' (20 vs 255), "'n" (7 vs 12)
+#
+# I've left in the list 'i' (1614 vs 1707) and 'uno' (49 vs 51)
+# I'm not sure what '-al' is, and so I've left it out...
+#
+# List of articles:
+_articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el',
+            "l'", 'il', 'das', 'les', 'o', 'ein', 'i', 'un', 'los', 'de',
+            'an', 'una', 'las', 'eine', 'den', 'gli', 'het', 'os', 'lo',
+            'az', 'det', 'ha-', 'een', 'ang', 'oi', 'ta', 'al-', 'dem',
+            'mga', 'uno', "un'", 'ett', u'\xcf', 'eines', u'\xc7', 'els',
+            u'\xd4\xef', u'\xcf\xe9')
+
+# Articles in a dictionary, then as prefixes to be matched with startswith.
+_articlesDict = dict([(x, x) for x in _articles])
+_spArticles = []
+for article in _articles:
+    if article[-1] not in ("'", '-'): article += ' '  # 'the' -> 'the '
+    _spArticles.append(article)
+
+def canonicalTitle(title):
+    """Move a leading article to the end of the title: 'Movie Title, The'."""
+    try:  # a title whose last ', ' part is an article is left as it is.
+        if _articlesDict.has_key(title.split(', ')[-1].lower()): return title
+    except IndexError: pass
+    ltitle = title.lower()
+    for article in _spArticles:
+        if ltitle.startswith(article):
+            lart = len(article)
+            title = '%s, %s' % (title[lart:], title[:lart])
+            if article[-1] == ' ': title = title[:-1]  # drop the moved space.
+            break
+    ## XXX: an attempt using a dictionary lookup (disabled).
+    ##for artSeparator in (' ', "'", '-'):
+    ##    article = _articlesDict.get(ltitle.split(artSeparator)[0])
+    ##    if article is not None:
+    ##        lart = len(article)
+    ##        # check titles like "una", "I'm Mad" and "L'abbacchio".
+    ##        if title[lart:] == '' or (artSeparator != ' ' and
+    ##                                title[lart:][1] != artSeparator): continue
+    ##        title = '%s, %s' % (title[lart:], title[:lart])
+    ##        if artSeparator == ' ': title = title[1:]
+    ##        break
+    return title
+
+def normalizeTitle(title):
+    """Move a trailing article back in front: 'Title, The' -> 'The Title'."""
+    pieces = title.split(', ')
+    article, rest = pieces[-1], ', '.join(pieces[:-1])
+    if len(pieces) < 2 or not _articlesDict.has_key(article.lower()):
+        return title
+    if article[-1] in ("'", '-'): return article + rest
+    return '%s %s' % (article, rest)
+
+
+def _split_series_episode(title):
+    """Return a (series title, episode part) tuple; if this is not a
+    series' episode, the returned series title is empty.
+    This function recognizes two different styles:
+        "The Series" An Episode (2005)
+        "The Series" (2004) {An Episode (2005) (#season.episode)}"""
+    series_title = ''
+    episode_or_year = ''
+    if title[-1:] == '}':
+        # Title of the episode, as in the plain text data files.
+        begin_eps = title.rfind('{')
+        if begin_eps == -1: return '', ''
+        series_title = title[:begin_eps].rstrip()
+        # episode_or_year is returned with the {...}
+        episode_or_year = title[begin_eps:]
+        if episode_or_year[:12] == '{SUSPENDED}}': return '', ''
+    # XXX: works only with tv series; it's still unclear whether
+    #      IMDb will support episodes for tv mini series and tv movies...
+    elif title[0:1] == '"':
+        second_quot = title[1:].find('"') + 2  # index just past the second ".
+        if second_quot != 1: # a second " was found.
+            episode_or_year = title[second_quot:].lstrip()
+            first_char = episode_or_year[0:1]
+            if not first_char: return '', ''
+            if first_char != '(':
+                # There is not a (year) but the title of the episode;
+                # that means this is an episode title, as returned by
+                # the web server.
+                series_title = title[:second_quot]
+            ##elif episode_or_year[-1:] == '}':
+            ##        # Title of the episode, as in the plain text data files.
+            ##        begin_eps = episode_or_year.find('{')
+            ##        if begin_eps == -1: return series_title, episode_or_year
+            ##        series_title = title[:second_quot+begin_eps].rstrip()
+            ##        # episode_or_year is returned with the {...}
+            ##        episode_or_year = episode_or_year[begin_eps:]
+    return series_title, episode_or_year
+
+
+def is_series_episode(title):
+    """Return 1 if 'title' is the title of a series' episode, 0 otherwise."""
+    series_title = _split_series_episode(title.strip())[0]
+    if series_title: return 1
+    return 0
+
+
+def analyze_title(title, canonical=None,
+                    canonicalSeries=0, canonicalEpisode=0):
+    """Analyze the given title and return a dictionary with the "stripped"
+    'title', the 'kind' of the show ("movie", "tv series", etc.) and, when
+    present, the 'year' and the 'imdbIndex' (a roman number that tells apart
+    movies with the same title and year); for an episode also 'episode of',
+    'original air date', 'season' and 'episode' may be set.
+    If canonical is true the title is converted to the canonical format;
+    canonicalSeries/canonicalEpisode do the same for the two parts of an
+    episode title.  Raise IMDbParserError if the title is not valid.
+    """
+    if canonical is not None:
+        canonicalSeries = canonicalEpisode = canonical
+    original_t = title
+    result = {}
+    title = title.strip()
+    year = ''
+    kind = ''
+    imdbIndex = ''
+    series_title, episode_or_year = _split_series_episode(title)
+    if series_title:
+        # It's an episode of a series: the series part has its own flag.
+        series_d = analyze_title(series_title, canonical=canonicalSeries)
+        oad = sen = ep_year = ''
+        # Plain text data files format.
+        if episode_or_year[0:1] == '{' and episode_or_year[-1:] == '}':
+            match = re_episode_info.findall(episode_or_year)
+            if match:
+                # Episode title, original air date and #season.episode
+                episode_or_year, oad, sen = match[0]
+                if not oad:
+                    # No year, but the title is something like (2005-04-12)
+                    if episode_or_year and episode_or_year[0] == '(' and \
+                                    episode_or_year[-1:] == ')' and \
+                                    episode_or_year[1:2] != '#':
+                        oad = episode_or_year
+                        if oad[1:5] and oad[5:6] == '-':
+                            ep_year = oad[1:5]
+                if not oad and not sen and episode_or_year.startswith('(#'):
+                    sen = episode_or_year
+        elif episode_or_year.startswith('Episode dated'):
+            oad = episode_or_year[14:]
+            if oad[-4:].isdigit(): ep_year = oad[-4:]
+            if oad: oad = '(%s)' % oad  # same "(date)" form as the data files.
+        episode_d = analyze_title(episode_or_year, canonical=canonicalEpisode)
+        episode_d['kind'] = 'episode'
+        episode_d['episode of'] = series_d
+        if oad:
+            # oad is always wrapped in parentheses here: strip them.
+            episode_d['original air date'] = oad[1:-1]
+            if ep_year and episode_d.get('year') is None:
+                episode_d['year'] = ep_year
+        if sen:
+            seas, epn = sen[2:-1].split('.')
+            if seas:
+                # Set season and episode (kept as strings if not numeric).
+                try: seas = int(seas)
+                except ValueError: pass
+                try: epn = int(epn)
+                except ValueError: pass
+                episode_d['season'] = seas
+                episode_d['episode'] = epn
+        return episode_d
+    # First of all, search for the kind of show.
+    # XXX: Number of entries at 18 Mar 2006:
+    #      movie:        344,892
+    #      episode:      272,862
+    #      tv movie:      53,269
+    #      tv series:     37,065
+    #      video movie:   44,062
+    #      tv mini series: 4,757
+    #      video game:     4,472
+    #      More up-to-date statistics: http://us.imdb.com/database_statistics
+    if title.endswith('(TV)'):
+        kind = 'tv movie'
+        title = title[:-4].rstrip()
+    elif title.endswith('(V)'):
+        kind = 'video movie'
+        title = title[:-3].rstrip()
+    elif title.endswith('(mini)'):
+        kind = 'tv mini series'
+        title = title[:-6].rstrip()
+    elif title.endswith('(VG)'):
+        kind = 'video game'
+        title = title[:-4].rstrip()
+    # Search for the year and the optional imdbIndex (a roman number).
+    yi = re_year_index.findall(title)
+    if yi:
+        last_yi = yi[-1]
+        year = last_yi[0]
+        if last_yi[1]:
+            imdbIndex = last_yi[1][1:]
+            year = year[:-len(imdbIndex)-1]
+        i = title.rfind('(%s)' % last_yi[0])
+        if i != -1:
+            title = title[:i].rstrip()  # not i-1: i may be 0 (no title at all).
+    # This is a tv (mini) series: strip the '"' at the begin and at the end.
+    # XXX: strip('"') is not used for compatibility with Python 2.0.
+    if title and title[0] == title[-1] == '"':
+        if not kind:
+            kind = 'tv series'
+        title = title[1:-1].strip()
+    if not title:
+        raise IMDbParserError, 'invalid title: "%s"' % original_t
+    if canonical:
+        title = canonicalTitle(title)
+    # 'kind' is one in ('movie', 'episode', 'tv series', 'tv mini series',
+    #                   'tv movie', 'video movie', 'video game')
+    result['title'] = title
+    result['kind'] = kind or 'movie'
+    if year and year != '????':
+        result['year'] = str(year)
+    if imdbIndex:
+        result['imdbIndex'] = str(imdbIndex)
+    return result
+
+
+_web_format = '%d %B %Y'
+_ptdf_format = '(%Y-%m-%d)'
+def _convertTime(title, fromPTDFtoWEB=1):
+    """Convert a date written as in the plain text data files, e.g.
+    '(2005-04-12)', to the 'Episode dated 12 April 2005' format used on
+    the web site; if fromPTDFtoWEB is false, the conversion goes the
+    other way round.  A title that can't be parsed is returned as is."""
+    # Format of the whole title of a dated episode, in the web style.
+    web_title_format = 'Episode dated %s' % _web_format
+    try:
+        if not fromPTDFtoWEB:
+            parsed = strptime(title, web_title_format)
+            return strftime(_ptdf_format, parsed)
+        parsed = strptime(title, _ptdf_format)
+        converted = strftime(_web_format, parsed)
+    except ValueError:
+        return title
+    # Drop the leading zero of the day number ('%d' pads it).
+    if converted[0] == '0':
+        converted = converted[1:]
+    return 'Episode dated %s' % converted
+
+
+def build_title(title_dict, canonical=None,
+                canonicalSeries=0, canonicalEpisode=0, ptdf=0, _doYear=1):
+    """Given a dictionary that represents a "long" IMDb title,
+    return it as a string (an empty unicode string if it has no title).
+
+    If canonical is not true, the title is returned in the normal
+    ("The Title") format; canonicalSeries and canonicalEpisode apply to
+    the series and to the episode parts of an episode's title.
+    If ptdf is true, the plain text data files format is used.
+    """
+    if canonical is not None:
+        canonicalSeries = canonical  # NOTE(review): canonicalEpisode not set.
+    pre_title = ''
+    kind = title_dict.get('kind')
+    episode_of = title_dict.get('episode of')
+    if kind == 'episode' and episode_of is not None:
+        # Works with both Movie instances and plain dictionaries.
+        doYear = 0
+        if ptdf:
+            doYear = 1
+        pre_title = build_title(episode_of, canonical=canonicalSeries,
+                                ptdf=0, _doYear=doYear)
+        ep_dict = {'title': title_dict.get('title', ''),
+                    'imdbIndex': title_dict.get('imdbIndex')}
+        ep_title = ep_dict['title']
+        if not ptdf:
+            doYear = 1
+            ep_dict['year'] = title_dict.get('year') or '????'
+            if ep_title[0:1] == '(' and ep_title[-1:] == ')' and \
+                    ep_title[1:5].isdigit():
+                ep_dict['title'] = _convertTime(ep_title, fromPTDFtoWEB=1)
+        else:
+            doYear = 0
+            if ep_title.startswith('Episode dated'):
+                ep_dict['title'] = _convertTime(ep_title, fromPTDFtoWEB=0)
+        episode_title = build_title(ep_dict,
+                            canonical=canonicalEpisode, ptdf=ptdf,
+                            _doYear=doYear)
+        if ptdf:
+            oad = title_dict.get('original air date', '')
+            if len(oad) == 10 and oad[4] == '-' and oad[7] == '-' and \
+                        episode_title.find(oad) == -1:
+                episode_title += ' (%s)' % oad
+            seas = title_dict.get('season')
+            if seas is not None:
+                episode_title += ' (#%s' % seas
+                episode = title_dict.get('episode')
+                if episode is not None:
+                    episode_title += '.%s' % episode
+                episode_title += ')'
+            episode_title = '{%s}' % episode_title
+        return '%s %s' % (pre_title, episode_title)
+    title = title_dict.get('canonical title') or title_dict.get('title', '')
+    if not title: return u''
+    if not canonical:
+        title = normalizeTitle(title)
+    if pre_title:
+        title = '%s %s' % (pre_title, title)
+    if kind in ('tv series', 'tv mini series'):
+        title = '"%s"' % title
+    if _doYear:
+        # Append "(year)" or "(year/imdbIndex)"; '????' if there's no year.
+        imdbIndex = title_dict.get('imdbIndex')
+        year = title_dict.get('year') or '????'
+        title += ' (%s' % year
+        if imdbIndex:
+            title += '/%s' % imdbIndex
+        title += ')'
+    if kind:
+        if kind == 'tv movie':
+            title += ' (TV)'
+        elif kind == 'video movie':
+            title += ' (V)'
+        elif kind == 'tv mini series':
+            title += ' (mini)'
+        elif kind == 'video game':
+            title += ' (VG)'
+    return title
+
+
+class _LastC:
+    """Sentinel that compares greater than anything but another _LastC."""
+    def __cmp__(self, other):
+        same_kind = isinstance(other, self.__class__)
+        return int(not same_kind)
+
+_last = _LastC()
+
+def cmpMovies(m1, m2):
+    """cmp-style function for movies: newest year first, then by title and,
+    reversed, by imdbIndex; episodes of the same series go latest first."""
+    # Sort tv series' episodes: first by series, then by season and episode.
+    m1e = m1.get('episode of')
+    m2e = m2.get('episode of')
+    if m1e is not None and m2e is not None:
+        cmp_series = cmpMovies(m1e, m2e)
+        if cmp_series != 0:
+            return cmp_series
+        m1s = m1.get('season')
+        m2s = m2.get('season')
+        if m1s is not None and m2s is not None:
+            if m1s < m2s:
+                return 1
+            elif m1s > m2s:
+                return -1
+            m1p = m1.get('episode')
+            m2p = m2.get('episode')
+            if m1p < m2p:
+                return 1
+            elif m1p > m2p:
+                return -1
+    if m1e is None: m1y = int(m1.get('year', 0))
+    else: m1y = int(m1e.get('year', 0))
+    if m2e is None: m2y = int(m2.get('year', 0))
+    else: m2y = int(m2e.get('year', 0))
+    if m1y > m2y: return -1
+    if m1y < m2y: return 1
+    # Ok, these movies have the same production year...
+    m1t = m1.get('canonical title', _last)
+    m2t = m2.get('canonical title', _last)
+    # It should also work with normal dictionaries (returned from searches).
+    if m1t is _last and m2t is _last:
+        m1t = m1.get('title', _last)
+        m2t = m2.get('title', _last)
+    if m1t < m2t: return -1
+    if m1t > m2t: return 1
+    # Ok, these movies have the same title...
+    m1i = m1.get('imdbIndex', _last)
+    m2i = m2.get('imdbIndex', _last)
+    if m1i > m2i: return -1
+    if m1i < m2i: return 1
+    return 0
+
+
+def cmpPeople(p1, p2):
+    """Compare two people by billingPos (None sorts last), name and imdbIndex."""
+    p1b = p1.billingPos
+    if p1b is None: p1b = _last
+    p2b = p2.billingPos
+    if p2b is None: p2b = _last
+    if p1b > p2b: return 1
+    if p1b < p2b: return -1
+    p1n = p1.get('canonical name', _last)
+    p2n = p2.get('canonical name', _last)
+    if p1n is _last and p2n is _last:  # neither has it: fall back on 'name'.
+        p1n = p1.get('name', _last)
+        p2n = p2.get('name', _last)
+    if p1n > p2n: return 1
+    if p1n < p2n: return -1
+    p1i = p1.get('imdbIndex', _last)
+    p2i = p2.get('imdbIndex', _last)
+    if p1i > p2i: return 1
+    if p1i < p2i: return -1
+    return 0
+
+
+# References to titles and names.
+# XXX: find better regexp!
+re_titleRef = re.compile(r'_(.+?(?: \([0-9\?]{4}(?:/[IVXLCDM]+)?\))?(?: \(mini\)| \(TV\)| \(V\)| \(VG\))?)_ \(qv\)')
+# FIXME: doesn't match persons with ' in the name.
+re_nameRef = re.compile(r"'([^']+?)' \(qv\)")
+
+# Functions used to filter the text strings.
+def modNull(s, titlesRefs, namesRefs):
+    """Identity filter: return s unchanged, ignoring both dictionaries."""
+    return s
+
+def modClearTitleRefs(s, titlesRefs, namesRefs):
+    """Replace every '_Title (year)_ (qv)' reference in s with its bare text."""
+    return re_titleRef.sub(r'\1', s)
+
+def modClearNameRefs(s, titlesRefs, namesRefs):
+    """Replace every "'Name' (qv)" reference in s with the bare name."""
+    return re_nameRef.sub(r'\1', s)
+
+def modClearRefs(s, titlesRefs, namesRefs):
+    """Strip the markup of both title and name references from s."""
+    without_titles = re_titleRef.sub(r'\1', s)
+    return re_nameRef.sub(r'\1', without_titles)
+
+
+def modifyStrings(o, modFunct, titlesRefs, namesRefs):
+    """Return o with modFunct(s, titlesRefs, namesRefs) applied to every
+    string in it; o may be a string, or a list, tuple or dictionary
+    (possibly nested) of strings.  The argument itself is never modified."""
+    if isinstance(o, (UnicodeType, StringType)):
+        return modFunct(o, titlesRefs, namesRefs)
+    if isinstance(o, DictType):
+        # Only the values are filtered; the keys are kept untouched.
+        res = {}
+        for key, value in o.items():
+            res[key] = modifyStrings(value, modFunct, titlesRefs, namesRefs)
+        return res
+    if isinstance(o, (ListType, TupleType)):
+        res = [modifyStrings(v, modFunct, titlesRefs, namesRefs) for v in o]
+        # Tuples can't be changed in place: rebuild one from the new items.
+        if isinstance(o, TupleType): return tuple(res)
+        return res
+    # Neither a string nor a known container: there's nothing to modify.
+    return o
+
+
+def flatten(seq, to_descend=(ListType, DictType, TupleType),
+            yieldDictKeys=0, scalar=None):
+    """Generator that walks nested lists, tuples and dictionaries and
+    yields the items found in them; to_descend is a type or a tuple of
+    types to be considered non-scalar (i.e. to be descended into); if
+    yieldDictKeys is true, the keys of a dictionary are yielded too,
+    before its values; if scalar is not None, only items of the given
+    type(s) are yielded."""
+    if not isinstance(seq, to_descend):
+        # A scalar: yield it, unless the 'scalar' filter rules it out.
+        if scalar is None or isinstance(seq, scalar):
+            yield seq
+        return
+    if isinstance(seq, DictType):
+        # All the keys first (if requested), then all the values.
+        children = seq.values()
+        if yieldDictKeys:
+            children = seq.keys() + children
+    else:
+        children = seq
+    # Recurse into every child, relaying whatever it yields.
+    for child in children:
+        nested = flatten(child, to_descend=to_descend,
+                        yieldDictKeys=yieldDictKeys, scalar=scalar)
+        for item in nested:
+            yield item
+
+
+class _Container:
+    """Base class for Movie and Person classes."""
+     # The default sets of information retrieved.
+    default_info = ()
+
+    # Aliases for some not-so-intuitive keys.
+    keys_alias = {}
+
+    # List of keys to modify.
+    keys_tomodify_list = ()
+
+    cmpFunct = None
+
+    def __init__(self, myID=None, data=None, currentRole=u'', notes=u'',
+                accessSystem=None, titlesRefs=None, namesRefs=None,
+                modFunct=None, *args, **kwds):
+        """Initialize a Movie or a Person object.
+        *myID* -- your personal identifier for this object.
+        *data* -- a dictionary used to initialize the object.
+        *currentRole* -- a string representing the current role or duty
+                        of a person in this/a movie.
+        *notes* -- notes for the person referred in the currentRole
+                    attribute; e.g.: '(voice)' or the alias used in the
+                    movie credits.
+        *accessSystem* -- a string representing the data access system used.
+        *titlesRefs* -- a dictionary with references to movies.
+        *namesRefs* -- a dictionary with references to persons.
+        *modFunct* -- function called returning text fields.
+        """
+        self.reset()
+        self.myID = myID
+        if data is None: data = {}
+        self.set_data(data, override=1)
+        self.currentRole = currentRole
+        self.notes = notes
+        self.accessSystem = accessSystem
+        if titlesRefs is None: titlesRefs = {}
+        self.update_titlesRefs(titlesRefs)
+        if namesRefs is None: namesRefs = {}
+        self.update_namesRefs(namesRefs)
+        self.set_mod_funct(modFunct)
+        self.keys_tomodify = {}
+        for item in self.keys_tomodify_list:
+            self.keys_tomodify[item] = None
+        self._init(*args, **kwds)
+
+    def _init(self, **kwds): pass
+
+    def reset(self):
+        """Reset the object."""
+        self.data = {}
+        self.myID = None
+        self.currentRole = u''
+        self.notes = u''
+        self.titlesRefs = {}
+        self.namesRefs = {}
+        self.modFunct = modClearRefs
+        self.current_info = []
+        self._reset()
+
+    def _reset(self): pass
+
+    def clear(self):
+        """Reset the dictionary."""
+        self.data.clear()
+        self.currentRole = u''
+        self.notes = u''
+        self.titlesRefs = {}
+        self.namesRefs = {}
+        self.current_info = []
+        self._clear()
+
+    def _clear(self): pass
+
+    def get_current_info(self):
+        """Return the current set of information retrieved."""
+        return self.current_info
+
+    def set_current_info(self, ci):
+        """Set the current set of information retrieved."""
+        self.current_info = ci
+
+    def add_to_current_info(self, val):
+        """Add a set of information to the current list."""
+        if val not in self.current_info:
+            self.current_info.append(val)
+
+    def has_current_info(self, val):
+        """Return true if the given set of information is in the list."""
+        return val in self.current_info
+
+    def set_mod_funct(self, modFunct):
+        """Set the function used to modify the strings (None: modClearRefs)."""
+        if modFunct is None: modFunct = modClearRefs
+        self.modFunct = modFunct
+
+    def update_titlesRefs(self, titlesRefs):
+        """Update the dictionary with the references to movies."""
+        self.titlesRefs.update(titlesRefs)
+
+    def get_titlesRefs(self):
+        """Return the dictionary with the references to movies."""
+        return self.titlesRefs
+
+    def update_namesRefs(self, namesRefs):
+        """Update the dictionary with the references to names."""
+        self.namesRefs.update(namesRefs)
+
+    def get_namesRefs(self):
+        """Return the dictionary with the references to names."""
+        return self.namesRefs
+
+    def set_data(self, data, override=0):
+        """Set the movie data to the given dictionary; if 'override' is
+        set, the previous data is removed, otherwise the two dictionary
+        are merged.
+        """
+        if not override:
+            self.data.update(data)
+        else:
+            self.data = data
+
+    def getID(self):
+        """Return movie or person ID."""
+        raise NotImplementedError, 'override this method'
+
+    def __cmp__(self, other):
+        """Compare two Movie or Person objects."""
+        # XXX: raise an exception?
+        if self.cmpFunct is None: return -1
+        if not isinstance(other, self.__class__): return -1
+        return self.cmpFunct(other)
+
+    def __hash__(self):
+        """Hash 'accessSystem:ID' when both are known, else repr(self)."""
+        # XXX: does it always work correctly?
+        theID = self.getID()
+        if theID is not None and self.accessSystem not in ('UNKNOWN', None):
+            s4h = '%s:%s' % (self.accessSystem, theID)
+        else:
+            s4h = repr(self)  # fallback: ID or access system is unknown
+        return hash(s4h)
+
+    def isSame(self, other):  # sameness test based on class and hash
+        if not isinstance(other, self.__class__): return 0  # wrong class
+        if hash(self) == hash(other): return 1  # equal hashes
+        return 0
+
+    def __len__(self):  # number of entries in self.data
+        return len(self.data)
+
+    def _getitem(self, key):
+        """Handle special keys; this version always returns None."""
+        return None
+
+    def __getitem__(self, key):
+        """Return the value for a given key, trying self._getitem() first
+        and then key aliases; KeyError is raised if the key is not found.
+        """
+        value = self._getitem(key)
+        if value is not None: return value  # handled as a special key
+        # Handle key aliases.
+        key = self.keys_alias.get(key, key)
+        rawData = self.data[key]
+        if self.keys_tomodify.has_key(key) and \
+                self.modFunct not in (None, modNull):
+            return modifyStrings(rawData, self.modFunct, self.titlesRefs,
+                                self.namesRefs)
+        return rawData  # returned as stored, without modFunct applied
+
+    def __setitem__(self, key, item):
+        """Store 'item' in self.data under 'key' (no alias lookup)."""
+        self.data[key] = item
+
+    def __delitem__(self, key):
+        """Remove the entry stored under 'key' from self.data."""
+        # XXX: how to remove an item of a section?
+        del self.data[key]
+
+    def _additional_keys(self):
+        """Valid keys to append to the data.keys() list; none here."""
+        return []
+
+    def keys(self):
+        """Return the keys of self.data followed by _additional_keys()."""
+        return self.data.keys() + self._additional_keys()
+
+    def items(self):
+        """Return a list of (key, self.get(key)) pairs for self.keys()."""
+        return [(k, self.get(k)) for k in self.keys()]
+
+    # XXX: implement!
+    ##def iteritems(self): return self.data.iteritems()
+    ##def iterkeys(self): return self.data.iterkeys()
+    ##def itervalues(self): return self.data.itervalues()
+
+    def values(self):
+        """Return a list of self.get(key) for every key in self.keys()."""
+        return [self.get(k) for k in self.keys()]
+
+    def has_key(self, key):
+        """Return 1 if __getitem__ finds 'key', 0 if it raises KeyError."""
+        try:
+            self.__getitem__(key)
+        except KeyError:
+            return 0
+        return 1
+
+    # XXX: really useful???
+    #      consider also that this will confuse people who meant to
+    #      call ia.update(movieObject, 'data set') instead.
+    def update(self, dict):  # merge the given mapping into self.data
+        self.data.update(dict)
+
+    def get(self, key, failobj=None):
+        """Return self[key], or 'failobj' if the lookup raises KeyError."""
+        try:
+            return self.__getitem__(key)
+        except KeyError:
+            return failobj
+
+    def setdefault(self, key, failobj=None):  # store failobj if key absent
+        if not self.has_key(key):
+            self[key] = failobj
+        return self[key]  # looked up again through __getitem__
+
+    def pop(self, key, *args):  # delegates to self.data.pop()
+        return self.data.pop(key, *args)
+
+    def popitem(self):  # delegates to self.data.popitem()
+        return self.data.popitem()
+
+    def __repr__(self):
+        """String representation of an object; this version always raises."""
+        raise NotImplementedError, 'override this method'
+
+    def __str__(self):
+        """Movie title or person name; this version always raises."""
+        raise NotImplementedError, 'override this method'
+
+    def __contains__(self, key):  # placeholder: this version always raises
+        raise NotImplementedError, 'override this method'
+
+    def append_item(self, key, item):
+        """Append 'item' to the list self.data[key], created if missing."""
+        self.data.setdefault(key, []).append(item)
+
+    def set_item(self, key, item):
+        """Directly store the item with the given key in self.data."""
+        self.data[key] = item
+
+    def __nonzero__(self):
+        """Return 1 if self.data contains something, 0 otherwise."""
+        if self.data: return 1
+        return 0
+
+    def __deepcopy__(self, memo):  # placeholder: this version always raises
+        raise NotImplementedError, 'override this method'
+
+    def copy(self):
+        """Return a deep copy of the object itself, made with deepcopy()."""
+        return deepcopy(self)
+
+