# -*- coding: utf-8 -*-
# -*- Mode: Python; -*-
"""
utils module (imdb package).

This module provides basic utilities for the imdb package.

Copyright 2004-2006 Davide Alberani

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
"""

# Provenance -- this module was added to scrapeit as scrapeit/imdbpy_utils.py
# by the following patch (indentation of the quoted hunk is reconstructed):
#
#   From 50e9f3e8191ead815a03ce1e9e9a77df59e3a184 Mon Sep 17 00:00:00 2001
#   From: j <0x006A@0x2620.org>
#   Date: Fri, 10 Aug 2007 12:19:11 +0000
#   Subject: [PATCH] normalizeTitle
#   ---
#    scrapeit/imdb.py         |   9 +-
#    scrapeit/imdbpy_utils.py | 899 +++++++++++++++++++++++++++++++++++++++
#    2 files changed, 904 insertions(+), 4 deletions(-)
#    create mode 100644 scrapeit/imdbpy_utils.py
#
#   diff --git a/scrapeit/imdb.py b/scrapeit/imdb.py
#   index fbd023a..9a5a7b0 100644
#   --- a/scrapeit/imdb.py
#   +++ b/scrapeit/imdb.py
#   @@ -15,6 +15,7 @@ from utils import stripTags, htmldecode
#    import utils
#    import chardet
#   +import imdbpy_utils
#    cache_base = "/var/cache/scrapeit/cache/"
#   @@ -183,18 +184,18 @@ class IMDb:
#          title = title.replace(t, '')
#        if title.find(u'\xa0') > -1:
#          title = title[:title.find(u'\xa0')]
#   -    title = title.strip()
#   +    title = imdbpy_utils.normalizeTitle(title.strip())
#        if title.startswith('"') and title.endswith('"'):
#   -      title = title[1:-1]
#   +      title = imdbpy_utils.normalizeTitle(title[1:-1])
#        elif title.startswith('"') and title.find('"',1) > 0 and \
#            title.find('"',1) == title.rfind('"'):
#          se = re.compile("Season (\d*), Episode (\d*)\)").findall(data)
#          if se:
#            se = se[0]
#            se = ' (S%02dE%02d)' % (int(se[0]), int(se[1]))
#   -        title = title[1:title.rfind('"')] + se + title[title.rfind('"')+1:]
#   +        title = imdbpy_utils.normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:]
#          else:
#   -        title = title[1:title.rfind('"')] + ':' + title[title.rfind('"')+1:]
#   +        title = imdbpy_utils.normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:]
#        return title
#      def parseYear(self):
#
#   diff --git a/scrapeit/imdbpy_utils.py b/scrapeit/imdbpy_utils.py
#   new file mode 100644
#   index 0000000..87fe513

from __future__ import generators
import re
try:
    from types import UnicodeType, StringType, ListType, TupleType, DictType
except ImportError:
    # Python 3: the types module no longer exports these aliases; keep the
    # names available for the rest of the module.
    UnicodeType = StringType = str
    ListType, TupleType, DictType = list, tuple, dict
from copy import copy, deepcopy
from time import strptime, strftime

#from imdb._exceptions import IMDbParserError


class IMDbParserError(Exception):
    """Raised when a name or a title string cannot be parsed.

    Defined here because the import from imdb._exceptions above is
    commented out, which left the name undefined at the raise sites.
    """


# The regular expression for the "long" year format of IMDb, like
# "(1998)" and "(1986/II)", where the optional roman number (that I call
# "imdbIndex") after the slash is used for movies with the same title
# and year of release.
# XXX: probably L, C, D and M are far too much! ;-)
re_year_index = re.compile(r'\(([0-9\?]{4}(/[IVXLCDM]+)?)\)')

# Match only the imdbIndex (for name strings).
re_index = re.compile(r'^\(([IVXLCDM]+)\)$')

# Match the number of episodes.
re_episodes = re.compile(r'\s?\((\d+) episodes\)', re.I)

re_episode_info = re.compile(r'{(.+?)?\s?(\([0-9\?]{4}-[0-9\?]{1,2}-[0-9\?]{1,2}\))?\s?(\(#[0-9]+\.[0-9]+\))?}')

# Common suffixes in surnames.
_sname_suffixes = ('de', 'la', 'der', 'den', 'del', 'y', 'da', 'van',
                   'e', 'von', 'the', 'di', 'du', 'el', 'al')


def canonicalName(name):
    """Return the given name in canonical "Surname, Name" format.

    It assumes that name is in the 'Name Surname' format.  A name that
    already contains ', ' is returned unchanged, as is a single word.
    """
    # XXX: some statistics (over 1852406 names):
    #      - just a surname:                 51921
    #      - single surname, single name:    1792759
    #      - composed surname, composed name: 7726
    #      - composed surname, single name:  55623
    #        (2: 49259, 3: 5502, 4: 551)
    #      - single surname, composed name:  186604
    #        (2: 178315, 3: 6573, 4: 1219, 5: 352)
    # Don't convert names already in the canonical format.
    if name.find(', ') != -1:
        return name
    sname = name.split(' ')
    snl = len(sname)
    if snl == 2:
        # Just a name and a surname: how boring...
        name = '%s, %s' % (sname[1], sname[0])
    elif snl > 2:
        lsname = [x.lower() for x in sname]
        if snl == 3:
            _indexes = (0, snl-2)
        else:
            _indexes = (0, snl-2, snl-3)
        # Check for common surname prefixes at the beginning and near the end.
        for index in _indexes:
            if lsname[index] not in _sname_suffixes:
                continue
            try:
                # Build the surname.
                surn = '%s %s' % (sname[index], sname[index+1])
                del sname[index]
                del sname[index]
                try:
                    # Handle the "Jr." after the name; lsname keeps the
                    # original positions, sname has already been shortened.
                    if lsname[index+2].startswith('jr'):
                        surn += ' %s' % sname[index]
                        del sname[index]
                except (IndexError, ValueError):
                    pass
                name = '%s, %s' % (surn, ' '.join(sname))
                break
            except ValueError:
                continue
        else:
            # No known prefix found: the last word is the surname.
            name = '%s, %s' % (sname[-1], ' '.join(sname[:-1]))
    return name


def normalizeName(name):
    """Return a name in the normal "Name Surname" format."""
    sname = name.split(', ')
    if len(sname) == 2:
        name = '%s %s' % (sname[1], sname[0])
    return name


def analyze_name(name, canonical=0):
    """Return a dictionary with the name and the optional imdbIndex
    keys, from the given string.

    If canonical is true, it tries to convert the name to
    the canonical "Surname, Name" format.

    raise an IMDbParserError exception if the name is not valid
    (i.e. nothing is left once whitespace and the imdbIndex are removed).
    """
    original_n = name
    name = name.strip()
    res = {}
    imdbIndex = ''
    opi = name.rfind('(')
    if opi != -1:
        cpi = name.rfind(')')
        # Only a trailing "(ROMAN)" group is taken as the imdbIndex.
        if cpi > opi and re_index.match(name[opi:cpi+1]):
            imdbIndex = name[opi+1:cpi]
            name = name[:opi].rstrip()
    if not name:
        raise IMDbParserError('invalid name: "%s"' % original_n)
    if canonical:
        name = canonicalName(name)
    res['name'] = name
    if imdbIndex:
        res['imdbIndex'] = imdbIndex
    return res


def build_name(name_dict, canonical=0):
    """Given a dictionary that represents a "long" IMDb name,
    return a string.

    If canonical is not set, the name is returned in the normal
    "Name Surname" format.  An empty string is returned when the
    dictionary has neither a 'canonical name' nor a 'name'.
    """
    name = name_dict.get('canonical name') or name_dict.get('name', '')
    if not name:
        return u''
    if not canonical:
        name = normalizeName(name)
    imdbIndex = name_dict.get('imdbIndex')
    if imdbIndex:
        name += ' (%s)' % imdbIndex
    return name


# List of articles.
# XXX: Managing titles in a lot of different languages, a function to recognize
# an initial article can't be perfect; sometimes we'll stumble upon a short
# word that is an article in some language, but it's not in another; in these
# situations we have to choose if we want to interpret this little word
# as an article or not (remember that we don't know what the original language
# of the title was).
# Example: 'da' is an article in (I think) Dutch and it's used as an article
# even in some American slangs.  Unfortunately it's also a preposition in
# Italian, and it's widely used in Mandarin (for whatever it means!).
# Running a script over the whole list of titles (and aliases), I've found
# that 'da' is used as an article only 20 times, and as another thing 255
# times, so I've decided to _always_ consider 'da' as a non article.
#
# Here is a list of words that are _never_ considered as articles, complete
# with the count of times they are used in a way or another:
# 'en' (314 vs 507), 'to' (236 vs 589), 'as' (183 vs 231), 'et' (67 vs 79),
# 'des' (69 vs 123), 'al' (57 vs 247), 'egy' (28 vs 32), 'ye' (14 vs 55),
# 'da' (20 vs 255), "'n" (7 vs 12)
#
# I've left in the list 'i' (1614 vs 1707) and 'uno' (49 vs 51)
# I'm not sure what '-al' is, and so I've left it out...
#
# List of articles:
_articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el',
             "l'", 'il', 'das', 'les', 'o', 'ein', 'i', 'un', 'los', 'de',
             'an', 'una', 'las', 'eine', 'den', 'gli', 'het', 'os', 'lo',
             'az', 'det', 'ha-', 'een', 'ang', 'oi', 'ta', 'al-', 'dem',
             'mga', 'uno', "un'", 'ett', u'\xcf', 'eines', u'\xc7', 'els',
             u'\xd4\xef', u'\xcf\xe9')

# Articles in a dictionary.
_articlesDict = dict([(x, x) for x in _articles])
# Articles as they appear at the start of a title: followed by a space,
# unless they end with an apostrophe or a dash.
_spArticles = []
for article in _articles:
    if article[-1] not in ("'", '-'):
        article += ' '
    _spArticles.append(article)


def canonicalTitle(title):
    """Return the title in the canonic format 'Movie Title, The'.

    A title whose last ', '-separated part is already an article is
    returned unchanged.
    """
    if title.split(', ')[-1].lower() in _articlesDict:
        return title
    ltitle = title.lower()
    for article in _spArticles:
        if ltitle.startswith(article):
            lart = len(article)
            title = '%s, %s' % (title[lart:], title[:lart])
            # Drop the space that followed the article in the original.
            if article[-1] == ' ':
                title = title[:-1]
            break
    return title


def normalizeTitle(title):
    """Return the title in the normal "The Title" format."""
    stitle = title.split(', ')
    if len(stitle) > 1 and stitle[-1].lower() in _articlesDict:
        sep = ' '
        # Articles like "L'" and "al-" are glued to the following word.
        if stitle[-1][-1] in ("'", '-'):
            sep = ''
        title = '%s%s%s' % (stitle[-1], sep, ', '.join(stitle[:-1]))
    return title


def _split_series_episode(title):
    """Return the series and the episode titles; if this is not a
    series' episode, the returned series title is empty.

    This function recognizes two different styles:
      "The Series" An Episode (2005)
      "The Series" (2004) {An Episode (2005) (#season.episode)}
    """
    series_title = ''
    episode_or_year = ''
    if title[-1:] == '}':
        # Title of the episode, as in the plain text data files.
        begin_eps = title.rfind('{')
        if begin_eps == -1:
            return '', ''
        series_title = title[:begin_eps].rstrip()
        # episode_or_year is returned with the {...}
        episode_or_year = title[begin_eps:]
        if episode_or_year[:12] == '{SUSPENDED}}':
            return '', ''
    # XXX: works only with tv series; it's still unclear whether
    #      IMDb will support episodes for tv mini series and tv movies...
    elif title[0:1] == '"':
        second_quot = title[1:].find('"') + 2
        if second_quot != 1:  # a second " was found.
            episode_or_year = title[second_quot:].lstrip()
            first_char = episode_or_year[0:1]
            if not first_char:
                return '', ''
            if first_char != '(':
                # There is not a (year) but the title of the episode;
                # that means this is an episode title, as returned by
                # the web server.
                series_title = title[:second_quot]
    return series_title, episode_or_year


def is_series_episode(title):
    """Return 1 if 'title' is a series episode, 0 otherwise."""
    title = title.strip()
    if _split_series_episode(title)[0]:
        return 1
    return 0


def analyze_title(title, canonical=None,
                  canonicalSeries=0, canonicalEpisode=0):
    """Analyze the given title and return a dictionary with the
    "stripped" title, the kind of the show ("movie", "tv series", etc.),
    the year of production and the optional imdbIndex (a roman number
    used to distinguish between movies with the same title and year).

    If canonical is true, the title is converted to the canonical
    format; when given, it overrides both canonicalSeries and
    canonicalEpisode.

    raise an IMDbParserError exception if the title is not valid.
    """
    # NOTE(review): this function uses re_year_index, re_episode_info and
    # IMDbParserError, which are expected to exist at module level.
    if canonical is not None:
        canonicalSeries = canonicalEpisode = canonical
    original_t = title
    result = {}
    title = title.strip()
    year = ''
    kind = ''
    imdbIndex = ''
    series_title, episode_or_year = _split_series_episode(title)
    if series_title:
        # It's an episode of a series; the series title honours
        # canonicalSeries (it used to be parsed with canonicalEpisode).
        series_d = analyze_title(series_title, canonical=canonicalSeries)
        oad = sen = ep_year = ''
        # Plain text data files format.
        if episode_or_year[0:1] == '{' and episode_or_year[-1:] == '}':
            match = re_episode_info.findall(episode_or_year)
            if match:
                # Episode title, original air date and #season.episode
                episode_or_year, oad, sen = match[0]
                if not oad:
                    # No year, but the title is something like (2005-04-12)
                    if episode_or_year and episode_or_year[0] == '(' and \
                            episode_or_year[-1:] == ')' and \
                            episode_or_year[1:2] != '#':
                        oad = episode_or_year
                        if oad[1:5] and oad[5:6] == '-':
                            ep_year = oad[1:5]
                if not oad and not sen and episode_or_year.startswith('(#'):
                    sen = episode_or_year
        elif episode_or_year.startswith('Episode dated'):
            oad = episode_or_year[14:]
            if oad[-4:].isdigit():
                ep_year = oad[-4:]
        episode_d = analyze_title(episode_or_year, canonical=canonicalEpisode)
        episode_d['kind'] = 'episode'
        episode_d['episode of'] = series_d
        if oad:
            # Only the "(YYYY-MM-DD)" form carries parentheses to strip; the
            # web form "12 April 2005" must be stored untouched.
            if oad[:1] == '(' and oad[-1:] == ')':
                oad = oad[1:-1]
            episode_d['original air date'] = oad
            if ep_year and episode_d.get('year') is None:
                episode_d['year'] = ep_year
        if sen:
            seas, epn = sen[2:-1].split('.')
            if seas:
                # Set season and episode; keep the strings if not numeric.
                try:
                    seas = int(seas)
                except (TypeError, ValueError):
                    pass
                try:
                    epn = int(epn)
                except (TypeError, ValueError):
                    pass
                episode_d['season'] = seas
                episode_d['episode'] = epn
        return episode_d
    # First of all, search for the kind of show.
    # XXX: Number of entries at 18 Mar 2006:
    #      movie:          344,892
    #      episode:        272,862
    #      tv movie:        53,269
    #      tv series:       37,065
    #      video movie:     44,062
    #      tv mini series:   4,757
    #      video game:       4,472
    # More up-to-date statistics: http://us.imdb.com/database_statistics
    if title.endswith('(TV)'):
        kind = 'tv movie'
        title = title[:-4].rstrip()
    elif title.endswith('(V)'):
        kind = 'video movie'
        title = title[:-3].rstrip()
    elif title.endswith('(mini)'):
        kind = 'tv mini series'
        title = title[:-6].rstrip()
    elif title.endswith('(VG)'):
        kind = 'video game'
        title = title[:-4].rstrip()
    # Search for the year and the optional imdbIndex (a roman number).
    yi = re_year_index.findall(title)
    if yi:
        last_yi = yi[-1]
        year = last_yi[0]
        if last_yi[1]:
            imdbIndex = last_yi[1][1:]
            year = year[:-len(imdbIndex)-1]
        i = title.rfind('(%s)' % last_yi[0])
        if i != -1:
            # Cut right at the parenthesis: title[:i-1] dropped a real
            # character when no space preceded the year, and wrapped
            # around to title[:-1] when i was 0.
            title = title[:i].rstrip()
    # This is a tv (mini) series: strip the '"' at the begin and at the end.
    # XXX: strip('"') is not used for compatibility with Python 2.0.
    if title and title[0] == title[-1] == '"':
        if not kind:
            kind = 'tv series'
        title = title[1:-1].strip()
    if not title:
        raise IMDbParserError('invalid title: "%s"' % original_t)
    if canonical:
        title = canonicalTitle(title)
    # 'kind' is one in ('movie', 'episode', 'tv series', 'tv mini series',
    #                   'tv movie', 'video movie', 'video game')
    result['title'] = title
    result['kind'] = kind or 'movie'
    if year and year != '????':
        result['year'] = str(year)
    if imdbIndex:
        result['imdbIndex'] = str(imdbIndex)
    return result


_web_format = '%d %B %Y'
_ptdf_format = '(%Y-%m-%d)'
def _convertTime(title, fromPTDFtoWEB=1):
    """Convert a time expressed in the plain text data files, to
    the 'Episode dated ...' format used on the web site; if
    fromPTDFtoWEB is false, the inverted conversion is applied.

    The title is returned unchanged when it does not parse as a date.
    """
    try:
        if fromPTDFtoWEB:
            from_format = _ptdf_format
            to_format = _web_format
        else:
            from_format = 'Episode dated %s' % _web_format
            to_format = _ptdf_format
        t = strptime(title, from_format)
        title = strftime(to_format, t)
        if fromPTDFtoWEB:
            # Drop the leading zero of the day.
            if title[0] == '0':
                title = title[1:]
            title = 'Episode dated %s' % title
    except ValueError:
        pass
    return title


def build_title(title_dict, canonical=None,
                canonicalSeries=0, canonicalEpisode=0, ptdf=0, _doYear=1):
    """Given a dictionary that represents a "long" IMDb title,
    return a string.

    If canonical is not true, the title is returned in the
    normal format.

    If ptdf is true, the plain text data files format is used.
    """
    # NOTE(review): unlike analyze_title, 'canonical' only overrides
    # canonicalSeries here, not canonicalEpisode -- confirm this is intended.
    if canonical is not None:
        canonicalSeries = canonical
    pre_title = ''
    kind = title_dict.get('kind')
    episode_of = title_dict.get('episode of')
    if kind == 'episode' and episode_of is not None:
        # Works with both Movie instances and plain dictionaries.
        doYear = 0
        if ptdf:
            doYear = 1
        pre_title = build_title(episode_of, canonical=canonicalSeries,
                                ptdf=0, _doYear=doYear)
        ep_dict = {'title': title_dict.get('title', ''),
                   'imdbIndex': title_dict.get('imdbIndex')}
        ep_title = ep_dict['title']
        if not ptdf:
            doYear = 1
            ep_dict['year'] = title_dict.get('year') or '????'
            if ep_title[0:1] == '(' and ep_title[-1:] == ')' and \
                    ep_title[1:5].isdigit():
                ep_dict['title'] = _convertTime(ep_title, fromPTDFtoWEB=1)
        else:
            doYear = 0
            if ep_title.startswith('Episode dated'):
                ep_dict['title'] = _convertTime(ep_title, fromPTDFtoWEB=0)
        episode_title = build_title(ep_dict,
                                    canonical=canonicalEpisode, ptdf=ptdf,
                                    _doYear=doYear)
        if ptdf:
            oad = title_dict.get('original air date', '')
            # Append the air date only if it looks like YYYY-MM-DD and it
            # is not already part of the episode title.
            if len(oad) == 10 and oad[4] == '-' and oad[7] == '-' and \
                    episode_title.find(oad) == -1:
                episode_title += ' (%s)' % oad
            seas = title_dict.get('season')
            if seas is not None:
                episode_title += ' (#%s' % seas
                episode = title_dict.get('episode')
                if episode is not None:
                    episode_title += '.%s' % episode
                episode_title += ')'
            episode_title = '{%s}' % episode_title
        return '%s %s' % (pre_title, episode_title)
    title = title_dict.get('canonical title') or title_dict.get('title', '')
    if not title:
        return u''
    if not canonical:
        title = normalizeTitle(title)
    if pre_title:
        title = '%s %s' % (pre_title, title)
    if kind in ('tv series', 'tv mini series'):
        title = '"%s"' % title
    if _doYear:
        imdbIndex = title_dict.get('imdbIndex')
        year = title_dict.get('year') or '????'
        title += ' (%s' % year
        if imdbIndex:
            title += '/%s' % imdbIndex
        title += ')'
    if kind:
        if kind == 'tv movie':
            title += ' (TV)'
        elif kind == 'video movie':
            title += ' (V)'
        elif kind == 'tv mini series':
            title += ' (mini)'
        elif kind == 'video game':
            title += ' (VG)'
    return title


class _LastC:
    """Size matters: a sentinel that sorts after anything else."""
    def __cmp__(self, other):
        if isinstance(other, self.__class__): return 0
        return 1

    # Rich comparisons with the same ordering as __cmp__, so that the
    # sentinel also works where __cmp__ is not consulted (Python 3).
    def __eq__(self, other):
        return isinstance(other, self.__class__)

    def __ne__(self, other):
        return not isinstance(other, self.__class__)

    def __lt__(self, other):
        return False

    def __le__(self, other):
        return isinstance(other, self.__class__)

    def __gt__(self, other):
        return not isinstance(other, self.__class__)

    def __ge__(self, other):
        return True

_last = _LastC()


def cmpMovies(m1, m2):
    """Compare two movies by year, in reverse order; the imdbIndex is checked
    for movies with the same year of production and title.

    Returns -1, 0 or 1, like the Python 2 cmp() function.
    """
    # Sort tv series' episodes.
    m1e = m1.get('episode of')
    m2e = m2.get('episode of')
    if m1e is not None and m2e is not None:
        cmp_series = cmpMovies(m1e, m2e)
        if cmp_series != 0:
            return cmp_series
        m1s = m1.get('season')
        m2s = m2.get('season')
        if m1s is not None and m2s is not None:
            if m1s < m2s:
                return 1
            elif m1s > m2s:
                return -1
            # NOTE(review): 'episode' may be missing (None) here; that
            # compares fine on Python 2 only.
            m1p = m1.get('episode')
            m2p = m2.get('episode')
            if m1p < m2p:
                return 1
            elif m1p > m2p:
                return -1
    # For episodes, the year of the series is used.
    if m1e is None: m1y = int(m1.get('year', 0))
    else: m1y = int(m1e.get('year', 0))
    if m2e is None: m2y = int(m2.get('year', 0))
    else: m2y = int(m2e.get('year', 0))
    if m1y > m2y: return -1
    if m1y < m2y: return 1
    # Ok, these movies have the same production year...
    m1t = m1.get('canonical title', _last)
    m2t = m2.get('canonical title', _last)
    # It should work also with normal dictionaries (returned from searches).
    if m1t is _last and m2t is _last:
        m1t = m1.get('title', _last)
        m2t = m2.get('title', _last)
    if m1t < m2t: return -1
    if m1t > m2t: return 1
    # Ok, these movies have the same title...
    m1i = m1.get('imdbIndex', _last)
    m2i = m2.get('imdbIndex', _last)
    if m1i > m2i: return -1
    if m1i < m2i: return 1
    return 0


def cmpPeople(p1, p2):
    """Compare two people by billingPos, name and imdbIndex.

    Both arguments must expose a 'billingPos' attribute and a dict-like
    get() method; a missing value sorts last.
    """
    p1b = p1.billingPos
    if p1b is None: p1b = _last
    p2b = p2.billingPos
    if p2b is None: p2b = _last
    if p1b > p2b: return 1
    if p1b < p2b: return -1
    p1n = p1.get('canonical name', _last)
    p2n = p2.get('canonical name', _last)
    if p1n is _last and p2n is _last:
        p1n = p1.get('name', _last)
        p2n = p2.get('name', _last)
    if p1n > p2n: return 1
    if p1n < p2n: return -1
    p1i = p1.get('imdbIndex', _last)
    p2i = p2.get('imdbIndex', _last)
    if p1i > p2i: return 1
    if p1i < p2i: return -1
    return 0


# References to titles and names.
# XXX: find better regexp!
re_titleRef = re.compile(r'_(.+?(?: \([0-9\?]{4}(?:/[IVXLCDM]+)?\))?(?: \(mini\)| \(TV\)| \(V\)| \(VG\))?)_ \(qv\)')
# FIXME: doesn't match persons with ' in the name.
re_nameRef = re.compile(r"'([^']+?)' \(qv\)")

# Functions used to filter the text strings.
+def modNull(s, titlesRefs, namesRefs): + """Do nothing.""" + return s + +def modClearTitleRefs(s, titlesRefs, namesRefs): + """Remove titles references.""" + return re_titleRef.sub(r'\1', s) + +def modClearNameRefs(s, titlesRefs, namesRefs): + """Remove names references.""" + return re_nameRef.sub(r'\1', s) + +def modClearRefs(s, titlesRefs, namesRefs): + """Remove both titles and names references.""" + s = modClearTitleRefs(s, {}, {}) + return modClearNameRefs(s, {}, {}) + + +def modifyStrings(o, modFunct, titlesRefs, namesRefs): + """Modify a string (or string values in a dictionary or strings + in a list), using the provided modFunct function and titlesRefs + and namesRefs references dictionaries.""" + if isinstance(o, (UnicodeType, StringType)): + return modFunct(o, titlesRefs, namesRefs) + elif isinstance(o, (ListType, TupleType)): + _stillorig = 1 + if isinstance(o, ListType): keys = xrange(len(o)) + else: keys = o.keys() + for i in keys: + v = o[i] + if isinstance(v, (UnicodeType, StringType)): + if _stillorig: + o = copy(o) + _stillorig = 0 + o[i] = modFunct(v, titlesRefs, namesRefs) + elif isinstance(v, (ListType, TupleType)): + modifyStrings(o[i], modFunct, titlesRefs, namesRefs) + return o + + +def flatten(seq, to_descend=(ListType, DictType, TupleType), + yieldDictKeys=0, scalar=None): + """Iterate over nested lists and dictionaries; to_descend is a type + of a tuple of types to be considered non-scalar; if yieldDictKeys is + true, also dictionaries' keys are yielded; if scalar is not None, only + items of the given type(s) are yielded.""" + if not isinstance(seq, to_descend): + if scalar is None or isinstance(seq, scalar): + yield seq + else: + if isinstance(seq, DictType): + if yieldDictKeys: + # Yield also the keys of the dictionary. 
+ for key in seq.iterkeys(): + for k in flatten(key, to_descend=to_descend, + yieldDictKeys=yieldDictKeys, scalar=scalar): + yield k + for value in seq.itervalues(): + for v in flatten(value, to_descend=to_descend, + yieldDictKeys=yieldDictKeys, scalar=scalar): + yield v + else: + for item in seq: + for i in flatten(item, to_descend=to_descend, + yieldDictKeys=yieldDictKeys, scalar=scalar): + yield i + + +class _Container: + """Base class for Movie and Person classes.""" + # The default sets of information retrieved. + default_info = () + + # Aliases for some not-so-intuitive keys. + keys_alias = {} + + # List of keys to modify. + keys_tomodify_list = () + + cmpFunct = None + + def __init__(self, myID=None, data=None, currentRole=u'', notes=u'', + accessSystem=None, titlesRefs=None, namesRefs=None, + modFunct=None, *args, **kwds): + """Initialize a Movie or a Person object. + *myID* -- your personal identifier for this object. + *data* -- a dictionary used to initialize the object. + *currentRole* -- a string representing the current role or duty + of a person in this/a movie. + *notes* -- notes for the person referred in the currentRole + attribute; e.g.: '(voice)' or the alias used in the + movie credits. + *accessSystem* -- a string representing the data access system used. + *titlesRefs* -- a dictionary with references to movies. + *namesRefs* -- a dictionary with references to persons. + *modFunct* -- function called returning text fields. 
+ """ + self.reset() + self.myID = myID + if data is None: data = {} + self.set_data(data, override=1) + self.currentRole = currentRole + self.notes = notes + self.accessSystem = accessSystem + if titlesRefs is None: titlesRefs = {} + self.update_titlesRefs(titlesRefs) + if namesRefs is None: namesRefs = {} + self.update_namesRefs(namesRefs) + self.set_mod_funct(modFunct) + self.keys_tomodify = {} + for item in self.keys_tomodify_list: + self.keys_tomodify[item] = None + self._init(*args, **kwds) + + def _init(self, **kwds): pass + + def reset(self): + """Reset the object.""" + self.data = {} + self.myID = None + self.currentRole = u'' + self.notes = u'' + self.titlesRefs = {} + self.namesRefs = {} + self.modFunct = modClearRefs + self.current_info = [] + self._reset() + + def _reset(self): pass + + def clear(self): + """Reset the dictionary.""" + self.data.clear() + self.currentRole = u'' + self.notes = u'' + self.titlesRefs = {} + self.namesRefs = {} + self.current_info = [] + self._clear() + + def _clear(self): pass + + def get_current_info(self): + """Return the current set of information retrieved.""" + return self.current_info + + def set_current_info(self, ci): + """Set the current set of information retrieved.""" + self.current_info = ci + + def add_to_current_info(self, val): + """Add a set of information to the current list.""" + if val not in self.current_info: + self.current_info.append(val) + + def has_current_info(self, val): + """Return true if the given set of information is in the list.""" + return val in self.current_info + + def set_mod_funct(self, modFunct): + """Set the fuction used to modify the strings.""" + if modFunct is None: modFunct = modClearRefs + self.modFunct = modFunct + + def update_titlesRefs(self, titlesRefs): + """Update the dictionary with the references to movies.""" + self.titlesRefs.update(titlesRefs) + + def get_titlesRefs(self): + """Return the dictionary with the references to movies.""" + return self.titlesRefs + + def 
update_namesRefs(self, namesRefs): + """Update the dictionary with the references to names.""" + self.namesRefs.update(namesRefs) + + def get_namesRefs(self): + """Return the dictionary with the references to names.""" + return self.namesRefs + + def set_data(self, data, override=0): + """Set the movie data to the given dictionary; if 'override' is + set, the previous data is removed, otherwise the two dictionary + are merged. + """ + if not override: + self.data.update(data) + else: + self.data = data + + def getID(self): + """Return movie or person ID.""" + raise NotImplementedError, 'override this method' + + def __cmp__(self, other): + """Compare two Movie or Person objects.""" + # XXX: raise an exception? + if self.cmpFunct is None: return -1 + if not isinstance(other, self.__class__): return -1 + return self.cmpFunct(other) + + def __hash__(self): + """Hash for this object.""" + # XXX: does it always work correctly? + theID = self.getID() + if theID is not None and self.accessSystem not in ('UNKNOWN', None): + s4h = '%s:%s' % (self.accessSystem, theID) + else: + s4h = repr(self) + return hash(s4h) + + def isSame(self, other): + if not isinstance(other, self.__class__): return 0 + if hash(self) == hash(other): return 1 + return 0 + + def __len__(self): + return len(self.data) + + def _getitem(self, key): + """Handle special keys.""" + return None + + def __getitem__(self, key): + """Return the value for a given key, checking key aliases; + a KeyError exception is raised if the key is not found. + """ + value = self._getitem(key) + if value is not None: return value + # Handle key aliases. 
+ key = self.keys_alias.get(key, key) + rawData = self.data[key] + if self.keys_tomodify.has_key(key) and \ + self.modFunct not in (None, modNull): + return modifyStrings(rawData, self.modFunct, self.titlesRefs, + self.namesRefs) + return rawData + + def __setitem__(self, key, item): + """Directly store the item with the given key.""" + self.data[key] = item + + def __delitem__(self, key): + """Remove the given section or key.""" + # XXX: how to remove an item of a section? + del self.data[key] + + def _additional_keys(self): + """Valid keys to append to the data.keys() list.""" + return [] + + def keys(self): + """Return a list of valid keys.""" + return self.data.keys() + self._additional_keys() + + def items(self): + """Return the items in the dictionary.""" + return [(k, self.get(k)) for k in self.keys()] + + # XXX: implement! + ##def iteritems(self): return self.data.iteritems() + ##def iterkeys(self): return self.data.iterkeys() + ##def itervalues(self): return self.data.itervalues() + + def values(self): + """Return the values in the dictionary.""" + return [self.get(k) for k in self.keys()] + + def has_key(self, key): + """Return true if a given section is defined.""" + try: + self.__getitem__(key) + except KeyError: + return 0 + return 1 + + # XXX: really useful??? + # consider also that this will confuse people who meant to + # call ia.update(movieObject, 'data set') instead. 
+ def update(self, dict): + self.data.update(dict) + + def get(self, key, failobj=None): + """Return the given section, or default if it's not found.""" + try: + return self.__getitem__(key) + except KeyError: + return failobj + + def setdefault(self, key, failobj=None): + if not self.has_key(key): + self[key] = failobj + return self[key] + + def pop(self, key, *args): + return self.data.pop(key, *args) + + def popitem(self): + return self.data.popitem() + + def __repr__(self): + """String representation of an object.""" + raise NotImplementedError, 'override this method' + + def __str__(self): + """Movie title or person name.""" + raise NotImplementedError, 'override this method' + + def __contains__(self, key): + raise NotImplementedError, 'override this method' + + def append_item(self, key, item): + """The item is appended to the list identified by the given key.""" + self.data.setdefault(key, []).append(item) + + def set_item(self, key, item): + """Directly store the item with the given key.""" + self.data[key] = item + + def __nonzero__(self): + """Return true if self.data contains something.""" + if self.data: return 1 + return 0 + + def __deepcopy__(self, memo): + raise NotImplementedError, 'override this method' + + def copy(self): + """Return a deep copy of the object itself.""" + return deepcopy(self) + +