diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 0bf1f2d..ff02ef9 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -7,12 +7,13 @@ import os import time import ox -from ox import findRe +from ox import findRe, stripTags from ox.normalize import normalizeTitle, normalizeImdbId from siteparser import SiteParser import google + class Imdb(SiteParser): regex = { 'alternative_titles': { @@ -25,13 +26,17 @@ class Imdb(SiteParser): }, 'cast': { - 'page': 'combined', - 're': '.*?>(.*?).*?(.*?)', + 'page': 'fullcredits', + 're': [ + '.*?>(.*?).*?(.*?)', + lambda ll: [stripTags(l) for l in ll] + ], 'type': 'list' }, 'cinematographers': { - 'page': 'combined', + 'page': 'fullcredits', 're': [ + lambda data: data.split('Series Crew')[0], 'Cinematography by(.*?)', '(.*?)' ], @@ -48,16 +53,18 @@ class Imdb(SiteParser): 'type': 'list' }, 'directors': { - 'page': 'combined', + 'page': 'fullcredits', 're': [ + lambda data: data.split('Series Crew')[0], 'Directed by(.*?)', '(.*?)' ], 'type': 'list' }, 'editors': { - 'page': 'combined', + 'page': 'fullcredits', 're': [ + lambda data: data.split('Series Crew')[0], 'Film Editing by(.*?)', '(.*?)' ], @@ -108,7 +115,7 @@ class Imdb(SiteParser): }, 'rating': { 'page': 'combined', - 're': '
.*?(.*?)/10', + 're': '
.*?([\d,.]?)/10', 'type': 'float' }, 'release_date': { @@ -141,12 +148,13 @@ class Imdb(SiteParser): }, 'votes': { 'page': 'combined', - 're': '(.*?) votes', + 're': '([\d,]*?) votes', 'type': 'string' }, 'writers': { - 'page': 'combined', + 'page': 'fullcredits', 're': [ + lambda data: data.split('Series Crew')[0], 'Writing credits(.*?)', '(.*?)' ], @@ -167,8 +175,8 @@ class Imdb(SiteParser): if 'min' in self['runtime']: base=60 else: base=1 self['runtime'] = int(findRe(self['runtime'], '([0-9]+)')) * base - else: - self['runtime'] = 0 + if 'runtime' in self and not self['runtime']: + del self['runtime'] if 'votes' in self: self['votes'] = self['votes'].replace(',', '') if 'connections' in self: cc={} @@ -179,7 +187,8 @@ class Imdb(SiteParser): self['connections'] = cc for key in ('countries', 'genres'): - self[key] = filter(lambda x: x.lower() != 'home', self[key]) + if key in self: + self[key] = filter(lambda x: x.lower() != 'home', self[key]) def guess(title, director='', timeout=google.DEFAULT_TIMEOUT): @@ -192,7 +201,7 @@ def guess(title, director='', timeout=google.DEFAULT_TIMEOUT): return_url = '' #lest first try google - #i.e. site:imdb.com Michael Stevens Sin + #i.e. site:imdb.com Michael Stevens "Sin" if director: search = 'site:imdb.com %s "%s"' % (director, title) else: diff --git a/ox/web/siteparser.py b/ox/web/siteparser.py index d9fa1dd..2fa4332 100644 --- a/ox/web/siteparser.py +++ b/ox/web/siteparser.py @@ -15,7 +15,7 @@ def cleanup(key, data, data_type): data = [decodeHtml(p).strip() for p in data] elif isinstance(data[0], list) or isinstance(data[0], tuple): data = [cleanup(key, p, data_type) for p in data] - while len(data) == 1: + while len(data) == 1 and not isinstance(data, basestring): data = data[0] if data_type == 'list' and isinstance(data, basestring): data = [data, ] @@ -37,12 +37,18 @@ class SiteParser(dict): if isinstance(self.regex[key]['re'], basestring): data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data) data = cleanup(key, data, self.regex[key]['type']) + elif callable(self.regex[key]['re']): + data = self.regex[key]['re'](data) else: for r in self.regex[key]['re']: - if isinstance(data, basestring): - data = re.compile(r, re.DOTALL).findall(data) + if callable(r): + f = r else: - data = [re.compile(r, re.DOTALL).findall(d) for d in data] + f = re.compile(r, re.DOTALL).findall + if isinstance(data, basestring): + data = f(data) + else: + data = [f(d) for d in data] data = cleanup(key, data, self.regex[key]['type']) def apply_f(f, data): if data and isinstance(data[0], list): @@ -50,12 +56,13 @@ class SiteParser(dict): else: data = f(data) return data - if self.regex[key]['type'] == 'float': + if self.regex[key]['type'] == 'float' and data: data = apply_f(float, data) elif self.regex[key]['type'] == 'int': data = apply_f(int, data) elif self.regex[key]['type'] == 'date': parse_date = lambda d: d and datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d') data = apply_f(parse_date, data) - self[key] = data + if data: + self[key] = data