From da72fbdaedf46e124c3fd75d9ebed168009b060f Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Sat, 22 Sep 2012 21:40:01 +0200 Subject: [PATCH] imdb: return title/internationalTitle, #947; cleanup alternative titles, #963 --- ox/text.py | 1 - ox/web/imdb.py | 61 +++++++++++++++++++++++++++++++------------------- 2 files changed, 38 insertions(+), 24 deletions(-) diff --git a/ox/text.py b/ox/text.py index d2d09a8..1ef7a49 100644 --- a/ox/text.py +++ b/ox/text.py @@ -208,7 +208,6 @@ def get_sort_name(name): >>> get_sort_name('Scorsese, Martin') 'Scorsese, Martin' - """ if not ' ' in name or ', ' in name: return name diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 4cfd419..bb8a7a1 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -232,7 +232,7 @@ class Imdb(SiteParser): 're': '(TV series)', 'type': 'string' }, - 'originalTitle': { + 'title': { 'page': 'combined', 're': '

(.*?) ', 'type': 'string' @@ -283,23 +283,28 @@ class Imdb(SiteParser): time.sleep(1) super(Imdb, self).__init__(0) - #only list one country per alternative title - - def is_international_title(t): - if 'script title' in t[1].lower(): return False - if 'recut version' in t[1].lower(): return False - if 'working title' in t[1].lower(): return False - if 'complete title' in t[1].lower(): return False - if 'usa (imdb display title)' in t[1].lower(): return True - if t[1].lower() == 'usa': return True - if 'international (english title)' in t[1].lower(): return True - #fails if orignial is english... Japan (English title) - #if 'english title' in t[1].lower(): return True - return False - ititle = filter(is_international_title, self.get('alternativeTitles', [])) - if ititle: - self['englishTitle'] = ititle[0][0] - self['title'] = self.get('englishTitle', self['originalTitle']) + for t in self.get('alternativeTitles', []): + for type in t[1].lower().split('/'): + type = type.strip() + for regexp in ( + "^.+ \(imdb display title\) \(English title\)$", + "^International \(English title\)$", + "^.+ \(English title\)$", + "^International \(.+\) \(English title\)$", + "^.+ \(.+\) \(English title\)$", + "^USA$", + "^UK$", + "^USA \(.+\)$", + "^UK \(.+\)$", + "^International \(.+ title\)$", + ): + if re.compile(regexp).findall(type): + self['internationalTitle'] = t[0] + break + if 'internationalTitle' in self: + break + if 'internationalTitle' in self: + break def cleanup_title(title): if title.startswith('"') and title.endswith('"'): @@ -307,17 +312,27 @@ class Imdb(SiteParser): title = re.sub('\(\#[.\d]+\)', '', title) return title.strip() - for t in ('title', 'englishTitle', 'originalTitle'): + for t in ('title', 'internationalTitle'): if t in self: self[t] = cleanup_title(self[t]) if 'alternativeTitles' in self: if len(self['alternativeTitles']) == 2 and \ isinstance(self['alternativeTitles'][0], basestring): self['alternativeTitles'] = [self['alternativeTitles']] - 
self['alternativeTitles'] = [[cleanup_title(t[0]), - t[1].split(' / ')[0].split('(')[0].strip()] - for t in self['alternativeTitles']] - #self[t] = re.sub('\(\#[.\d]+\)', '', self[t]) + alt = {} + for t in self['alternativeTitles']: + title = cleanup_title(t[0]) + if title not in (self.get('title'), self.get('internationalTitle')): + if title not in alt: + alt[title] = [] + for c in t[1].split('/'): + c = c.replace('International', '').split('(')[0].strip() + if c: + alt[title].append(c) + self['alternativeTitles'] = [] + for t in sorted(alt, lambda a, b: cmp(sorted(alt[a]), sorted(alt[b]))): + if alt[t]: + self['alternativeTitles'].append((t, sorted(alt[t]))) if 'runtime' in self and self['runtime']: if 'min' in self['runtime']: base=60