only use most common title per type, fixes #1826

2013-08-24 17:30:37 +02:00 · 2013-08-24 17:30:37 +02:00 · a8e76893d3
commit a8e76893d3
parent f429ed8b07
1 changed files with 22 additions and 5 deletions
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -1,15 +1,12 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
 import urllib2
 import urllib
 import re
 import os
 import time
 import unicodedata
 import ox
 from ox import find_re, strip_tags
 from ox.normalize import normalize_title, normalize_imdbid
 import ox.cache
 from siteparser import SiteParser
@@ -376,8 +373,28 @@ class Imdb(SiteParser):
                    if key in type:
                        stop_word = True
                        break
-                if not stop_word and not type in types:
+                if not stop_word:
-                    types[type] = t[1]
+                    if not type in types:
                        types[type] = []
                    types[type].append(t[1])
        titles = {}
        for type in types:
            for title in types[type]:
                if not title in titles:
                    titles[title] = []
                titles[title].append(type)
        def select_title(type):
            # Choose one title for the given type: among the titles collected
            # for it in `types`, prefer the one that `titles` maps to the
            # longest list of types. Ties go to the earliest candidate.
            candidates = types[type]
            if len(candidates) <= 1:
                # Single candidate: nothing to compare, `titles` is not consulted.
                return candidates[0]
            # max() returns the first maximal element, matching a strict ">" scan.
            return max(candidates, key=lambda name: len(titles[name]))
        types = {type: select_title(type) for type in types}
        regexps = [
            "^.+ \(imdb display title\) \(English title\)$",
            "^USA \(imdb display title\)$",