more imdb cleanup, add alternative_titles

2010-07-09 10:54:06 +02:00 · 2010-07-09 10:54:06 +02:00 · 6afafa7355
commit 6afafa7355
parent 50ada035cc
2 changed files with 18 additions and 4 deletions
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -15,6 +15,15 @@ import google
 class Imdb(SiteParser):
    regex =  {
        'alternative_titles': {
            'page': 'releaseinfo',
            're': [
                'name="akas".*?<table.*?>(.*?)</table>',
                "td>(.*?)</td>\n\n<td>(.*?)</td>"
            ],
            'type': 'list'
        },
        'cast': {
            'page': 'combined',
            're': '<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
@@ -76,7 +85,7 @@ class Imdb(SiteParser):
        },
        'plot': {
            'page': 'plotsummary',
-            're': '<p class="plotpar">(.*?)<i>',
+            're': '</div>.*?<p class="plotpar">(.*?)<i>',
            'type': 'string'
        },
        'poster_id': {
@@ -145,7 +154,7 @@ class Imdb(SiteParser):
        },
        'year': {
            'page': 'combined',
-            're': '<meta name="og:title" content=".*?\((\d{4})\)"',
+            're': '<meta name="og:title" content=".*?\((\d{4})\).*?"',
            'type': 'int'
        }
    }
@@ -160,6 +169,7 @@ class Imdb(SiteParser):
            self['runtime'] = int(findRe(self['runtime'], '([0-9]+)')) * base
        else:
            self['runtime'] = 0
        if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
        if 'connections' in self:
            cc={}
            if len(self['connections']) == 2 and isinstance(self['connections'][0], basestring):
@@ -168,6 +178,10 @@ class Imdb(SiteParser):
                cc[unicode(rel)] = re.compile('<a href="/title/tt(\d{7})/">').findall(data)
            self['connections'] = cc
        for key in ('countries', 'genres'):
            self[key] = filter(lambda x: x.lower() != 'home', self[key])
 def guess(title, director='', timeout=google.DEFAULT_TIMEOUT):
    #FIXME: proper file -> title
    title = title.split('-')[0]
--- a/ox/web/siteparser.py
+++ b/ox/web/siteparser.py
@@ -49,13 +49,13 @@ class SiteParser(dict):
                    data = [f(d) for d in data]
                else:
                    data = f(data)
-                return data            
+                return data
            if self.regex[key]['type'] == 'float':
                data = apply_f(float, data)
            elif self.regex[key]['type'] == 'int':
                data = apply_f(int, data)
            elif self.regex[key]['type'] == 'date':
-                parse_date = lambda d: datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d')
+                parse_date = lambda d: d and datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d')
                data = apply_f(parse_date, data)
            self[key] = data