More IMDb cleanup; add alternative_titles

This commit is contained in:
j 2010-07-09 10:54:06 +02:00
parent 50ada035cc
commit 6afafa7355
2 changed files with 18 additions and 4 deletions

View file

@ -15,6 +15,15 @@ import google
class Imdb(SiteParser): class Imdb(SiteParser):
regex = { regex = {
'alternative_titles': {
'page': 'releaseinfo',
're': [
'name="akas".*?<table.*?>(.*?)</table>',
"td>(.*?)</td>\n\n<td>(.*?)</td>"
],
'type': 'list'
},
'cast': { 'cast': {
'page': 'combined', 'page': 'combined',
're': '<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>', 're': '<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
@ -76,7 +85,7 @@ class Imdb(SiteParser):
}, },
'plot': { 'plot': {
'page': 'plotsummary', 'page': 'plotsummary',
're': '<p class="plotpar">(.*?)<i>', 're': '</div>.*?<p class="plotpar">(.*?)<i>',
'type': 'string' 'type': 'string'
}, },
'poster_id': { 'poster_id': {
@ -145,7 +154,7 @@ class Imdb(SiteParser):
}, },
'year': { 'year': {
'page': 'combined', 'page': 'combined',
're': '<meta name="og:title" content=".*?\((\d{4})\)"', 're': '<meta name="og:title" content=".*?\((\d{4})\).*?"',
'type': 'int' 'type': 'int'
} }
} }
@ -160,6 +169,7 @@ class Imdb(SiteParser):
self['runtime'] = int(findRe(self['runtime'], '([0-9]+)')) * base self['runtime'] = int(findRe(self['runtime'], '([0-9]+)')) * base
else: else:
self['runtime'] = 0 self['runtime'] = 0
if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
if 'connections' in self: if 'connections' in self:
cc={} cc={}
if len(self['connections']) == 2 and isinstance(self['connections'][0], basestring): if len(self['connections']) == 2 and isinstance(self['connections'][0], basestring):
@ -168,6 +178,10 @@ class Imdb(SiteParser):
cc[unicode(rel)] = re.compile('<a href="/title/tt(\d{7})/">').findall(data) cc[unicode(rel)] = re.compile('<a href="/title/tt(\d{7})/">').findall(data)
self['connections'] = cc self['connections'] = cc
for key in ('countries', 'genres'):
self[key] = filter(lambda x: x.lower() != 'home', self[key])
def guess(title, director='', timeout=google.DEFAULT_TIMEOUT): def guess(title, director='', timeout=google.DEFAULT_TIMEOUT):
#FIXME: proper file -> title #FIXME: proper file -> title
title = title.split('-')[0] title = title.split('-')[0]

View file

@ -49,13 +49,13 @@ class SiteParser(dict):
data = [f(d) for d in data] data = [f(d) for d in data]
else: else:
data = f(data) data = f(data)
return data return data
if self.regex[key]['type'] == 'float': if self.regex[key]['type'] == 'float':
data = apply_f(float, data) data = apply_f(float, data)
elif self.regex[key]['type'] == 'int': elif self.regex[key]['type'] == 'int':
data = apply_f(int, data) data = apply_f(int, data)
elif self.regex[key]['type'] == 'date': elif self.regex[key]['type'] == 'date':
parse_date = lambda d: datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d') parse_date = lambda d: d and datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d')
data = apply_f(parse_date, data) data = apply_f(parse_date, data)
self[key] = data self[key] = data