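More IMDb cleanup: add alternative_titles, loosen the plot and year regexes, drop 'home' entries from countries/genres, and tolerate empty dates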
This commit is contained in:
parent
50ada035cc
commit
6afafa7355
2 changed files with 18 additions and 4 deletions
|
@ -15,6 +15,15 @@ import google
|
|||
|
||||
class Imdb(SiteParser):
|
||||
regex = {
|
||||
'alternative_titles': {
|
||||
'page': 'releaseinfo',
|
||||
're': [
|
||||
'name="akas".*?<table.*?>(.*?)</table>',
|
||||
"td>(.*?)</td>\n\n<td>(.*?)</td>"
|
||||
],
|
||||
'type': 'list'
|
||||
|
||||
},
|
||||
'cast': {
|
||||
'page': 'combined',
|
||||
're': '<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
|
||||
|
@ -76,7 +85,7 @@ class Imdb(SiteParser):
|
|||
},
|
||||
'plot': {
|
||||
'page': 'plotsummary',
|
||||
're': '<p class="plotpar">(.*?)<i>',
|
||||
're': '</div>.*?<p class="plotpar">(.*?)<i>',
|
||||
'type': 'string'
|
||||
},
|
||||
'poster_id': {
|
||||
|
@ -145,7 +154,7 @@ class Imdb(SiteParser):
|
|||
},
|
||||
'year': {
|
||||
'page': 'combined',
|
||||
're': '<meta name="og:title" content=".*?\((\d{4})\)"',
|
||||
're': '<meta name="og:title" content=".*?\((\d{4})\).*?"',
|
||||
'type': 'int'
|
||||
}
|
||||
}
|
||||
|
@ -160,6 +169,7 @@ class Imdb(SiteParser):
|
|||
self['runtime'] = int(findRe(self['runtime'], '([0-9]+)')) * base
|
||||
else:
|
||||
self['runtime'] = 0
|
||||
if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
|
||||
if 'connections' in self:
|
||||
cc={}
|
||||
if len(self['connections']) == 2 and isinstance(self['connections'][0], basestring):
|
||||
|
@ -168,6 +178,10 @@ class Imdb(SiteParser):
|
|||
cc[unicode(rel)] = re.compile('<a href="/title/tt(\d{7})/">').findall(data)
|
||||
self['connections'] = cc
|
||||
|
||||
for key in ('countries', 'genres'):
|
||||
self[key] = filter(lambda x: x.lower() != 'home', self[key])
|
||||
|
||||
|
||||
def guess(title, director='', timeout=google.DEFAULT_TIMEOUT):
|
||||
#FIXME: proper file -> title
|
||||
title = title.split('-')[0]
|
||||
|
|
|
@ -49,13 +49,13 @@ class SiteParser(dict):
|
|||
data = [f(d) for d in data]
|
||||
else:
|
||||
data = f(data)
|
||||
return data
|
||||
return data
|
||||
if self.regex[key]['type'] == 'float':
|
||||
data = apply_f(float, data)
|
||||
elif self.regex[key]['type'] == 'int':
|
||||
data = apply_f(int, data)
|
||||
elif self.regex[key]['type'] == 'date':
|
||||
parse_date = lambda d: datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d')
|
||||
parse_date = lambda d: d and datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d')
|
||||
data = apply_f(parse_date, data)
|
||||
self[key] = data
|
||||
|
||||
|
|
Loading…
Reference in a new issue