more imdb cleanup, add alternative_titles
This commit is contained in:
parent
50ada035cc
commit
6afafa7355
2 changed files with 18 additions and 4 deletions
|
@ -15,6 +15,15 @@ import google
|
||||||
|
|
||||||
class Imdb(SiteParser):
|
class Imdb(SiteParser):
|
||||||
regex = {
|
regex = {
|
||||||
|
'alternative_titles': {
|
||||||
|
'page': 'releaseinfo',
|
||||||
|
're': [
|
||||||
|
'name="akas".*?<table.*?>(.*?)</table>',
|
||||||
|
"td>(.*?)</td>\n\n<td>(.*?)</td>"
|
||||||
|
],
|
||||||
|
'type': 'list'
|
||||||
|
|
||||||
|
},
|
||||||
'cast': {
|
'cast': {
|
||||||
'page': 'combined',
|
'page': 'combined',
|
||||||
're': '<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
|
're': '<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
|
||||||
|
@ -76,7 +85,7 @@ class Imdb(SiteParser):
|
||||||
},
|
},
|
||||||
'plot': {
|
'plot': {
|
||||||
'page': 'plotsummary',
|
'page': 'plotsummary',
|
||||||
're': '<p class="plotpar">(.*?)<i>',
|
're': '</div>.*?<p class="plotpar">(.*?)<i>',
|
||||||
'type': 'string'
|
'type': 'string'
|
||||||
},
|
},
|
||||||
'poster_id': {
|
'poster_id': {
|
||||||
|
@ -145,7 +154,7 @@ class Imdb(SiteParser):
|
||||||
},
|
},
|
||||||
'year': {
|
'year': {
|
||||||
'page': 'combined',
|
'page': 'combined',
|
||||||
're': '<meta name="og:title" content=".*?\((\d{4})\)"',
|
're': '<meta name="og:title" content=".*?\((\d{4})\).*?"',
|
||||||
'type': 'int'
|
'type': 'int'
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -160,6 +169,7 @@ class Imdb(SiteParser):
|
||||||
self['runtime'] = int(findRe(self['runtime'], '([0-9]+)')) * base
|
self['runtime'] = int(findRe(self['runtime'], '([0-9]+)')) * base
|
||||||
else:
|
else:
|
||||||
self['runtime'] = 0
|
self['runtime'] = 0
|
||||||
|
if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
|
||||||
if 'connections' in self:
|
if 'connections' in self:
|
||||||
cc={}
|
cc={}
|
||||||
if len(self['connections']) == 2 and isinstance(self['connections'][0], basestring):
|
if len(self['connections']) == 2 and isinstance(self['connections'][0], basestring):
|
||||||
|
@ -168,6 +178,10 @@ class Imdb(SiteParser):
|
||||||
cc[unicode(rel)] = re.compile('<a href="/title/tt(\d{7})/">').findall(data)
|
cc[unicode(rel)] = re.compile('<a href="/title/tt(\d{7})/">').findall(data)
|
||||||
self['connections'] = cc
|
self['connections'] = cc
|
||||||
|
|
||||||
|
for key in ('countries', 'genres'):
|
||||||
|
self[key] = filter(lambda x: x.lower() != 'home', self[key])
|
||||||
|
|
||||||
|
|
||||||
def guess(title, director='', timeout=google.DEFAULT_TIMEOUT):
|
def guess(title, director='', timeout=google.DEFAULT_TIMEOUT):
|
||||||
#FIXME: proper file -> title
|
#FIXME: proper file -> title
|
||||||
title = title.split('-')[0]
|
title = title.split('-')[0]
|
||||||
|
|
|
@ -55,7 +55,7 @@ class SiteParser(dict):
|
||||||
elif self.regex[key]['type'] == 'int':
|
elif self.regex[key]['type'] == 'int':
|
||||||
data = apply_f(int, data)
|
data = apply_f(int, data)
|
||||||
elif self.regex[key]['type'] == 'date':
|
elif self.regex[key]['type'] == 'date':
|
||||||
parse_date = lambda d: datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d')
|
parse_date = lambda d: d and datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d')
|
||||||
data = apply_f(parse_date, data)
|
data = apply_f(parse_date, data)
|
||||||
self[key] = data
|
self[key] = data
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue