parse fullcredits

This commit is contained in:
j 2017-02-16 17:16:14 +01:00
parent 51af80545f
commit 67d30ef88e

View file

@@ -41,7 +41,6 @@ class Imdb(SiteParser):
"td>(.*?)</td>.*?<td>(.*?)</td>"
],
'type': 'list'
},
'aspectratio': {
'page': 'combined',
@@ -304,7 +303,16 @@ class Imdb(SiteParser):
'page': 'combined',
're': '="og:title" content="[^"]*?\((\d{4}).*?"',
'type': 'int'
}
},
'credits': {
'page': 'fullcredits',
're': [
lambda data: data.split('<h4'),
'>(.*?)</h4>.*?(<table.*?</table>)',
lambda data: [d for d in data if d]
],
'type': 'list'
},
}
def read_url(self, url, timeout):
@@ -620,6 +628,32 @@ class Imdb(SiteParser):
self['summary'] = self['summary'][0]
self['summary'] = self['summary'].split('</p')[0].strip()
if 'credits' in self:
    # Flatten the raw (heading html, table html) pairs stored under
    # self['credits'] into one dict per credited person.

    # One credits-table row: the name cell, an intermediate cell (captured
    # but unused), and the credit cell holding the ' / '-separated roles.
    # Compiled once here instead of once per section.
    row_re = re.compile(
        r'<td class="name">(.*?)</td>.*?<td>(.*?)</td>.*?<td class="credit">(.*?)</td>',
        re.DOTALL
    )

    # Build into a local list and assign at the end, so self['credits'] is
    # left untouched if parsing raises part-way through.
    parsed = []
    for section in self['credits']:
        if not section:
            continue
        # Normalise the heading: "Directed by" -> "Directed",
        # "Cast (in alphabetical order)" -> "Cast".
        department = strip_tags(section[0].replace(' by', '')).strip()
        department = department.replace('(in alphabetical order)', '').strip()
        # Sections whose table has no matching rows contribute nothing.
        for row in row_re.findall(section[1]):
            parsed.append({
                'name': strip_tags(row[0]).strip(),
                # Drop any trailing parenthesised qualifier from each role,
                # e.g. "producer (uncredited)" -> "producer".
                'roles': [
                    role.strip().split(' (')[0].strip()
                    for role in row[2].split(' / ')
                ],
                'department': department,
                # Historical misspelling, kept so existing consumers that
                # read this key keep working.
                'deparment': department
            })
    self['credits'] = parsed
class ImdbCombined(Imdb):
def __init__(self, id, timeout=-1):
_regex = {}