diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 89a326e..2717f84 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -41,7 +41,6 @@ class Imdb(SiteParser): "td>(.*?).*?(.*?)" ], 'type': 'list' - }, 'aspectratio': { 'page': 'combined', @@ -304,7 +303,16 @@ class Imdb(SiteParser): 'page': 'combined', 're': '="og:title" content="[^"]*?\((\d{4}).*?"', 'type': 'int' - } + }, + 'credits': { + 'page': 'fullcredits', + 're': [ + lambda data: data.split('(.*?).*?()', + lambda data: [d for d in data if d] + ], + 'type': 'list' + }, } def read_url(self, url, timeout): @@ -620,6 +628,32 @@ class Imdb(SiteParser): self['summary'] = self['summary'][0] self['summary'] = self['summary'].split('(.*?).*?(.*?).*?(.*?)', re.DOTALL).findall(d[1]) + ] + ] for d in self['credits'] if d + ] + credits = [c for c in credits if c[1]] + + self['credits'] = [] + for department, crew in credits: + department = department.replace('(in alphabetical order)', '').strip() + for c in crew: + self['credits'].append({ + 'name': c[0], + 'roles': c[1], + 'deparment': department + }) + class ImdbCombined(Imdb): def __init__(self, id, timeout=-1): _regex = {}