parse fullcredits

2017-02-16 17:16:14 +01:00 · 2017-02-16 17:16:14 +01:00 · 67d30ef88e
commit 67d30ef88e
parent 51af80545f
1 changed files with 36 additions and 2 deletions
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@ -41,7 +41,6 @@ class Imdb(SiteParser):
                "td>(.*?)</td>.*?<td>(.*?)</td>"
            ],
            'type': 'list'
        },
        'aspectratio': {
            'page': 'combined',
@ -304,7 +303,16 @@ class Imdb(SiteParser):
            'page': 'combined',
            're': '="og:title" content="[^"]*?\((\d{4}).*?"',
            'type': 'int'
-        }
+        },
        'credits': {
            'page': 'fullcredits',
            're': [
                lambda data: data.split('<h4'),
                '>(.*?)</h4>.*?(<table.*?</table>)',
                lambda data: [d for d in data if d]
            ],
            'type': 'list'
        },
    }
    def read_url(self, url, timeout):
@ -620,6 +628,32 @@ class Imdb(SiteParser):
                self['summary'] = self['summary'][0]
            self['summary'] = self['summary'].split('</p')[0].strip()
        # Post-process the raw 'credits' matches into a flat list of
        # {'name', 'roles', 'deparment'} dicts, one per credited person.
        if 'credits' in self:
            # Build [heading, [[name, [role, ...]], ...]] for every non-empty
            # raw entry d, where d[0] is used as the section heading and d[1]
            # as the HTML that is searched for credit rows.
            credits = [
                [
                    # Heading: every ' by' substring is removed before the
                    # tags are stripped (e.g. 'Directed by' -> 'Directed').
                    strip_tags(d[0].replace(' by', '')).strip(),
                    [
                        [
                            # x[0] is the "name" cell with markup removed.
                            strip_tags(x[0]).strip(),
                            # x[2] is the "credit" cell: split on ' / ' into
                            # roles and cut each role at its first ' (' so a
                            # trailing parenthesised note is dropped.
                            [t.strip().split(' (')[0].strip() for t in x[2].split(' / ')]
                        ]
                        for x in
                        # Three groups are captured per row; the middle cell
                        # (x[1]) is matched but never used. DOTALL lets '.'
                        # span newlines inside the row markup.
                        re.compile('<td class="name">(.*?)</td>.*?<td>(.*?)</td>.*?<td class="credit">(.*?)</td>', re.DOTALL).findall(d[1])
                    ]
                ] for d in self['credits'] if d
            ]
            # Discard sections in which no credit row matched.
            credits = [c for c in credits if c[1]]
            # Replace the raw matches with the flattened result.
            self['credits'] = []
            for department, crew in credits:
                department = department.replace('(in alphabetical order)', '').strip()
                for c in crew:
                    # NOTE(review): the key is spelled 'deparment' (sic); it
                    # looks like a typo for 'department' — confirm with the
                    # consumers of this dict before renaming it.
                    self['credits'].append({
                        'name': c[0],
                        'roles': c[1],
                        'deparment': department
                    })
 class ImdbCombined(Imdb):
    def __init__(self, id, timeout=-1):
        _regex = {}