parse fullcredits

2017-02-16 17:16:14 +01:00 · 2017-02-16 17:16:14 +01:00 · 67d30ef88e
commit 67d30ef88e
parent 51af80545f
1 changed file with 36 additions and 2 deletions
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@ -41,7 +41,6 @@ class Imdb(SiteParser):
                "td>(.*?)</td>.*?<td>(.*?)</td>"
            ],
            'type': 'list'
-        
        },
        'aspectratio': {
            'page': 'combined',
@ -304,7 +303,16 @@ class Imdb(SiteParser):
            'page': 'combined',
            're': '="og:title" content="[^"]*?\((\d{4}).*?"',
            'type': 'int'
-        }
+        },
+        'credits': {  # raw per-heading credit tables, read from the 'fullcredits' page
+            'page': 'fullcredits',
+            're': [  # NOTE(review): presumably applied in order by SiteParser — confirm
+                lambda data: data.split('<h4'),  # cut the page at every '<h4'
+                '>(.*?)</h4>.*?(<table.*?</table>)',  # groups: heading text, next <table>...</table>
+                lambda data: [d for d in data if d]  # drop falsy entries
+            ],
+            'type': 'list'
+        },
    }

    def read_url(self, url, timeout):
@ -620,6 +628,32 @@ class Imdb(SiteParser):
                self['summary'] = self['summary'][0]
            self['summary'] = self['summary'].split('</p')[0].strip()

+        if 'credits' in self:  # reshape raw (heading, table) pairs into a flat list of credit dicts
+            credits = [
+                [
+                    strip_tags(d[0].replace(' by', '')).strip(),  # department: heading minus ' by', via strip_tags
+                    [
+                        [
+                            strip_tags(x[0]).strip(),  # name cell, via strip_tags
+                            [t.strip().split(' (')[0].strip() for t in x[2].split(' / ')]  # roles: credit cell split on ' / ', each cut at the first ' ('
+                        ]
+                        for x in
+                        re.compile('<td class="name">(.*?)</td>.*?<td>(.*?)</td>.*?<td class="credit">(.*?)</td>', re.DOTALL).findall(d[1])  # x = (name, middle cell, credit); x[1] is unused
+                    ]
+                ] for d in self['credits'] if d
+            ]
+            credits = [c for c in credits if c[1]]  # keep only departments with at least one matched person
+
+            self['credits'] = []  # replace the raw matches with the flattened result
+            for department, crew in credits:
+                department = department.replace('(in alphabetical order)', '').strip()  # remove the '(in alphabetical order)' marker
+                for c in crew:
+                    self['credits'].append({
+                        'name': c[0],
+                        'roles': c[1],
+                        'deparment': department  # NOTE(review): key is spelled 'deparment' (sic), probably meant 'department' — confirm consumers before renaming
+                    })
+
 class ImdbCombined(Imdb):
    def __init__(self, id, timeout=-1):
        _regex = {}