parse more info from list

2018-06-03 14:00:03 +02:00 · 2018-06-03 14:00:03 +02:00 · 501fe8cd3e
commit 501fe8cd3e
parent 844d25008b
1 changed file with 38 additions and 1 deletion
--- a/oxdata/movie/imdbids.py
+++ b/oxdata/movie/imdbids.py
@@ -116,6 +116,19 @@ def update_month(year, month, film_counts):
            print('%s: count %s, got ids %s' % (key, film_counts[key], r))
        save_film_counts(film_counts)

+def parse_cast(string):  # Parse '|'-separated "Label:\nvalue" text into a dict mapping lowercased label -> list of values.
+    results = {}
+    for part in string.split('|'):  # each '|'-delimited segment is parsed independently
+        cast = iter([t.strip() for t in part.split(':\n')])  # stripped tokens; presumably alternating label / value
+        cast = dict(zip(cast, cast))  # zipping one iterator with itself pairs consecutive tokens; an unpaired trailing token is dropped
+        for key in cast:
+            rkey = key.lower()
+            rkey = {  # singular labels are mapped onto the plural result keys
+                'director': 'directors',
+                'star': 'stars',
+            }.get(rkey, rkey)  # any other label keeps its lowercased form
+            results[rkey] = cast[key].split(', \n')  # value is split on ', \n' into a list; a repeated label overwrites the earlier entry
+    return results

 def update_ids(year, month=None, day=None, sort=None):
    films = {}
@@ -151,7 +164,8 @@ def update_ids(year, month=None, day=None, sort=None):
            else:
                print('no article on', '%s&page=%s' % (url, page-2))
                break
-            for header in article.find_class('lister-item-header'):
+            for content in article.find_class('lister-item-content'):
+                header = content.find_class('lister-item-header')[0]
                a = header.xpath('.//a')
                if 'Episode:' in [
                    e.text_content()
@@ -176,11 +190,34 @@ def update_ids(year, month=None, day=None, sort=None):
                    print(n)
                    print(header.find_class('lister-item-year')[0].text_content())
                    raise
+
+                text = content.xpath(".//p[contains(@class, 'text-muted')]")
+                plot = text[1].text_content().strip()
+                plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
+                if plot == 'Add a Plot':
+                    plot = ''
+                genre = content.find_class('genre')
+                if genre:
+                    genre = genre[0].text_content().strip().split(', ')
+                else:
+                    genre = []
+                cast = content.xpath(".//p[contains(@class, '')]")
+                cast = [t for t in cast if t.attrib.get('class') == '']
+                if cast:
+                    cast = parse_cast(cast[0].text_content())
+
                if id not in films:
                    films[id] = {
                        'title': title,
                        'year': y
                    }
+                    if plot:
+                        films[id]['plot'] = plot
+                    if genre:
+                        films[id]['genre'] = genre
+                    if cast:
+                        films[id].update(cast)
+
            #print(key, len(films), 'films')
            if n:
                #print(n)