def parse_cast(string):
    """Parse the credits text of a list entry into a dict of name lists.

    *string* is expected to consist of sections separated by ``|``. Each
    section is a label terminated by a colon and a newline, followed by
    the names for that label separated by a comma, a space and a newline,
    e.g. ``'Director:\\nA | Stars:\\nB, \\nC'``.

    Returns a dict mapping the lower-cased label to a list of names. The
    singular labels ``director`` and ``star`` are stored under the plural
    keys ``directors`` and ``stars`` so both spellings end up under the
    same key. Names are stripped of surrounding whitespace; labels that
    have no non-empty name are left out of the result. An empty *string*
    yields an empty dict.
    """
    # Built once per call instead of once per label.
    aliases = {
        'director': 'directors',
        'star': 'stars',
    }
    results = {}
    for part in string.split('|'):
        tokens = iter([t.strip() for t in part.split(':\n')])
        # Zipping the iterator with itself pairs consecutive tokens as
        # (label, names); an unpaired trailing token is silently dropped.
        for label, value in zip(tokens, tokens):
            names = [name.strip() for name in value.split(', \n')]
            names = [name for name in names if name]
            if not names:
                # Avoid storing [''] for a label that has no names.
                continue
            key = label.lower()
            results[aliases.get(key, key)] = names
    return results
cast: + cast = parse_cast(cast[0].text_content()) + if id not in films: films[id] = { 'title': title, 'year': y } + if plot: + films[id]['plot'] = plot + if genre: + films[id]['genre'] = genre + if cast: + films[id].update(cast) + #print(key, len(films), 'films') if n: #print(n)