parse more info from list

This commit is contained in:
j 2018-06-03 14:00:03 +02:00
parent 844d25008b
commit 501fe8cd3e

View file

@ -116,6 +116,19 @@ def update_month(year, month, film_counts):
print('%s: count %s, got ids %s' % (key, film_counts[key], r)) print('%s: count %s, got ids %s' % (key, film_counts[key], r))
save_film_counts(film_counts) save_film_counts(film_counts)
def parse_cast(string):
    """Parse a "Label:\\nName, \\nName | Label:\\nName" credits string.

    The input is split on '|' into sections; each section is split on
    ':\\n' into alternating label / value tokens.  Labels are lower-cased
    and the singular forms 'director' and 'star' are mapped to
    'directors' and 'stars'.  Each value is split on ', \\n' into a list
    of names.

    Returns a dict mapping the normalised label to its list of names.
    Sections without a ':\\n' separator, and labels with no names, are
    left out of the result.
    """
    # Singular labels are folded into the plural key so that both
    # spellings end up under the same name.
    aliases = {
        'director': 'directors',
        'star': 'stars',
    }
    results = {}
    for part in string.split('|'):
        tokens = iter([t.strip() for t in part.split(':\n')])
        # Zipping the iterator with itself consumes tokens pairwise as
        # (label, value); a trailing unpaired token is dropped.
        for label, value in zip(tokens, tokens):
            # Strip each name and drop empties so stray whitespace around
            # the separators does not leak into the result.
            names = [name.strip() for name in value.split(', \n')]
            names = [name for name in names if name]
            if not names:
                continue
            role = label.lower()
            results[aliases.get(role, role)] = names
    return results
def update_ids(year, month=None, day=None, sort=None): def update_ids(year, month=None, day=None, sort=None):
films = {} films = {}
@ -151,7 +164,8 @@ def update_ids(year, month=None, day=None, sort=None):
else: else:
print('no article on', '%s&page=%s' % (url, page-2)) print('no article on', '%s&page=%s' % (url, page-2))
break break
for header in article.find_class('lister-item-header'): for content in article.find_class('lister-item-content'):
header = content.find_class('lister-item-header')[0]
a = header.xpath('.//a') a = header.xpath('.//a')
if 'Episode:' in [ if 'Episode:' in [
e.text_content() e.text_content()
@ -176,11 +190,34 @@ def update_ids(year, month=None, day=None, sort=None):
print(n) print(n)
print(header.find_class('lister-item-year')[0].text_content()) print(header.find_class('lister-item-year')[0].text_content())
raise raise
text = content.xpath(".//p[contains(@class, 'text-muted')]")
plot = text[1].text_content().strip()
plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
if plot == 'Add a Plot':
plot = ''
genre = content.find_class('genre')
if genre:
genre = genre[0].text_content().strip().split(', ')
else:
genre = []
cast = content.xpath(".//p[contains(@class, '')]")
cast = [t for t in cast if t.attrib.get('class') == '']
if cast:
cast = parse_cast(cast[0].text_content())
if id not in films: if id not in films:
films[id] = { films[id] = {
'title': title, 'title': title,
'year': y 'year': y
} }
if plot:
films[id]['plot'] = plot
if genre:
films[id]['genre'] = genre
if cast:
films[id].update(cast)
#print(key, len(films), 'films') #print(key, len(films), 'films')
if n: if n:
#print(n) #print(n)