parse more info from list
This commit is contained in:
parent
844d25008b
commit
501fe8cd3e
1 changed file with 38 additions and 1 deletion
|
@ -116,6 +116,19 @@ def update_month(year, month, film_counts):
|
|||
print('%s: count %s, got ids %s' % (key, film_counts[key], r))
|
||||
save_film_counts(film_counts)
|
||||
|
||||
def parse_cast(string):
    """Parse a ``|``-separated credits string into ``{role: [names]}``.

    The input is split on ``'|'``; each section is split on ``':\\n'`` and
    the stripped pieces are paired up as (label, names).  A trailing piece
    without a partner is ignored.  Labels are lower-cased, with the singular
    forms ``'director'`` / ``'star'`` mapped to ``'directors'`` / ``'stars'``.
    The names text is split on ``', \\n'`` into a list.

    NOTE(review): the separators presumably mirror the whitespace layout of
    the scraped listing markup -- confirm against the caller's input.

    Returns a dict mapping each normalised label to its list of names; if a
    label occurs more than once, the later occurrence overwrites the earlier.
    """
    plural_aliases = {
        'director': 'directors',
        'star': 'stars',
    }
    results = {}
    for section in string.split('|'):
        pieces = [piece.strip() for piece in section.split(':\n')]
        # Even-indexed pieces are labels, odd-indexed pieces are their names;
        # zip() silently drops a dangling final label.
        credits = dict(zip(pieces[0::2], pieces[1::2]))
        for label, names in credits.items():
            role = label.lower()
            results[plural_aliases.get(role, role)] = names.split(', \n')
    return results
|
||||
|
||||
def update_ids(year, month=None, day=None, sort=None):
|
||||
films = {}
|
||||
|
@ -151,7 +164,8 @@ def update_ids(year, month=None, day=None, sort=None):
|
|||
else:
|
||||
print('no article on', '%s&page=%s' % (url, page-2))
|
||||
break
|
||||
for header in article.find_class('lister-item-header'):
|
||||
for content in article.find_class('lister-item-content'):
|
||||
header = content.find_class('lister-item-header')[0]
|
||||
a = header.xpath('.//a')
|
||||
if 'Episode:' in [
|
||||
e.text_content()
|
||||
|
@ -176,11 +190,34 @@ def update_ids(year, month=None, day=None, sort=None):
|
|||
print(n)
|
||||
print(header.find_class('lister-item-year')[0].text_content())
|
||||
raise
|
||||
|
||||
text = content.xpath(".//p[contains(@class, 'text-muted')]")
|
||||
plot = text[1].text_content().strip()
|
||||
plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
|
||||
if plot == 'Add a Plot':
|
||||
plot = ''
|
||||
genre = content.find_class('genre')
|
||||
if genre:
|
||||
genre = genre[0].text_content().strip().split(', ')
|
||||
else:
|
||||
genre = []
|
||||
cast = content.xpath(".//p[contains(@class, '')]")
|
||||
cast = [t for t in cast if t.attrib.get('class') == '']
|
||||
if cast:
|
||||
cast = parse_cast(cast[0].text_content())
|
||||
|
||||
if id not in films:
|
||||
films[id] = {
|
||||
'title': title,
|
||||
'year': y
|
||||
}
|
||||
if plot:
|
||||
films[id]['plot'] = plot
|
||||
if genre:
|
||||
films[id]['genre'] = genre
|
||||
if cast:
|
||||
films[id].update(cast)
|
||||
|
||||
#print(key, len(films), 'films')
|
||||
if n:
|
||||
#print(n)
|
||||
|
|
Loading…
Reference in a new issue