parse more info from list
This commit is contained in:
parent
844d25008b
commit
501fe8cd3e
1 changed files with 38 additions and 1 deletions
|
@ -116,6 +116,19 @@ def update_month(year, month, film_counts):
|
||||||
print('%s: count %s, got ids %s' % (key, film_counts[key], r))
|
print('%s: count %s, got ids %s' % (key, film_counts[key], r))
|
||||||
save_film_counts(film_counts)
|
save_film_counts(film_counts)
|
||||||
|
|
||||||
|
def parse_cast(string):
    """Parse a "Label:\\nName, \\nName | Label:\\nName" credits string.

    The input is split on ``'|'`` into parts; each part is split on
    ``':\\n'`` into alternating label / value tokens. Labels are
    lower-cased and the singular forms ``director`` and ``star`` are mapped
    to ``directors`` and ``stars``; any other label is kept as-is
    (lower-cased).

    Args:
        string: Text content of the credits paragraph. ``None`` or an empty
            string yields an empty result.

    Returns:
        dict mapping the normalized label to a list of stripped, non-empty
        names. Labels that have no names are omitted.
    """
    # Function-scope import so the block does not depend on the file's
    # top-level import list.
    import re

    # Built once instead of on every loop iteration.
    plural = {
        'director': 'directors',
        'star': 'stars',
    }
    results = {}
    if not string:
        return results

    for part in string.split('|'):
        tokens = iter([t.strip() for t in part.split(':\n')])
        # Zipping the iterator with itself pairs consecutive tokens as
        # (label, value); a trailing unpaired token is dropped.
        for label, value in zip(tokens, tokens):
            rkey = label.lower()
            rkey = plural.get(rkey, rkey)
            # Names are separated by a comma followed by a line break.
            # Tolerate any whitespace between the two, but do not split on
            # commas inside a single line (they may belong to a name).
            names = [n.strip() for n in re.split(r',\s*\n', value)]
            names = [n for n in names if n]
            if names:
                results[rkey] = names
    return results
|
||||||
|
|
||||||
def update_ids(year, month=None, day=None, sort=None):
|
def update_ids(year, month=None, day=None, sort=None):
|
||||||
films = {}
|
films = {}
|
||||||
|
@ -151,7 +164,8 @@ def update_ids(year, month=None, day=None, sort=None):
|
||||||
else:
|
else:
|
||||||
print('no article on', '%s&page=%s' % (url, page-2))
|
print('no article on', '%s&page=%s' % (url, page-2))
|
||||||
break
|
break
|
||||||
for header in article.find_class('lister-item-header'):
|
for content in article.find_class('lister-item-content'):
|
||||||
|
header = content.find_class('lister-item-header')[0]
|
||||||
a = header.xpath('.//a')
|
a = header.xpath('.//a')
|
||||||
if 'Episode:' in [
|
if 'Episode:' in [
|
||||||
e.text_content()
|
e.text_content()
|
||||||
|
@ -176,11 +190,34 @@ def update_ids(year, month=None, day=None, sort=None):
|
||||||
print(n)
|
print(n)
|
||||||
print(header.find_class('lister-item-year')[0].text_content())
|
print(header.find_class('lister-item-year')[0].text_content())
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
text = content.xpath(".//p[contains(@class, 'text-muted')]")
|
||||||
|
plot = text[1].text_content().strip()
|
||||||
|
plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
|
||||||
|
if plot == 'Add a Plot':
|
||||||
|
plot = ''
|
||||||
|
genre = content.find_class('genre')
|
||||||
|
if genre:
|
||||||
|
genre = genre[0].text_content().strip().split(', ')
|
||||||
|
else:
|
||||||
|
genre = []
|
||||||
|
cast = content.xpath(".//p[contains(@class, '')]")
|
||||||
|
cast = [t for t in cast if t.attrib.get('class') == '']
|
||||||
|
if cast:
|
||||||
|
cast = parse_cast(cast[0].text_content())
|
||||||
|
|
||||||
if id not in films:
|
if id not in films:
|
||||||
films[id] = {
|
films[id] = {
|
||||||
'title': title,
|
'title': title,
|
||||||
'year': y
|
'year': y
|
||||||
}
|
}
|
||||||
|
if plot:
|
||||||
|
films[id]['plot'] = plot
|
||||||
|
if genre:
|
||||||
|
films[id]['genre'] = genre
|
||||||
|
if cast:
|
||||||
|
films[id].update(cast)
|
||||||
|
|
||||||
#print(key, len(films), 'films')
|
#print(key, len(films), 'films')
|
||||||
if n:
|
if n:
|
||||||
#print(n)
|
#print(n)
|
||||||
|
|
Loading…
Reference in a new issue