This commit is contained in:
j 2019-08-05 13:18:39 +02:00
parent ee9e430ef8
commit 785550c753

View file

@ -110,12 +110,8 @@ def update_month(year, month, film_counts):
if film_count != film_counts.get(key): if film_count != film_counts.get(key):
print_info(key, film_count, film_counts) print_info(key, film_count, film_counts)
film_counts[key] = film_count film_counts[key] = film_count
if film_count > MAX_PER_RANGE and film_count < 2*MAX_PER_RANGE: r = update_ids(year, month, day, expected=film_count)
r = update_ids(year, month, day, sort='alpha', expected=film_count) save_film_counts(film_counts)
save_film_counts(film_counts)
else:
r = update_ids(year, month, day, expected=film_count)
save_film_counts(film_counts)
if days_total != month_total: if days_total != month_total:
print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total)) print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total))
else: else:
@ -136,6 +132,69 @@ def parse_cast(string):
results[rkey] = cast[key].split(', \n') results[rkey] = cast[key].split(', \n')
return results return results
def get_films(data, year=None):
    """Parse an IMDb list/search-result page into (imdb_id, film) tuples.

    Args:
        data: HTML source of a page using IMDb's 'lister-item' markup.
        year: optional fallback year, used when an item's year field is
            empty (e.g. unreleased titles). Defaults to None.

    Returns:
        list of (imdb_id, film) tuples, where imdb_id is the numeric part
        of the title URL as a string and film is a dict with 'title',
        'year' and, when present, 'plot', 'genre' and the cast fields
        produced by parse_cast().
    """
    films = []
    doc = lxml.html.fromstring(data)
    article = doc.find_class('article')
    # No 'article' container means no result list on this page.
    if not article:
        return films
    article = article[0]
    title_pattern = re.compile(r'title/tt(\d+)')
    for content in article.find_class('lister-item-content'):
        header = content.find_class('lister-item-header')[0]
        a = header.xpath('.//a')
        # TV episodes render as "<series>: <episode>" with two links;
        # keep the series name as a prefix and use the episode link.
        if 'Episode:' in [
            e.text_content()
            for e in header.xpath(".//small")
        ] and len(a) > 1:
            title = a[0].text_content().strip() + ': '
            a = a[1]
        else:
            title = ''
            a = a[0]
        imdb_id = title_pattern.findall(a.attrib['href'])[0]
        title += a.text_content().strip()
        try:
            y = header.find_class('lister-item-year')[0].text_content()
            # Drop parenthesized qualifiers like "(TV Movie)".
            y = re.sub(r'\([^\d]+\)', '', y)
            # Reduce "(2011)", "(2011–2013)", "(I) (2011)" etc. to the
            # first year. The '–' is the en-dash IMDb uses in ranges.
            y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
            if not y:
                # Empty year field: fall back to the caller-supplied year.
                y = year
            else:
                y = int(y)
        except Exception:
            # Surface the raw year text before re-raising so bad markup
            # can be diagnosed from the logs.
            print(header.find_class('lister-item-year')[0].text_content())
            raise
        text = content.xpath(".//p[contains(@class, 'text-muted')]")
        plot = text[1].text_content().strip()
        plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
        if plot == 'Add a Plot':
            plot = ''
        genre = content.find_class('genre')
        if genre:
            genre = genre[0].text_content().strip().split(', ')
        else:
            genre = []
        # Cast/director info lives in <p> elements whose class attribute
        # is literally empty, so filter on the exact empty string.
        cast = content.xpath(".//p[contains(@class, '')]")
        cast = [t for t in cast if t.attrib.get('class') == '']
        if cast:
            cast = parse_cast(cast[0].text_content())
        film = {
            'title': title,
            'year': y
        }
        if plot:
            film['plot'] = plot
        if genre:
            film['genre'] = genre
        if cast:
            film.update(cast)
        films.append((imdb_id, film))
    return films
def update_ids(year, month=None, day=None, sort=None, expected=None): def update_ids(year, month=None, day=None, sort=None, expected=None):
films = {} films = {}
if day is not None: if day is not None:
@ -147,13 +206,8 @@ def update_ids(year, month=None, day=None, sort=None, expected=None):
else: else:
url = get_year(year) url = get_year(year)
key = '%04d' % year key = '%04d' % year
if sort == 'alpha':
urls = [ urls = [url]
url.replace('sort=release_date,asc', 'sort=alpha,asc'),
url.replace('sort=release_date,asc', 'sort=alpha,desc'),
]
else:
urls = [url]
if not expected: if not expected:
expected = get_film_count(year, month, day) expected = get_film_count(year, month, day)
@ -171,73 +225,13 @@ def update_ids(year, month=None, day=None, sort=None, expected=None):
has_after = re.compile(after_link).findall(data) has_after = re.compile(after_link).findall(data)
if has_next: if has_next:
n = '%s&start=%s' % (url, start) n = '%s&start=%s' % (url, start)
elif sort != 'alpha' and has_after: elif has_after:
n = '%s%s' % (base_url, has_after[0]) n = '%s%s' % (base_url, has_after[0])
else: else:
n = False n = False
doc = lxml.html.fromstring(data) for id, film in get_films(data):
article = doc.find_class('article')
if article:
article = article[0]
else:
print('no article on', '%s&start=%s' % (url, start - 2*step))
ox.web.imdb.delete_url('%s&start=%s' % (url, start - 2*step))
break
for content in article.find_class('lister-item-content'):
header = content.find_class('lister-item-header')[0]
a = header.xpath('.//a')
if 'Episode:' in [
e.text_content()
for e in header.xpath(".//small")
] and len(a) > 1:
title = a[0].text_content().strip() + ': '
a = a[1]
else:
title = ''
a = a[0]
id = re.compile('title/tt(\d+)').findall(a.attrib['href'])[0]
title += a.text_content().strip()
try:
y = header.find_class('lister-item-year')[0].text_content()
y = re.sub('\([^\d]+\)', '', y)
y = y.rsplit('(', 1)[-1].split(')')[0].split('')[0].split(' ')[0].strip()
if not y:
y = year
else:
y = int(y)
except:
print(n)
print(header.find_class('lister-item-year')[0].text_content())
raise
text = content.xpath(".//p[contains(@class, 'text-muted')]")
plot = text[1].text_content().strip()
plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
if plot == 'Add a Plot':
plot = ''
genre = content.find_class('genre')
if genre:
genre = genre[0].text_content().strip().split(', ')
else:
genre = []
cast = content.xpath(".//p[contains(@class, '')]")
cast = [t for t in cast if t.attrib.get('class') == '']
if cast:
cast = parse_cast(cast[0].text_content())
if id not in films: if id not in films:
films[id] = { films[id] = film
'title': title,
'year': y
}
if plot:
films[id]['plot'] = plot
if genre:
films[id]['genre'] = genre
if cast:
films[id].update(cast)
if expected and len(films) == expected and sort == 'alpha':
n = False
debug('%s: %s of %s films - next: %s' % (key, len(films), expected, n)) debug('%s: %s of %s films - next: %s' % (key, len(films), expected, n))
if n: if n:
data = read_url(n, timeout=TIMEOUT) data = read_url(n, timeout=TIMEOUT)