one better

This commit is contained in:
j 2018-05-03 12:11:51 +02:00
parent 02d415b5fa
commit fe06a8c664

View file

@ -76,7 +76,9 @@ def update_year(year, film_counts):
film_counts[key] = film_count film_counts[key] = film_count
update_month(year, month, film_counts) update_month(year, month, film_counts)
else: else:
update_ids(year) r = update_ids(year)
if r != film_counts[key]:
print('%s: count %s, got ids %s' % (key, film_counts[key], r))
save_film_counts(film_counts) save_film_counts(film_counts)
def update_month(year, month, film_counts): def update_month(year, month, film_counts):
@ -92,19 +94,28 @@ def update_month(year, month, film_counts):
if film_count != film_counts.get(key): if film_count != film_counts.get(key):
print_info(key, film_count, film_counts) print_info(key, film_count, film_counts)
film_counts[key] = film_count film_counts[key] = film_count
if film_count > MAX_PER_RANGE: if film_count > MAX_PER_RANGE and film_count < 2*MAX_PER_RANGE:
r = update_ids(year, month, day, sort='alpha')
if r != film_counts[key]:
print('%s: count %s, got ids %s' % (key, film_counts[key], r))
save_film_counts(film_counts)
elif film_count > MAX_PER_RANGE:
print(key, '!!!to many per day') print(key, '!!!to many per day')
else: else:
update_ids(year, month, day) r = update_ids(year, month, day)
if r != film_counts[key]:
print('%s: count %s, got ids %s' % (key, film_counts[key], r))
save_film_counts(film_counts) save_film_counts(film_counts)
if days_total != month_total: if days_total != month_total:
print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total)) print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total))
else: else:
update_ids(year, month) r = update_ids(year, month)
if r != film_counts[key]:
print('%s: count %s, got ids %s' % (key, film_counts[key], r))
save_film_counts(film_counts) save_film_counts(film_counts)
def update_ids(year, month=None, day=None): def update_ids(year, month=None, day=None, sort=None):
films = {} films = {}
if day is not None: if day is not None:
url = get_day(year, month, day) url = get_day(year, month, day)
@ -115,49 +126,58 @@ def update_ids(year, month=None, day=None):
else: else:
url = get_year(year) url = get_year(year)
key = '%04d' % year key = '%04d' % year
data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT) if sort == 'alpha':
n = True urls = [
page = 2 url.replace('sort=release_date,asc', 'sort=alpha,asc'),
while n: url.replace('sort=release_date,asc', 'sort=alpha,desc'),
n = re.compile('Next &#187;</a>', re.DOTALL).findall(data) ]
if n: else:
n = '%s&page=%s' % (url, page) urls = [url]
page += 1 for url in urls:
doc = lxml.html.fromstring(data) data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
article = doc.find_class('article') n = True
if article: page = 2
article = article[0] while n:
else: n = re.compile('Next &#187;</a>', re.DOTALL).findall(data)
print('no article on', '%s&page=%s' % (url, page-2)) if n:
break n = '%s&page=%s' % (url, page)
for header in article.find_class('lister-item-header'): page += 1
a = header.xpath('.//a')[0] doc = lxml.html.fromstring(data)
id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0] article = doc.find_class('article')
title = a.text_content().strip() if article:
try: article = article[0]
y = header.find_class('lister-item-year')[0].text_content() else:
y = re.sub('\([^\d]+\)', '', y) print('no article on', '%s&page=%s' % (url, page-2))
y = y.rsplit('(', 1)[-1].split(')')[0].split('')[0].split(' ')[0].strip() break
if not y: for header in article.find_class('lister-item-header'):
y = year a = header.xpath('.//a')[0]
else: id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
y = int(y) title = a.text_content().strip()
except: try:
print(n) y = header.find_class('lister-item-year')[0].text_content()
print(header.find_class('lister-item-year')[0].text_content()) y = re.sub('\([^\d]+\)', '', y)
raise y = y.rsplit('(', 1)[-1].split(')')[0].split('')[0].split(' ')[0].strip()
if id not in films: if not y:
films[id] = { y = year
'title': title, else:
'year': y y = int(y)
} except:
#print(key, len(films), 'films') print(n)
if n: print(header.find_class('lister-item-year')[0].text_content())
#print(n) raise
data = ox.web.imdb.read_url(n, unicode=True, timeout=TIMEOUT) if id not in films:
films[id] = {
'title': title,
'year': y
}
#print(key, len(films), 'films')
if n:
#print(n)
data = ox.web.imdb.read_url(n, unicode=True, timeout=TIMEOUT)
path = get_path('ids/%s.json' % key) path = get_path('ids/%s.json' % key)
with open(path, 'w') as fd: with open(path, 'w') as fd:
json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True) json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True)
return len(films)
def save_film_counts(film_counts): def save_film_counts(film_counts):
with open(get_path('film_counts.json'), 'w') as fd: with open(get_path('film_counts.json'), 'w') as fd: