one better
This commit is contained in:
parent
02d415b5fa
commit
fe06a8c664
1 changed file with 65 additions and 45 deletions
|
@@ -76,7 +76,9 @@ def update_year(year, film_counts):
|
||||||
film_counts[key] = film_count
|
film_counts[key] = film_count
|
||||||
update_month(year, month, film_counts)
|
update_month(year, month, film_counts)
|
||||||
else:
|
else:
|
||||||
update_ids(year)
|
r = update_ids(year)
|
||||||
|
if r != film_counts[key]:
|
||||||
|
print('%s: count %s, got ids %s' % (key, film_counts[key], r))
|
||||||
save_film_counts(film_counts)
|
save_film_counts(film_counts)
|
||||||
|
|
||||||
def update_month(year, month, film_counts):
|
def update_month(year, month, film_counts):
|
||||||
|
@@ -92,19 +94,28 @@ def update_month(year, month, film_counts):
|
||||||
if film_count != film_counts.get(key):
|
if film_count != film_counts.get(key):
|
||||||
print_info(key, film_count, film_counts)
|
print_info(key, film_count, film_counts)
|
||||||
film_counts[key] = film_count
|
film_counts[key] = film_count
|
||||||
if film_count > MAX_PER_RANGE:
|
if film_count > MAX_PER_RANGE and film_count < 2*MAX_PER_RANGE:
|
||||||
|
r = update_ids(year, month, day, sort='alpha')
|
||||||
|
if r != film_counts[key]:
|
||||||
|
print('%s: count %s, got ids %s' % (key, film_counts[key], r))
|
||||||
|
save_film_counts(film_counts)
|
||||||
|
elif film_count > MAX_PER_RANGE:
|
||||||
print(key, '!!!to many per day')
|
print(key, '!!!to many per day')
|
||||||
else:
|
else:
|
||||||
update_ids(year, month, day)
|
r = update_ids(year, month, day)
|
||||||
|
if r != film_counts[key]:
|
||||||
|
print('%s: count %s, got ids %s' % (key, film_counts[key], r))
|
||||||
save_film_counts(film_counts)
|
save_film_counts(film_counts)
|
||||||
if days_total != month_total:
|
if days_total != month_total:
|
||||||
print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total))
|
print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total))
|
||||||
else:
|
else:
|
||||||
update_ids(year, month)
|
r = update_ids(year, month)
|
||||||
|
if r != film_counts[key]:
|
||||||
|
print('%s: count %s, got ids %s' % (key, film_counts[key], r))
|
||||||
save_film_counts(film_counts)
|
save_film_counts(film_counts)
|
||||||
|
|
||||||
|
|
||||||
def update_ids(year, month=None, day=None):
|
def update_ids(year, month=None, day=None, sort=None):
|
||||||
films = {}
|
films = {}
|
||||||
if day is not None:
|
if day is not None:
|
||||||
url = get_day(year, month, day)
|
url = get_day(year, month, day)
|
||||||
|
@@ -115,49 +126,58 @@ def update_ids(year, month=None, day=None):
|
||||||
else:
|
else:
|
||||||
url = get_year(year)
|
url = get_year(year)
|
||||||
key = '%04d' % year
|
key = '%04d' % year
|
||||||
data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
|
if sort == 'alpha':
|
||||||
n = True
|
urls = [
|
||||||
page = 2
|
url.replace('sort=release_date,asc', 'sort=alpha,asc'),
|
||||||
while n:
|
url.replace('sort=release_date,asc', 'sort=alpha,desc'),
|
||||||
n = re.compile('Next »</a>', re.DOTALL).findall(data)
|
]
|
||||||
if n:
|
else:
|
||||||
n = '%s&page=%s' % (url, page)
|
urls = [url]
|
||||||
page += 1
|
for url in urls:
|
||||||
doc = lxml.html.fromstring(data)
|
data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
|
||||||
article = doc.find_class('article')
|
n = True
|
||||||
if article:
|
page = 2
|
||||||
article = article[0]
|
while n:
|
||||||
else:
|
n = re.compile('Next »</a>', re.DOTALL).findall(data)
|
||||||
print('no article on', '%s&page=%s' % (url, page-2))
|
if n:
|
||||||
break
|
n = '%s&page=%s' % (url, page)
|
||||||
for header in article.find_class('lister-item-header'):
|
page += 1
|
||||||
a = header.xpath('.//a')[0]
|
doc = lxml.html.fromstring(data)
|
||||||
id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
|
article = doc.find_class('article')
|
||||||
title = a.text_content().strip()
|
if article:
|
||||||
try:
|
article = article[0]
|
||||||
y = header.find_class('lister-item-year')[0].text_content()
|
else:
|
||||||
y = re.sub('\([^\d]+\)', '', y)
|
print('no article on', '%s&page=%s' % (url, page-2))
|
||||||
y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
|
break
|
||||||
if not y:
|
for header in article.find_class('lister-item-header'):
|
||||||
y = year
|
a = header.xpath('.//a')[0]
|
||||||
else:
|
id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
|
||||||
y = int(y)
|
title = a.text_content().strip()
|
||||||
except:
|
try:
|
||||||
print(n)
|
y = header.find_class('lister-item-year')[0].text_content()
|
||||||
print(header.find_class('lister-item-year')[0].text_content())
|
y = re.sub('\([^\d]+\)', '', y)
|
||||||
raise
|
y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
|
||||||
if id not in films:
|
if not y:
|
||||||
films[id] = {
|
y = year
|
||||||
'title': title,
|
else:
|
||||||
'year': y
|
y = int(y)
|
||||||
}
|
except:
|
||||||
#print(key, len(films), 'films')
|
print(n)
|
||||||
if n:
|
print(header.find_class('lister-item-year')[0].text_content())
|
||||||
#print(n)
|
raise
|
||||||
data = ox.web.imdb.read_url(n, unicode=True, timeout=TIMEOUT)
|
if id not in films:
|
||||||
|
films[id] = {
|
||||||
|
'title': title,
|
||||||
|
'year': y
|
||||||
|
}
|
||||||
|
#print(key, len(films), 'films')
|
||||||
|
if n:
|
||||||
|
#print(n)
|
||||||
|
data = ox.web.imdb.read_url(n, unicode=True, timeout=TIMEOUT)
|
||||||
path = get_path('ids/%s.json' % key)
|
path = get_path('ids/%s.json' % key)
|
||||||
with open(path, 'w') as fd:
|
with open(path, 'w') as fd:
|
||||||
json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True)
|
json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True)
|
||||||
|
return len(films)
|
||||||
|
|
||||||
def save_film_counts(film_counts):
|
def save_film_counts(film_counts):
|
||||||
with open(get_path('film_counts.json'), 'w') as fd:
|
with open(get_path('film_counts.json'), 'w') as fd:
|
||||||
|
|
Loading…
Reference in a new issue