From fe06a8c6645396f6c6149b896f9509e3c556ef7d Mon Sep 17 00:00:00 2001 From: j Date: Thu, 3 May 2018 12:11:51 +0200 Subject: [PATCH] one better --- oxdata/movie/imdbids.py | 110 ++++++++++++++++++++++++---------------- 1 file changed, 65 insertions(+), 45 deletions(-) diff --git a/oxdata/movie/imdbids.py b/oxdata/movie/imdbids.py index 671ea41..b102742 100644 --- a/oxdata/movie/imdbids.py +++ b/oxdata/movie/imdbids.py @@ -76,7 +76,9 @@ def update_year(year, film_counts): film_counts[key] = film_count update_month(year, month, film_counts) else: - update_ids(year) + r = update_ids(year) + if r != film_counts[key]: + print('%s: count %s, got ids %s' % (key, film_counts[key], r)) save_film_counts(film_counts) def update_month(year, month, film_counts): @@ -92,19 +94,28 @@ def update_month(year, month, film_counts): if film_count != film_counts.get(key): print_info(key, film_count, film_counts) film_counts[key] = film_count - if film_count > MAX_PER_RANGE: + if film_count > MAX_PER_RANGE and film_count < 2*MAX_PER_RANGE: + r = update_ids(year, month, day, sort='alpha') + if r != film_counts[key]: + print('%s: count %s, got ids %s' % (key, film_counts[key], r)) + save_film_counts(film_counts) + elif film_count > MAX_PER_RANGE: print(key, '!!!to many per day') else: - update_ids(year, month, day) + r = update_ids(year, month, day) + if r != film_counts[key]: + print('%s: count %s, got ids %s' % (key, film_counts[key], r)) save_film_counts(film_counts) if days_total != month_total: print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total)) else: - update_ids(year, month) + r = update_ids(year, month) + if r != film_counts[key]: + print('%s: count %s, got ids %s' % (key, film_counts[key], r)) save_film_counts(film_counts) -def update_ids(year, month=None, day=None): +def update_ids(year, month=None, day=None, sort=None): films = {} if day is not None: url = get_day(year, month, day) @@ -115,49 +126,58 @@ def update_ids(year, month=None, day=None): else: url = get_year(year) key = '%04d' % year - data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT) - n = True - page = 2 - while n: - n = re.compile('Next »', re.DOTALL).findall(data) - if n: - n = '%s&page=%s' % (url, page) - page += 1 - doc = lxml.html.fromstring(data) - article = doc.find_class('article') - if article: - article = article[0] - else: - print('no article on', '%s&page=%s' % (url, page-2)) - break - for header in article.find_class('lister-item-header'): - a = header.xpath('.//a')[0] - id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0] - title = a.text_content().strip() - try: - y = header.find_class('lister-item-year')[0].text_content() - y = re.sub('\([^\d]+\)', '', y) - y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip() - if not y: - y = year - else: - y = int(y) - except: - print(n) - print(header.find_class('lister-item-year')[0].text_content()) - raise - if id not in films: - films[id] = { - 'title': title, - 'year': y - } - #print(key, len(films), 'films') - if n: - #print(n) - data = ox.web.imdb.read_url(n, unicode=True, timeout=TIMEOUT) + if sort == 'alpha': + urls = [ + url.replace('sort=release_date,asc', 'sort=alpha,asc'), + url.replace('sort=release_date,asc', 'sort=alpha,desc'), + ] + else: + urls = [url] + for url in urls: + data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT) + n = True + page = 2 + while n: + n = re.compile('Next »', re.DOTALL).findall(data) + if n: + n = '%s&page=%s' % (url, page) + page += 1 + doc = lxml.html.fromstring(data) + article = doc.find_class('article') + if article: + article = article[0] + else: + print('no article on', '%s&page=%s' % (url, page-2)) + break + for header in article.find_class('lister-item-header'): + a = header.xpath('.//a')[0] + id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0] + title = a.text_content().strip() + try: + y = header.find_class('lister-item-year')[0].text_content() + y = re.sub('\([^\d]+\)', '', y) + y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip() + if not y: + y = year + else: + y = int(y) + except: + print(n) + print(header.find_class('lister-item-year')[0].text_content()) + raise + if id not in films: + films[id] = { + 'title': title, + 'year': y + } + #print(key, len(films), 'films') + if n: + #print(n) + data = ox.web.imdb.read_url(n, unicode=True, timeout=TIMEOUT) path = get_path('ids/%s.json' % key) with open(path, 'w') as fd: json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True) + return len(films) def save_film_counts(film_counts): with open(get_path('film_counts.json'), 'w') as fd: