diff --git a/oxdata/movie/imdbids.py b/oxdata/movie/imdbids.py index 2f7bac1..c0966a1 100644 --- a/oxdata/movie/imdbids.py +++ b/oxdata/movie/imdbids.py @@ -21,6 +21,13 @@ DAY = 24 * 60 * 60 TIMEOUT = 90 * DAY DATA_ROOT = os.path.join(settings.MEDIA_ROOT, 'imdb') +DEBUG = False + +def debug(*args, **kwargs): + if DEBUG: + print(*args, **kwargs) + + def read_url(url, timeout): data = ox.web.imdb.read_url(url, unicode=True, timeout=timeout) while '>500 Error - IMDb<' in data: @@ -85,11 +92,9 @@ def update_year(year, film_counts): if film_count != film_counts.get(key): print_info(key, film_count, film_counts) film_counts[key] = film_count update_month(year, month, film_counts) else: r = update_ids(year) - if r != film_counts[key]: - print('%s: count %s, got ids %s' % (key, film_counts[key], r)) save_film_counts(film_counts) def update_month(year, month, film_counts): @@ -106,23 +111,17 @@ def update_month(year, month, film_counts): print_info(key, film_count, film_counts) film_counts[key] = film_count if film_count > MAX_PER_RANGE and film_count < 2*MAX_PER_RANGE: - r = update_ids(year, month, day, sort='alpha') - if r != film_counts[key]: - print('%s: count %s, got ids %s' % (key, film_counts[key], r)) + r = update_ids(year, month, day, sort='alpha', expected=film_count) save_film_counts(film_counts) elif film_count > MAX_PER_RANGE: - print(key, '!!!to many per day') + print(key, '!!!to many per day', film_count, key) else: - r = update_ids(year, month, day) - if r != film_counts[key]: - print('%s: count %s, got ids %s' % (key, film_counts[key], r)) + r = update_ids(year, month, day, expected=film_count) save_film_counts(film_counts) if days_total != month_total: print('!! 
month and days don\'t add up: %s month vs %s days total' % (month_total, days_total)) else: - r = update_ids(year, month) - if r != film_counts[key]: - print('%s: count %s, got ids %s' % (key, film_counts[key], r)) + r = update_ids(year, month, expected=film_count) save_film_counts(film_counts) def parse_cast(string): @@ -139,7 +138,7 @@ def parse_cast(string): results[rkey] = cast[key].split(', \n') return results -def update_ids(year, month=None, day=None, sort=None): +def update_ids(year, month=None, day=None, sort=None, expected=None): films = {} if day is not None: url = get_day(year, month, day) @@ -157,16 +156,23 @@ def update_ids(year, month=None, day=None, sort=None): ] else: urls = [url] + + if not expected: + expected = get_film_count(year, month, day) + for url in urls: data = read_url(url, timeout=TIMEOUT) n = True step = 50 start = 1 while n: - n = re.compile('Next »', re.DOTALL).findall(data) - if n: + start += step + next_link = 'start=%s&ref_=adv_nxt"' % (start) + has_next = re.compile(next_link).findall(data) + if has_next: n = '%s&start=%s' % (url, start) - start += step + else: + n = False doc = lxml.html.fromstring(data) article = doc.find_class('article') if article: @@ -228,21 +234,26 @@ def update_ids(year, month=None, day=None, sort=None): films[id]['genre'] = genre if cast: films[id].update(cast) - - #print(key, len(films), 'films') + if expected and len(films) == expected and sort == 'alpha': + n = False + debug('%s: %s of %s films - next: %s' % (key, len(films), expected, n)) if n: - #print(n) data = read_url(n, timeout=TIMEOUT) path = get_path('ids/%s.json' % key) with open(path, 'w') as fd: json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True) - return len(films) + r = len(films) + if r != expected: + print('%s: got %s, expected %s' % (key, r, expected)) + return r def save_film_counts(film_counts): with open(get_path('film_counts.json'), 'w') as fd: json.dump(film_counts, fd, indent=4, sort_keys=True) -def update_index(): 
+def update_index(from_year=None): + if from_year is None: + from_year = 1874 film_counts_json = get_path('film_counts.json') if os.path.exists(film_counts_json): with open(film_counts_json) as fd: @@ -250,7 +261,7 @@ def update_index(): else: film_counts = {} - for year in range(1894, datetime.now().year+1): + for year in range(from_year, datetime.now().year+1): film_count = get_film_count(year) key = '%s' % year if film_count != film_counts.get(key): diff --git a/oxdata/movie/models.py b/oxdata/movie/models.py index 8ee6195..67da419 100644 --- a/oxdata/movie/models.py +++ b/oxdata/movie/models.py @@ -255,13 +255,13 @@ class Imdb(models.Model): j['year'] = int(j['year']) return j -def get_new_ids(timeout=-1): +def get_new_ids(timeout=-1, from_year=None): new_ids_cache = '/tmp/missing.json' if os.path.exists(new_ids_cache): with open(new_ids_cache) as fd: new_ids = set(json.load(fd)) else: - update_index() + update_index(from_year) known_ids = frozenset([i['imdb'] for i in Imdb.objects.all().values('imdb')]) new_ids = get_unknown_ids(known_ids) if new_ids: