Patch for oxdata/movie/imdbids.py.

Summary of what the hunks below do:
  * base_url stops being a module-level constant and becomes a local
    variable of get_range().
  * get_film_count(): the fallback pattern for the title count now requires
    a leading space before the number and a newline right after "titles".
  * update_month(): a day whose film_count is above MAX_PER_RANGE but not
    below 2*MAX_PER_RANGE is only reported with print(); update_ids() and
    save_film_counts() are not called in that branch.
  * update_ids(): the "after=" pagination fallback (after_link / has_after)
    is removed; the loop continues only while a "start=" next link is found.

NOTE(review): this patch was recovered from a copy whose newlines and
indentation had been collapsed. Blank context lines and the nesting depth of
the update_month() and update_ids() hunks are reconstructed from Python
syntax and the hunk line counts - verify against the target file before
applying.
NOTE(review): the last hunk header shows update_ids() declaring a parameter
named "expexted", while the update_month() calls pass "expected=" - confirm
the signature, since a mismatched keyword raises TypeError.
NOTE(review): the regex literals use "\d" inside non-raw strings; raw
strings (r'...') would avoid Python's invalid-escape-sequence warning.
re.DOTALL has no effect on the new pattern because it contains no ".".

diff --git a/oxdata/movie/imdbids.py b/oxdata/movie/imdbids.py
index 94b8453..c0966a1 100644
--- a/oxdata/movie/imdbids.py
+++ b/oxdata/movie/imdbids.py
@@ -22,7 +22,6 @@ TIMEOUT = 90 * DAY
 DATA_ROOT = os.path.join(settings.MEDIA_ROOT, 'imdb')
 DEBUG = False
 
-base_url = 'https://www.imdb.com'
 
 def debug(*args, **kwargs):
     if DEBUG:
@@ -38,6 +37,7 @@ def read_url(url, timeout):
     return data
 
 def get_range(from_, to):
+    base_url = 'https://www.imdb.com'
     url = '%s/search/title?adult=include&release_date=%s,%s&sort=release_date,asc&count=50' % (base_url, from_, to)
     return url
 
@@ -63,7 +63,7 @@ def get_film_count(year, month=None, day=None):
     data = read_url(url, timeout=TIMEOUT)
     total = re.compile('1-50 of ([\d,]+?) titles.').findall(data)
     if not total:
-        total = re.compile('([\d,]+) titles.', re.DOTALL).findall(data)
+        total = re.compile(' ([\d,]+) titles\n', re.DOTALL).findall(data)
     if total:
         return int(total[0].replace(',', ''))
     print('no movies', url)
@@ -113,6 +113,8 @@ def update_month(year, month, film_counts):
                 if film_count > MAX_PER_RANGE and film_count < 2*MAX_PER_RANGE:
                     r = update_ids(year, month, day, sort='alpha', expected=film_count)
                     save_film_counts(film_counts)
+                elif film_count > MAX_PER_RANGE:
+                    print(key, '!!!to many per day', film_count, key)
                 else:
                     r = update_ids(year, month, day, expected=film_count)
                     save_film_counts(film_counts)
@@ -166,13 +168,9 @@ def update_ids(year, month=None, day=None, sort=None, expexted=None):
     while n:
         start += step
         next_link = 'start=%s&ref_=adv_nxt"' % (start)
-        after_link = 'href="(.*?after=.*?&ref_=adv_nxt)"'
         has_next = re.compile(next_link).findall(data)
-        has_after = re.compile(after_link).findall(data)
         if has_next:
             n = '%s&start=%s' % (url, start)
-        elif sort != 'alpha' and start > MAX_PER_RANGE and has_after:
-            n = '%s%s' % (base_url, has_after[0])
         else:
             n = False
         doc = lxml.html.fromstring(data)