Compare commits

...

2 Commits

Author SHA1 Message Date
j a131c78482 continue after 2019-08-05 12:13:18 +02:00
j d014fb7bca fix count < 50 2019-08-05 10:33:29 +02:00
1 changed file with 6 additions and 4 deletions

View File

@@ -22,6 +22,7 @@ TIMEOUT = 90 * DAY
DATA_ROOT = os.path.join(settings.MEDIA_ROOT, 'imdb')
DEBUG = False
base_url = 'https://www.imdb.com'
def debug(*args, **kwargs):
if DEBUG:
@@ -37,7 +38,6 @@ def read_url(url, timeout):
return data
def get_range(from_, to):
base_url = 'https://www.imdb.com'
url = '%s/search/title?adult=include&release_date=%s,%s&sort=release_date,asc&count=50' % (base_url, from_, to)
return url
@@ -63,7 +63,7 @@ def get_film_count(year, month=None, day=None):
data = read_url(url, timeout=TIMEOUT)
total = re.compile('<span>1-50 of ([\d,]+?) titles.</span>').findall(data)
if not total:
total = re.compile(' ([\d,]+) titles\n', re.DOTALL).findall(data)
total = re.compile('<span>([\d,]+) titles.</span>', re.DOTALL).findall(data)
if total:
return int(total[0].replace(',', ''))
print('no movies', url)
@@ -113,8 +113,6 @@ def update_month(year, month, film_counts):
if film_count > MAX_PER_RANGE and film_count < 2*MAX_PER_RANGE:
r = update_ids(year, month, day, sort='alpha', expected=film_count)
save_film_counts(film_counts)
elif film_count > MAX_PER_RANGE:
print(key, '!!!to many per day', film_count, key)
else:
r = update_ids(year, month, day, expected=film_count)
save_film_counts(film_counts)
@@ -168,9 +166,13 @@ def update_ids(year, month=None, day=None, sort=None, expexted=None):
while n:
start += step
next_link = 'start=%s&ref_=adv_nxt"' % (start)
after_link = 'href="(.*?after=.*?&ref_=adv_nxt)"'
has_next = re.compile(next_link).findall(data)
has_after = re.compile(after_link).findall(data)
if has_next:
n = '%s&start=%s' % (url, start)
elif sort != 'alpha' and start > MAX_PER_RANGE and has_after:
n = '%s%s' % (base_url, has_after[0])
else:
n = False
doc = lxml.html.fromstring(data)