continue after

This commit is contained in:
parent d014fb7bca
commit a131c78482

1 changed file with 5 additions and 3 deletions
@@ -22,6 +22,7 @@ TIMEOUT = 90 * DAY
 DATA_ROOT = os.path.join(settings.MEDIA_ROOT, 'imdb')
 
 DEBUG = False
+base_url = 'https://www.imdb.com'
 
 def debug(*args, **kwargs):
     if DEBUG:
@@ -37,7 +38,6 @@ def read_url(url, timeout):
     return data
 
 def get_range(from_, to):
-    base_url = 'https://www.imdb.com'
     url = '%s/search/title?adult=include&release_date=%s,%s&sort=release_date,asc&count=50' % (base_url, from_, to)
     return url
 
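For reference, the first two hunks only hoist base_url from get_range() to module scope so other helpers (notably the after-link handling further down) can reuse it. A minimal sketch of how the two pieces fit together; the example date window and the __main__ harness are illustration only, not part of the original file:

# minimal sketch (not part of the commit): module-level base_url shared by helpers
base_url = 'https://www.imdb.com'

def get_range(from_, to):
    # builds the advanced-search URL for one release-date window, 50 results per page
    url = '%s/search/title?adult=include&release_date=%s,%s&sort=release_date,asc&count=50' % (base_url, from_, to)
    return url

if __name__ == '__main__':
    # example date window, chosen for illustration
    print(get_range('2020-01-01', '2020-01-31'))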
@@ -113,8 +113,6 @@ def update_month(year, month, film_counts):
         if film_count > MAX_PER_RANGE and film_count < 2*MAX_PER_RANGE:
             r = update_ids(year, month, day, sort='alpha', expected=film_count)
             save_film_counts(film_counts)
-        elif film_count > MAX_PER_RANGE:
-            print(key, '!!!to many per day', film_count, key)
         else:
             r = update_ids(year, month, day, expected=film_count)
             save_film_counts(film_counts)
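The two deleted lines dropped the special case that only printed a warning for days with at least 2*MAX_PER_RANGE titles; such days now fall through to the plain else branch and can be picked up by the after-link continuation added in the next hunk. A rough sketch of the resulting branching, with stubbed helpers and an assumed MAX_PER_RANGE value, for illustration only:

MAX_PER_RANGE = 10000  # assumed value; the real constant is defined in the script

def update_ids(year, month, day, sort=None, expected=None):
    # stub standing in for the real crawl of one day's result pages
    print('update_ids(%04d-%02d-%02d, sort=%r, expected=%s)' % (year, month, day, sort, expected))

def save_film_counts(film_counts):
    pass  # stub; the real function persists the counts

def handle_day(year, month, day, film_count, film_counts):
    if film_count > MAX_PER_RANGE and film_count < 2*MAX_PER_RANGE:
        update_ids(year, month, day, sort='alpha', expected=film_count)
    else:
        # includes days with film_count >= 2*MAX_PER_RANGE, which previously only warned
        update_ids(year, month, day, expected=film_count)
    save_film_counts(film_counts)

handle_day(2020, 1, 1, 25000, {})  # would have hit the removed elif before this commit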
@@ -168,9 +166,13 @@ def update_ids(year, month=None, day=None, sort=None, expected=None):
     while n:
         start += step
         next_link = 'start=%s&ref_=adv_nxt"' % (start)
+        after_link = 'href="(.*?after=.*?&ref_=adv_nxt)"'
         has_next = re.compile(next_link).findall(data)
+        has_after = re.compile(after_link).findall(data)
         if has_next:
             n = '%s&start=%s' % (url, start)
+        elif sort != 'alpha' and start > MAX_PER_RANGE and has_after:
+            n = '%s%s' % (base_url, has_after[0])
         else:
             n = False
         doc = lxml.html.fromstring(data)
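The added lines are the point of the commit: once numeric start= paging runs past MAX_PER_RANGE and the page no longer offers a start= next link, the crawler now follows the cursor-style after= link instead of stopping. A self-contained sketch of that decision, run against a fabricated page snippet; base_url matches the module-level value above, while MAX_PER_RANGE, the url, the start value and the HTML are assumptions for illustration only:

import re

base_url = 'https://www.imdb.com'
MAX_PER_RANGE = 10000          # assumed; the real constant is defined in the script
sort = None                    # default sort, i.e. not the 'alpha' special case
url = base_url + '/search/title?release_date=2020-01-01,2020-01-31&count=50'
start = 10050                  # pretend we have already paged past the start= limit
data = '<a href="/search/title?release_date=2020-01-01,2020-01-31&after=WzEyMzQ1XQ%3D%3D&ref_=adv_nxt">Next</a>'

next_link = 'start=%s&ref_=adv_nxt"' % (start)
after_link = 'href="(.*?after=.*?&ref_=adv_nxt)"'
has_next = re.compile(next_link).findall(data)    # [] -> numeric paging exhausted
has_after = re.compile(after_link).findall(data)  # ['/search/title?...&ref_=adv_nxt']

if has_next:
    n = '%s&start=%s' % (url, start)              # old numeric paging still available
elif sort != 'alpha' and start > MAX_PER_RANGE and has_after:
    n = '%s%s' % (base_url, has_after[0])         # follow the "after" cursor instead
else:
    n = False

print(n)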