From d5d45ad681daa053456e9f1c17bc77b5f44f8ef2 Mon Sep 17 00:00:00 2001 From: j Date: Fri, 2 Aug 2019 14:35:44 +0200 Subject: [PATCH] fix imdb id scanner --- oxdata/movie/imdbids.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/oxdata/movie/imdbids.py b/oxdata/movie/imdbids.py index d4f2955..2f7bac1 100644 --- a/oxdata/movie/imdbids.py +++ b/oxdata/movie/imdbids.py @@ -160,19 +160,20 @@ def update_ids(year, month=None, day=None, sort=None): for url in urls: data = read_url(url, timeout=TIMEOUT) n = True - page = 2 + step = 50 + start = 1 while n: n = re.compile('Next »', re.DOTALL).findall(data) if n: - n = '%s&page=%s' % (url, page) - page += 1 + n = '%s&start=%s' % (url, start) + start += step doc = lxml.html.fromstring(data) article = doc.find_class('article') if article: article = article[0] else: - print('no article on', '%s&page=%s' % (url, page-2)) - ox.web.imdb.delete_url('%s&page=%s' % (url, page-2)) + print('no article on', '%s&start=%s' % (url, start - 2*step)) + ox.web.imdb.delete_url('%s&start=%s' % (url, start - 2*step)) break for content in article.find_class('lister-item-content'): header = content.find_class('lister-item-header')[0]