fix imdb id scanner
This commit is contained in:
parent
6b12cf24af
commit
d5d45ad681
1 changed file with 6 additions and 5 deletions
|
@@ -160,19 +160,20 @@ def update_ids(year, month=None, day=None, sort=None):
|
|||
for url in urls:
|
||||
data = read_url(url, timeout=TIMEOUT)
|
||||
n = True
|
||||
page = 2
|
||||
step = 50
|
||||
start = 1
|
||||
while n:
|
||||
n = re.compile('Next »</a>', re.DOTALL).findall(data)
|
||||
if n:
|
||||
n = '%s&page=%s' % (url, page)
|
||||
page += 1
|
||||
n = '%s&start=%s' % (url, start)
|
||||
start += step
|
||||
doc = lxml.html.fromstring(data)
|
||||
article = doc.find_class('article')
|
||||
if article:
|
||||
article = article[0]
|
||||
else:
|
||||
print('no article on', '%s&page=%s' % (url, page-2))
|
||||
ox.web.imdb.delete_url('%s&page=%s' % (url, page-2))
|
||||
print('no article on', '%s&start=%s' % (url, start - 2*step))
|
||||
ox.web.imdb.delete_url('%s&start=%s' % (url, start - 2*step))
|
||||
break
|
||||
for content in article.find_class('lister-item-content'):
|
||||
header = content.find_class('lister-item-header')[0]
|
||||
|
|
Loading…
Reference in a new issue