Fix IMDb ID scanner: paginate with `start` offsets (step 50) instead of `page` numbers
This commit is contained in:
parent
6b12cf24af
commit
d5d45ad681
1 changed file with 6 additions and 5 deletions
|
@@ -160,19 +160,20 @@ def update_ids(year, month=None, day=None, sort=None):
|
||||||
for url in urls:
|
for url in urls:
|
||||||
data = read_url(url, timeout=TIMEOUT)
|
data = read_url(url, timeout=TIMEOUT)
|
||||||
n = True
|
n = True
|
||||||
page = 2
|
step = 50
|
||||||
|
start = 1
|
||||||
while n:
|
while n:
|
||||||
n = re.compile('Next »</a>', re.DOTALL).findall(data)
|
n = re.compile('Next »</a>', re.DOTALL).findall(data)
|
||||||
if n:
|
if n:
|
||||||
n = '%s&page=%s' % (url, page)
|
n = '%s&start=%s' % (url, start)
|
||||||
page += 1
|
start += step
|
||||||
doc = lxml.html.fromstring(data)
|
doc = lxml.html.fromstring(data)
|
||||||
article = doc.find_class('article')
|
article = doc.find_class('article')
|
||||||
if article:
|
if article:
|
||||||
article = article[0]
|
article = article[0]
|
||||||
else:
|
else:
|
||||||
print('no article on', '%s&page=%s' % (url, page-2))
|
print('no article on', '%s&start=%s' % (url, start - 2*step))
|
||||||
ox.web.imdb.delete_url('%s&page=%s' % (url, page-2))
|
ox.web.imdb.delete_url('%s&start=%s' % (url, start - 2*step))
|
||||||
break
|
break
|
||||||
for content in article.find_class('lister-item-content'):
|
for content in article.find_class('lister-item-content'):
|
||||||
header = content.find_class('lister-item-header')[0]
|
header = content.find_class('lister-item-header')[0]
|
||||||
|
|
Loading…
Reference in a new issue