get episode ids
This commit is contained in:
parent
fe06a8c664
commit
ca50d091a8
1 changed file with 13 additions and 2 deletions
|
@ -46,6 +46,8 @@ def get_film_count(year, month=None, day=None):
|
|||
url = get_year(year)
|
||||
data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
|
||||
total = re.compile('<span class="lister-current-last-item">50</span>.*?of (.*?) titles', re.DOTALL).findall(data)
|
||||
if not total:
|
||||
total = re.compile(' ([\d+,]+) titles\n', re.DOTALL).findall(data)
|
||||
if total:
|
||||
return int(total[0].replace(',', ''))
|
||||
print('no movies', url)
|
||||
|
@ -150,9 +152,18 @@ def update_ids(year, month=None, day=None, sort=None):
|
|||
print('no article on', '%s&page=%s' % (url, page-2))
|
||||
break
|
||||
for header in article.find_class('lister-item-header'):
|
||||
a = header.xpath('.//a')[0]
|
||||
a = header.xpath('.//a')
|
||||
if 'Episode:' in [
|
||||
e.text_content()
|
||||
for e in header.xpath(".//small")
|
||||
] and len(a) > 1:
|
||||
title = a[0].text_content().strip() + ': '
|
||||
a = a[1]
|
||||
else:
|
||||
title = ''
|
||||
a = a[0]
|
||||
id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
|
||||
title = a.text_content().strip()
|
||||
title += a.text_content().strip()
|
||||
try:
|
||||
y = header.find_class('lister-item-year')[0].text_content()
|
||||
y = re.sub('\([^\d]+\)', '', y)
|
||||
|
|
Loading…
Reference in a new issue