get episode ids

This commit is contained in:
j 2018-05-03 15:35:02 +02:00
parent fe06a8c664
commit ca50d091a8
1 changed file with 13 additions and 2 deletions

View File

@@ -46,6 +46,8 @@ def get_film_count(year, month=None, day=None):
url = get_year(year)
data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
total = re.compile('<span class="lister-current-last-item">50</span>.*?of (.*?) titles', re.DOTALL).findall(data)
if not total:
total = re.compile(' ([\d+,]+) titles\n', re.DOTALL).findall(data)
if total:
return int(total[0].replace(',', ''))
print('no movies', url)
@@ -150,9 +152,18 @@ def update_ids(year, month=None, day=None, sort=None):
print('no article on', '%s&page=%s' % (url, page-2))
break
for header in article.find_class('lister-item-header'):
a = header.xpath('.//a')[0]
a = header.xpath('.//a')
if 'Episode:' in [
e.text_content()
for e in header.xpath(".//small")
] and len(a) > 1:
title = a[0].text_content().strip() + ': '
a = a[1]
else:
title = ''
a = a[0]
id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
title = a.text_content().strip()
title += a.text_content().strip()
try:
y = header.find_class('lister-item-year')[0].text_content()
y = re.sub('\([^\d]+\)', '', y)