get episode ids
parent fe06a8c664
commit ca50d091a8
1 changed file with 13 additions and 2 deletions
@@ -46,6 +46,8 @@ def get_film_count(year, month=None, day=None):
         url = get_year(year)
     data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
     total = re.compile('<span class="lister-current-last-item">50</span>.*?of (.*?) titles', re.DOTALL).findall(data)
+    if not total:
+        total = re.compile(' ([\d+,]+) titles\n', re.DOTALL).findall(data)
     if total:
         return int(total[0].replace(',', ''))
     print('no movies', url)
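For reference, the fallback pattern added above covers listing pages where the lister-current-last-item span is absent. A minimal sketch of the two-step count extraction, run against made-up fragments (the snippets below are illustrative, not real IMDb responses):

import re

# Hypothetical page fragments, for illustration only.
with_span = '<span class="lister-current-last-item">50</span> of 1,234 titles'
without_span = 'Showing results.\n 1,234 titles\n'

def film_count(data):
    # Primary pattern, unchanged from get_film_count().
    total = re.compile(r'<span class="lister-current-last-item">50</span>.*?of (.*?) titles', re.DOTALL).findall(data)
    if not total:
        # Fallback added by this commit for pages without the span.
        total = re.compile(r' ([\d+,]+) titles\n', re.DOTALL).findall(data)
    if total:
        return int(total[0].replace(',', ''))
    return None

print(film_count(with_span))     # -> 1234
print(film_count(without_span))  # -> 1234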
@@ -150,9 +152,18 @@ def update_ids(year, month=None, day=None, sort=None):
             print('no article on', '%s&page=%s' % (url, page-2))
             break
         for header in article.find_class('lister-item-header'):
-            a = header.xpath('.//a')[0]
+            a = header.xpath('.//a')
+            if 'Episode:' in [
+                e.text_content()
+                for e in header.xpath(".//small")
+            ] and len(a) > 1:
+                title = a[0].text_content().strip() + ': '
+                a = a[1]
+            else:
+                title = ''
+                a = a[0]
             id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
-            title = a.text_content().strip()
+            title += a.text_content().strip()
             try:
                 y = header.find_class('lister-item-year')[0].text_content()
                 y = re.sub('\([^\d]+\)', '', y)
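This hunk stops taking only the first link in each lister-item header: when the header carries an "Episode:" label and more than one link, the series name becomes a title prefix and the id is taken from the episode link. A minimal sketch of that logic applied to a hypothetical header fragment (the titles and hrefs below are examples, not output of this script):

import re
from lxml.html import fromstring

# Hypothetical lister-item header for an episode row, for illustration only.
html = '''
<h3 class="lister-item-header">
  <a href="/title/tt0944947/">Game of Thrones</a>
  <small>Episode:</small>
  <a href="/title/tt1480055/">Winter Is Coming</a>
</h3>
'''

header = fromstring(html)
a = header.xpath('.//a')
if 'Episode:' in [
    e.text_content()
    for e in header.xpath(".//small")
] and len(a) > 1:
    # Series title becomes a prefix; the episode link supplies the id.
    title = a[0].text_content().strip() + ': '
    a = a[1]
else:
    title = ''
    a = a[0]
id = re.compile(r'title/tt(\d{7})').findall(a.attrib['href'])[0]
title += a.text_content().strip()
print(id, title)   # -> 1480055 Game of Thrones: Winter Is Coming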