From ca50d091a8ad4e0927ffb6f477fc8ef4a3d488f3 Mon Sep 17 00:00:00 2001 From: j Date: Thu, 3 May 2018 15:35:02 +0200 Subject: [PATCH] get episode ids --- oxdata/movie/imdbids.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/oxdata/movie/imdbids.py b/oxdata/movie/imdbids.py index b102742..6a1984e 100644 --- a/oxdata/movie/imdbids.py +++ b/oxdata/movie/imdbids.py @@ -46,6 +46,8 @@ def get_film_count(year, month=None, day=None): url = get_year(year) data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT) total = re.compile('50.*?of (.*?) titles', re.DOTALL).findall(data) + if not total: + total = re.compile(' ([\d+,]+) titles\n', re.DOTALL).findall(data) if total: return int(total[0].replace(',', '')) print('no movies', url) @@ -150,9 +152,18 @@ def update_ids(year, month=None, day=None, sort=None): print('no article on', '%s&page=%s' % (url, page-2)) break for header in article.find_class('lister-item-header'): - a = header.xpath('.//a')[0] + a = header.xpath('.//a') + if 'Episode:' in [ + e.text_content() + for e in header.xpath(".//small") + ] and len(a) > 1: + title = a[0].text_content().strip() + ': ' + a = a[1] + else: + title = '' + a = a[0] id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0] - title = a.text_content().strip() + title += a.text_content().strip() try: y = header.find_class('lister-item-year')[0].text_content() y = re.sub('\([^\d]+\)', '', y)