get episode ids

2018-05-03 15:35:02 +02:00 · 2018-05-03 15:35:02 +02:00 · ca50d091a8
commit ca50d091a8
parent fe06a8c664
1 changed file with 13 additions and 2 deletions
--- a/oxdata/movie/imdbids.py
+++ b/oxdata/movie/imdbids.py
@@ -46,6 +46,8 @@ def get_film_count(year, month=None, day=None):
        url = get_year(year)
    data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
    total = re.compile('<span class="lister-current-last-item">50</span>.*?of (.*?) titles', re.DOTALL).findall(data)
+    if not total:
+        total = re.compile(' ([\d+,]+) titles\n', re.DOTALL).findall(data)
    if total:
        return int(total[0].replace(',', ''))
    print('no movies', url)
@@ -150,9 +152,18 @@ def update_ids(year, month=None, day=None, sort=None):
                print('no article on', '%s&page=%s' % (url, page-2))
                break
            for header in article.find_class('lister-item-header'):
-                a = header.xpath('.//a')[0]
+                a = header.xpath('.//a')
+                if 'Episode:' in [
+                    e.text_content()
+                    for e in header.xpath(".//small")
+                ] and len(a) > 1:
+                    title = a[0].text_content().strip() + ': '
+                    a = a[1]
+                else:
+                    title = ''
+                    a = a[0]
                id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
-                title = a.text_content().strip()
+                title += a.text_content().strip()
                try:
                    y = header.find_class('lister-item-year')[0].text_content()
                    y = re.sub('\([^\d]+\)', '', y)