From ca50d091a8ad4e0927ffb6f477fc8ef4a3d488f3 Mon Sep 17 00:00:00 2001
From: j <j@mailb.org>
Date: Thu, 3 May 2018 15:35:02 +0200
Subject: [PATCH 1/4] get episode ids

---
 oxdata/movie/imdbids.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/oxdata/movie/imdbids.py b/oxdata/movie/imdbids.py
index b102742..6a1984e 100644
--- a/oxdata/movie/imdbids.py
+++ b/oxdata/movie/imdbids.py
@@ -46,6 +46,8 @@ def get_film_count(year, month=None, day=None):
         url = get_year(year)
     data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
     total = re.compile('<span class="lister-current-last-item">50</span>.*?of (.*?) titles', re.DOTALL).findall(data)
+    if not total:
+        total = re.compile(' ([\d+,]+) titles\n', re.DOTALL).findall(data)
     if total:
         return int(total[0].replace(',', ''))
     print('no movies', url)
@@ -150,9 +152,18 @@ def update_ids(year, month=None, day=None, sort=None):
                 print('no article on', '%s&page=%s' % (url, page-2))
                 break
             for header in article.find_class('lister-item-header'):
-                a = header.xpath('.//a')[0]
+                a = header.xpath('.//a')
+                if 'Episode:' in [
+                    e.text_content()
+                    for e in header.xpath(".//small")
+                ] and len(a) > 1:
+                    title = a[0].text_content().strip() + ': '
+                    a = a[1]
+                else:
+                    title = ''
+                    a = a[0]
                 id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
-                title = a.text_content().strip()
+                title += a.text_content().strip()
                 try:
                     y = header.find_class('lister-item-year')[0].text_content()
                     y = re.sub('\([^\d]+\)', '', y)

From e22d5c5ad0eae0ec83d20b374ed27dd8d5f9cbcd Mon Sep 17 00:00:00 2001
From: j <j@mailb.org>
Date: Sun, 3 Jun 2018 13:58:53 +0200
Subject: [PATCH 2/4] ignore

---
 oxdata/lookup/cache.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/oxdata/lookup/cache.py b/oxdata/lookup/cache.py
index f821f77..becc6e0 100644
--- a/oxdata/lookup/cache.py
+++ b/oxdata/lookup/cache.py
@@ -43,7 +43,7 @@ def get_ids():
                 print('missing impawards', ox.web.impawards.get_url(id))
 
     for id in ox.web.criterion.get_ids():
-        if id in ('626', '835'):
+        if id in ('626', '835', '1079', '28907'):
             continue
         if models.MovieId.objects.all().filter(criterion_id=id).count() == 0:
             print('criterion', id)

From 844d25008be2b3f451e892dd2f28dc37b5f2b8f3 Mon Sep 17 00:00:00 2001
From: j <j@mailb.org>
Date: Sun, 3 Jun 2018 13:59:35 +0200
Subject: [PATCH 3/4] int or str

---
 oxdata/movie/views.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/oxdata/movie/views.py b/oxdata/movie/views.py
index c6a15f2..4bbbc52 100644
--- a/oxdata/movie/views.py
+++ b/oxdata/movie/views.py
@@ -28,6 +28,8 @@ actions.register(getIds)
 def getData(request, data):
     response = json_response()
     id = data['id']
+    if isinstance(id, int):
+        id = str(id)
     if len(id) == 7:
         i, created = models.Imdb.objects.get_or_create(imdb=id)
         if created:

From 501fe8cd3e1bd33b29eac6ffce85b4404eb46b61 Mon Sep 17 00:00:00 2001
From: j <j@mailb.org>
Date: Sun, 3 Jun 2018 14:00:03 +0200
Subject: [PATCH 4/4] parse more info from list

---
 oxdata/movie/imdbids.py | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/oxdata/movie/imdbids.py b/oxdata/movie/imdbids.py
index 6a1984e..2cf36b0 100644
--- a/oxdata/movie/imdbids.py
+++ b/oxdata/movie/imdbids.py
@@ -116,6 +116,19 @@ def update_month(year, month, film_counts):
             print('%s: count %s, got ids %s' % (key, film_counts[key], r))
         save_film_counts(film_counts)
 
+def parse_cast(string):  # Parse a '|'-separated credits text into a {role: [names]} dict.
+    results = {}  # role key -> list of names; a role seen again in a later part overwrites the earlier one
+    for part in string.split('|'):  # presumably one "Label:\nname, \nname" section per part -- confirm against caller
+        cast = iter([t.strip() for t in part.split(':\n')])  # stripped pieces, consumed pairwise below
+        cast = dict(zip(cast, cast))  # zipping one iterator with itself pairs items (0,1), (2,3), ...; an unpaired last piece is dropped
+        for key in cast:
+            rkey = key.lower()
+            rkey = {  # map singular labels to plural keys; any other label is kept as-is (lowercased)
+                'director': 'directors',
+                'star': 'stars',
+            }.get(rkey, rkey)
+            results[rkey] = cast[key].split(', \n')  # only the value as a whole was stripped; individual names are not
+    return results  # dict mapping lowercased role key -> list of name strings
 
 def update_ids(year, month=None, day=None, sort=None):
     films = {}
@@ -151,7 +164,8 @@ def update_ids(year, month=None, day=None, sort=None):
             else:
                 print('no article on', '%s&page=%s' % (url, page-2))
                 break
-            for header in article.find_class('lister-item-header'):
+            for content in article.find_class('lister-item-content'):
+                header = content.find_class('lister-item-header')[0]
                 a = header.xpath('.//a')
                 if 'Episode:' in [
                     e.text_content()
@@ -176,11 +190,34 @@ def update_ids(year, month=None, day=None, sort=None):
                     print(n)
                     print(header.find_class('lister-item-year')[0].text_content())
                     raise
+
+                text = content.xpath(".//p[contains(@class, 'text-muted')]")
+                plot = text[1].text_content().strip()
+                plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
+                if plot == 'Add a Plot':
+                    plot = ''
+                genre = content.find_class('genre')
+                if genre:
+                    genre = genre[0].text_content().strip().split(', ')
+                else:
+                    genre = []
+                cast = content.xpath(".//p[contains(@class, '')]")
+                cast = [t for t in cast if t.attrib.get('class') == '']
+                if cast:
+                    cast = parse_cast(cast[0].text_content())
+
                 if id not in films:
                     films[id] = {
                         'title': title,
                         'year': y
                     }
+                    if plot:
+                        films[id]['plot'] = plot
+                    if genre:
+                        films[id]['genre'] = genre
+                    if cast:
+                        films[id].update(cast)
+
             #print(key, len(films), 'films')
             if n:
                 #print(n)