From fe06a8c6645396f6c6149b896f9509e3c556ef7d Mon Sep 17 00:00:00 2001
From: j <j@mailb.org>
Date: Thu, 3 May 2018 12:11:51 +0200
Subject: [PATCH] imdbids: fetch over-limit days via alpha asc/desc sort, report id count mismatches

---
 oxdata/movie/imdbids.py | 110 ++++++++++++++++++++++++----------------
 1 file changed, 65 insertions(+), 45 deletions(-)
diff --git a/oxdata/movie/imdbids.py b/oxdata/movie/imdbids.py
index 671ea41..b102742 100644
--- a/oxdata/movie/imdbids.py
+++ b/oxdata/movie/imdbids.py
@@ -76,7 +76,9 @@ def update_year(year, film_counts):
                 film_counts[key] = film_count
                 update_month(year, month, film_counts)
     else:
-        update_ids(year)
+        r = update_ids(year)
+        if r != film_counts[key]:
+            print('%s: count %s, got ids %s' % (key, film_counts[key], r))
         save_film_counts(film_counts)
 
 def update_month(year, month, film_counts):
@@ -92,19 +94,28 @@ def update_month(year, month, film_counts):
             if film_count != film_counts.get(key):
                 print_info(key, film_count, film_counts)
                 film_counts[key] = film_count
-                if film_count > MAX_PER_RANGE:
+                if film_count > MAX_PER_RANGE and film_count < 2*MAX_PER_RANGE:
+                    r = update_ids(year, month, day, sort='alpha')
+                    if r != film_counts[key]:
+                        print('%s: count %s, got ids %s' % (key, film_counts[key], r))
+                    save_film_counts(film_counts)
+                elif film_count > MAX_PER_RANGE:
                     print(key, '!!!to many per day')
                 else:
-                    update_ids(year, month, day)
+                    r = update_ids(year, month, day)
+                    if r != film_counts[key]:
+                        print('%s: count %s, got ids %s' % (key, film_counts[key], r))
                     save_film_counts(film_counts)
         if days_total != month_total:
             print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total))
     else:
-        update_ids(year, month)
+        r = update_ids(year, month)
+        if r != film_counts[key]:
+            print('%s: count %s, got ids %s' % (key, film_counts[key], r))
         save_film_counts(film_counts)
 
 
-def update_ids(year, month=None, day=None):
+def update_ids(year, month=None, day=None, sort=None):
     films = {}
     if day is not None:
         url = get_day(year, month, day)
@@ -115,49 +126,58 @@ def update_ids(year, month=None, day=None):
     else:
         url = get_year(year)
         key = '%04d' % year
-    data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
-    n = True
-    page = 2
-    while n:
-        n = re.compile('Next &#187;</a>', re.DOTALL).findall(data)
-        if n:
-            n = '%s&page=%s' % (url, page)
-            page += 1
-        doc = lxml.html.fromstring(data)
-        article = doc.find_class('article')
-        if article:
-            article = article[0]
-        else:
-            print('no article on', '%s&page=%s' % (url, page-2))
-            break
-        for header in article.find_class('lister-item-header'):
-            a = header.xpath('.//a')[0]
-            id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
-            title = a.text_content().strip()
-            try:
-                y = header.find_class('lister-item-year')[0].text_content()
-                y = re.sub('\([^\d]+\)', '', y)
-                y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
-                if not y:
-                    y = year
-                else:
-                    y = int(y)
-            except:
-                print(n)
-                print(header.find_class('lister-item-year')[0].text_content())
-                raise
-            if id not in films:
-                films[id] = {
-                    'title': title,
-                    'year': y
-                }
-        #print(key, len(films), 'films')
-        if n:
-            #print(n)
-            data = ox.web.imdb.read_url(n, unicode=True, timeout=TIMEOUT)
+    if sort == 'alpha':
+        urls = [
+            url.replace('sort=release_date,asc', 'sort=alpha,asc'),
+            url.replace('sort=release_date,asc', 'sort=alpha,desc'),
+        ]
+    else:
+        urls = [url]
+    for url in urls:
+        data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
+        n = True
+        page = 2
+        while n:
+            n = re.compile('Next &#187;</a>', re.DOTALL).findall(data)
+            if n:
+                n = '%s&page=%s' % (url, page)
+                page += 1
+            doc = lxml.html.fromstring(data)
+            article = doc.find_class('article')
+            if article:
+                article = article[0]
+            else:
+                print('no article on', '%s&page=%s' % (url, page-2))
+                break
+            for header in article.find_class('lister-item-header'):
+                a = header.xpath('.//a')[0]
+                id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
+                title = a.text_content().strip()
+                try:
+                    y = header.find_class('lister-item-year')[0].text_content()
+                    y = re.sub('\([^\d]+\)', '', y)
+                    y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
+                    if not y:
+                        y = year
+                    else:
+                        y = int(y)
+                except:
+                    print(n)
+                    print(header.find_class('lister-item-year')[0].text_content())
+                    raise
+                if id not in films:
+                    films[id] = {
+                        'title': title,
+                        'year': y
+                    }
+            #print(key, len(films), 'films')
+            if n:
+                #print(n)
+                data = ox.web.imdb.read_url(n, unicode=True, timeout=TIMEOUT)
     path = get_path('ids/%s.json' % key)
     with open(path, 'w') as fd:
         json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True)
+    return len(films)
 
 def save_film_counts(film_counts):
     with open(get_path('film_counts.json'), 'w') as fd: