one better

This commit is contained in:
j 2018-05-03 12:11:51 +02:00
parent 02d415b5fa
commit fe06a8c664

View file

@@ -76,7 +76,9 @@ def update_year(year, film_counts):
film_counts[key] = film_count
update_month(year, month, film_counts)
else:
update_ids(year)
r = update_ids(year)
if r != film_counts[key]:
print('%s: count %s, got ids %s' % (key, film_counts[key], r))
save_film_counts(film_counts)
def update_month(year, month, film_counts):
@@ -92,19 +94,28 @@ def update_month(year, month, film_counts):
if film_count != film_counts.get(key):
print_info(key, film_count, film_counts)
film_counts[key] = film_count
if film_count > MAX_PER_RANGE:
if film_count > MAX_PER_RANGE and film_count < 2*MAX_PER_RANGE:
r = update_ids(year, month, day, sort='alpha')
if r != film_counts[key]:
print('%s: count %s, got ids %s' % (key, film_counts[key], r))
save_film_counts(film_counts)
elif film_count > MAX_PER_RANGE:
print(key, '!!!to many per day')
else:
update_ids(year, month, day)
r = update_ids(year, month, day)
if r != film_counts[key]:
print('%s: count %s, got ids %s' % (key, film_counts[key], r))
save_film_counts(film_counts)
if days_total != month_total:
print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total))
else:
update_ids(year, month)
r = update_ids(year, month)
if r != film_counts[key]:
print('%s: count %s, got ids %s' % (key, film_counts[key], r))
save_film_counts(film_counts)
def update_ids(year, month=None, day=None):
def update_ids(year, month=None, day=None, sort=None):
films = {}
if day is not None:
url = get_day(year, month, day)
@@ -115,49 +126,58 @@ def update_ids(year, month=None, day=None):
else:
url = get_year(year)
key = '%04d' % year
data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
n = True
page = 2
while n:
n = re.compile('Next &#187;</a>', re.DOTALL).findall(data)
if n:
n = '%s&page=%s' % (url, page)
page += 1
doc = lxml.html.fromstring(data)
article = doc.find_class('article')
if article:
article = article[0]
else:
print('no article on', '%s&page=%s' % (url, page-2))
break
for header in article.find_class('lister-item-header'):
a = header.xpath('.//a')[0]
id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
title = a.text_content().strip()
try:
y = header.find_class('lister-item-year')[0].text_content()
y = re.sub('\([^\d]+\)', '', y)
y = y.rsplit('(', 1)[-1].split(')')[0].split('')[0].split(' ')[0].strip()
if not y:
y = year
else:
y = int(y)
except:
print(n)
print(header.find_class('lister-item-year')[0].text_content())
raise
if id not in films:
films[id] = {
'title': title,
'year': y
}
#print(key, len(films), 'films')
if n:
#print(n)
data = ox.web.imdb.read_url(n, unicode=True, timeout=TIMEOUT)
if sort == 'alpha':
urls = [
url.replace('sort=release_date,asc', 'sort=alpha,asc'),
url.replace('sort=release_date,asc', 'sort=alpha,desc'),
]
else:
urls = [url]
for url in urls:
data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
n = True
page = 2
while n:
n = re.compile('Next &#187;</a>', re.DOTALL).findall(data)
if n:
n = '%s&page=%s' % (url, page)
page += 1
doc = lxml.html.fromstring(data)
article = doc.find_class('article')
if article:
article = article[0]
else:
print('no article on', '%s&page=%s' % (url, page-2))
break
for header in article.find_class('lister-item-header'):
a = header.xpath('.//a')[0]
id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
title = a.text_content().strip()
try:
y = header.find_class('lister-item-year')[0].text_content()
y = re.sub('\([^\d]+\)', '', y)
y = y.rsplit('(', 1)[-1].split(')')[0].split('')[0].split(' ')[0].strip()
if not y:
y = year
else:
y = int(y)
except:
print(n)
print(header.find_class('lister-item-year')[0].text_content())
raise
if id not in films:
films[id] = {
'title': title,
'year': y
}
#print(key, len(films), 'films')
if n:
#print(n)
data = ox.web.imdb.read_url(n, unicode=True, timeout=TIMEOUT)
path = get_path('ids/%s.json' % key)
with open(path, 'w') as fd:
json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True)
return len(films)
def save_film_counts(film_counts):
with open(get_path('film_counts.json'), 'w') as fd: