refactor

2019-08-05 13:18:39 +02:00 · 2019-08-05 13:18:39 +02:00 · 785550c753
commit 785550c753
parent ee9e430ef8
1 changed files with 70 additions and 76 deletions
--- a/oxdata/movie/imdbids.py
+++ b/oxdata/movie/imdbids.py
@ -110,12 +110,8 @@ def update_month(year, month, film_counts):
            if film_count != film_counts.get(key):
                print_info(key, film_count, film_counts)
                film_counts[key] = film_count
-                if film_count > MAX_PER_RANGE and film_count < 2*MAX_PER_RANGE:
-                    r = update_ids(year, month, day, sort='alpha', expected=film_count)
-                    save_film_counts(film_counts)
-                else:
-                    r = update_ids(year, month, day, expected=film_count)
-                    save_film_counts(film_counts)
+                r = update_ids(year, month, day, expected=film_count)
+                save_film_counts(film_counts)
        if days_total != month_total:
            print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total))
    else:
@ -136,6 +132,69 @@ def parse_cast(string):
            results[rkey] = cast[key].split(', \n')
    return results

+def get_films(data, year=None):
+    """Parse an IMDb listing page into a list of (id, film) tuples.
+
+    data: HTML of the page. year: fallback for entries whose year text
+    parses to an empty string (default None). Each film dict has 'title' and
+    'year', plus 'plot', 'genre' and parse_cast() keys when non-empty.
+    Returns [] when the page has no element with class 'article'.
+    """
+    films = []
+    doc = lxml.html.fromstring(data)
+    article = doc.find_class('article')
+    if not article:
+        return films
+    for content in article[0].find_class('lister-item-content'):
+        header = content.find_class('lister-item-header')[0]
+        a = header.xpath('.//a')
+        small = [e.text_content() for e in header.xpath('.//small')]
+        if 'Episode:' in small and len(a) > 1:
+            # first link text is the title prefix, id/title come from the second
+            title = a[0].text_content().strip() + ': '
+            a = a[1]
+        else:
+            title = ''
+            a = a[0]
+        imdb_id = re.compile(r'title/tt(\d+)').findall(a.attrib['href'])[0]
+        title += a.text_content().strip()
+        year_text = header.find_class('lister-item-year')[0].text_content()
+        try:
+            # drop digit-free "(...)" groups, then keep the leading token of
+            # the last parenthesized group, before any '–'
+            y = re.sub(r'\([^\d]+\)', '', year_text)
+            y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
+            y = int(y) if y else year
+        except ValueError:
+            print(year_text)
+            raise
+
+        text = content.xpath(".//p[contains(@class, 'text-muted')]")
+        plot = text[1].text_content().strip()
+        plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
+        if plot == 'Add a Plot':
+            plot = ''
+        genre = content.find_class('genre')
+        genre = genre[0].text_content().strip().split(', ') if genre else []
+        # parse_cast() gets the first <p> whose class attribute is empty
+        cast = content.xpath(".//p[contains(@class, '')]")
+        cast = [t for t in cast if t.attrib.get('class') == '']
+        if cast:
+            cast = parse_cast(cast[0].text_content())
+
+        film = {
+            'title': title,
+            'year': y
+        }
+        if plot:
+            film['plot'] = plot
+        if genre:
+            film['genre'] = genre
+        if cast:
+            film.update(cast)
+        films.append((imdb_id, film))
+    return films
+
 def update_ids(year, month=None, day=None, sort=None, expected=None):
    films = {}
    if day is not None:
@ -147,13 +206,8 @@ def update_ids(year, month=None, day=None, sort=None, expected=None):
    else:
        url = get_year(year)
        key = '%04d' % year
-    if sort == 'alpha':
-        urls = [
-            url.replace('sort=release_date,asc', 'sort=alpha,asc'),
-            url.replace('sort=release_date,asc', 'sort=alpha,desc'),
-        ]
-    else:
-        urls = [url]
+
+    urls = [url]

    if not expected:
        expected = get_film_count(year, month, day)
@ -171,73 +225,13 @@ def update_ids(year, month=None, day=None, sort=None, expected=None):
            has_after = re.compile(after_link).findall(data)
            if has_next:
                n = '%s&start=%s' % (url, start)
-            elif sort != 'alpha' and has_after:
+            elif has_after:
                n = '%s%s' % (base_url, has_after[0])
            else:
                n = False
-            doc = lxml.html.fromstring(data)
-            article = doc.find_class('article')
-            if article:
-                article = article[0]
-            else:
-                print('no article on', '%s&start=%s' % (url, start - 2*step))
-                ox.web.imdb.delete_url('%s&start=%s' % (url, start - 2*step))
-                break
-            for content in article.find_class('lister-item-content'):
-                header = content.find_class('lister-item-header')[0]
-                a = header.xpath('.//a')
-                if 'Episode:' in [
-                    e.text_content()
-                    for e in header.xpath(".//small")
-                ] and len(a) > 1:
-                    title = a[0].text_content().strip() + ': '
-                    a = a[1]
-                else:
-                    title = ''
-                    a = a[0]
-                id = re.compile('title/tt(\d+)').findall(a.attrib['href'])[0]
-                title += a.text_content().strip()
-                try:
-                    y = header.find_class('lister-item-year')[0].text_content()
-                    y = re.sub('\([^\d]+\)', '', y)
-                    y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
-                    if not y:
-                        y = year
-                    else:
-                        y = int(y)
-                except:
-                    print(n)
-                    print(header.find_class('lister-item-year')[0].text_content())
-                    raise
-
-                text = content.xpath(".//p[contains(@class, 'text-muted')]")
-                plot = text[1].text_content().strip()
-                plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
-                if plot == 'Add a Plot':
-                    plot = ''
-                genre = content.find_class('genre')
-                if genre:
-                    genre = genre[0].text_content().strip().split(', ')
-                else:
-                    genre = []
-                cast = content.xpath(".//p[contains(@class, '')]")
-                cast = [t for t in cast if t.attrib.get('class') == '']
-                if cast:
-                    cast = parse_cast(cast[0].text_content())
-
+            for id, film in get_films(data):
                if id not in films:
-                    films[id] = {
-                        'title': title,
-                        'year': y
-                    }
-                    if plot:
-                        films[id]['plot'] = plot
-                    if genre:
-                        films[id]['genre'] = genre
-                    if cast:
-                        films[id].update(cast)
-            if expected and len(films) == expected and sort == 'alpha':
-                n = False
+                    films[id] = film
            debug('%s: %s of %s films - next: %s' % (key, len(films), expected, n))
            if n:
                data = read_url(n, timeout=TIMEOUT)