This commit is contained in:
j 2019-08-05 13:18:39 +02:00
parent ee9e430ef8
commit 785550c753

View file

@ -110,12 +110,8 @@ def update_month(year, month, film_counts):
if film_count != film_counts.get(key): if film_count != film_counts.get(key):
print_info(key, film_count, film_counts) print_info(key, film_count, film_counts)
film_counts[key] = film_count film_counts[key] = film_count
if film_count > MAX_PER_RANGE and film_count < 2*MAX_PER_RANGE: r = update_ids(year, month, day, expected=film_count)
r = update_ids(year, month, day, sort='alpha', expected=film_count) save_film_counts(film_counts)
save_film_counts(film_counts)
else:
r = update_ids(year, month, day, expected=film_count)
save_film_counts(film_counts)
if days_total != month_total: if days_total != month_total:
print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total)) print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total))
else: else:
@ -136,6 +132,69 @@ def parse_cast(string):
results[rkey] = cast[key].split(', \n') results[rkey] = cast[key].split(', \n')
return results return results
def get_films(data, year=None):
    """Parse an IMDb list/search-result page into (imdb_id, film) tuples.

    Args:
        data: HTML source of a page using IMDb's 'lister-item' markup.
        year: optional fallback year, used when an item's year field is
            empty (e.g. unreleased titles). Defaults to None.

    Returns:
        list of (imdb_id, film) tuples, where imdb_id is the numeric part
        of the title URL as a string and film is a dict with 'title',
        'year' and, when present, 'plot', 'genre' and the cast fields
        produced by parse_cast().
    """
    films = []
    doc = lxml.html.fromstring(data)
    article = doc.find_class('article')
    # No 'article' container means no result list on this page.
    if not article:
        return films
    article = article[0]
    title_pattern = re.compile(r'title/tt(\d+)')
    for content in article.find_class('lister-item-content'):
        header = content.find_class('lister-item-header')[0]
        a = header.xpath('.//a')
        # TV episodes render as "<series>: <episode>" with two links;
        # keep the series name as a prefix and use the episode link.
        if 'Episode:' in [
            e.text_content()
            for e in header.xpath(".//small")
        ] and len(a) > 1:
            title = a[0].text_content().strip() + ': '
            a = a[1]
        else:
            title = ''
            a = a[0]
        imdb_id = title_pattern.findall(a.attrib['href'])[0]
        title += a.text_content().strip()
        try:
            y = header.find_class('lister-item-year')[0].text_content()
            # Drop parenthesized qualifiers like "(TV Movie)".
            y = re.sub(r'\([^\d]+\)', '', y)
            # Reduce "(2011)", "(2011–2013)", "(I) (2011)" etc. to the
            # first year. The '–' is the en-dash IMDb uses in ranges.
            y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
            if not y:
                # Empty year field: fall back to the caller-supplied year.
                y = year
            else:
                y = int(y)
        except Exception:
            # Surface the raw year text before re-raising so bad markup
            # can be diagnosed from the logs.
            print(header.find_class('lister-item-year')[0].text_content())
            raise
        text = content.xpath(".//p[contains(@class, 'text-muted')]")
        plot = text[1].text_content().strip()
        plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
        if plot == 'Add a Plot':
            plot = ''
        genre = content.find_class('genre')
        if genre:
            genre = genre[0].text_content().strip().split(', ')
        else:
            genre = []
        # Cast/director info lives in <p> elements whose class attribute
        # is literally empty, so filter on the exact empty string.
        cast = content.xpath(".//p[contains(@class, '')]")
        cast = [t for t in cast if t.attrib.get('class') == '']
        if cast:
            cast = parse_cast(cast[0].text_content())
        film = {
            'title': title,
            'year': y
        }
        if plot:
            film['plot'] = plot
        if genre:
            film['genre'] = genre
        if cast:
            film.update(cast)
        films.append((imdb_id, film))
    return films
def update_ids(year, month=None, day=None, sort=None, expected=None): def update_ids(year, month=None, day=None, sort=None, expected=None):
films = {} films = {}
if day is not None: if day is not None:
@ -147,13 +206,8 @@ def update_ids(year, month=None, day=None, sort=None, expected=None):
else: else:
url = get_year(year) url = get_year(year)
key = '%04d' % year key = '%04d' % year
if sort == 'alpha':
urls = [ urls = [url]
url.replace('sort=release_date,asc', 'sort=alpha,asc'),
url.replace('sort=release_date,asc', 'sort=alpha,desc'),
]
else:
urls = [url]
if not expected: if not expected:
expected = get_film_count(year, month, day) expected = get_film_count(year, month, day)
@ -171,73 +225,13 @@ def update_ids(year, month=None, day=None, sort=None, expected=None):
has_after = re.compile(after_link).findall(data) has_after = re.compile(after_link).findall(data)
if has_next: if has_next:
n = '%s&start=%s' % (url, start) n = '%s&start=%s' % (url, start)
elif sort != 'alpha' and has_after: elif has_after:
n = '%s%s' % (base_url, has_after[0]) n = '%s%s' % (base_url, has_after[0])
else: else:
n = False n = False
doc = lxml.html.fromstring(data) for id, film in get_films(data):
article = doc.find_class('article')
if article:
article = article[0]
else:
print('no article on', '%s&start=%s' % (url, start - 2*step))
ox.web.imdb.delete_url('%s&start=%s' % (url, start - 2*step))
break
for content in article.find_class('lister-item-content'):
header = content.find_class('lister-item-header')[0]
a = header.xpath('.//a')
if 'Episode:' in [
e.text_content()
for e in header.xpath(".//small")
] and len(a) > 1:
title = a[0].text_content().strip() + ': '
a = a[1]
else:
title = ''
a = a[0]
id = re.compile('title/tt(\d+)').findall(a.attrib['href'])[0]
title += a.text_content().strip()
try:
y = header.find_class('lister-item-year')[0].text_content()
y = re.sub('\([^\d]+\)', '', y)
y = y.rsplit('(', 1)[-1].split(')')[0].split('')[0].split(' ')[0].strip()
if not y:
y = year
else:
y = int(y)
except:
print(n)
print(header.find_class('lister-item-year')[0].text_content())
raise
text = content.xpath(".//p[contains(@class, 'text-muted')]")
plot = text[1].text_content().strip()
plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
if plot == 'Add a Plot':
plot = ''
genre = content.find_class('genre')
if genre:
genre = genre[0].text_content().strip().split(', ')
else:
genre = []
cast = content.xpath(".//p[contains(@class, '')]")
cast = [t for t in cast if t.attrib.get('class') == '']
if cast:
cast = parse_cast(cast[0].text_content())
if id not in films: if id not in films:
films[id] = { films[id] = film
'title': title,
'year': y
}
if plot:
films[id]['plot'] = plot
if genre:
films[id]['genre'] = genre
if cast:
films[id].update(cast)
if expected and len(films) == expected and sort == 'alpha':
n = False
debug('%s: %s of %s films - next: %s' % (key, len(films), expected, n)) debug('%s: %s of %s films - next: %s' % (key, len(films), expected, n))
if n: if n:
data = read_url(n, timeout=TIMEOUT) data = read_url(n, timeout=TIMEOUT)