diff --git a/oxdata/movie/imdbids.py b/oxdata/movie/imdbids.py
index d8fe391..94b8453 100644
--- a/oxdata/movie/imdbids.py
+++ b/oxdata/movie/imdbids.py
@@ -110,8 +110,12 @@ def update_month(year, month, film_counts):
             if film_count != film_counts.get(key):
                 print_info(key, film_count, film_counts)
                 film_counts[key] = film_count
-                r = update_ids(year, month, day, expected=film_count)
-                save_film_counts(film_counts)
+                if film_count > MAX_PER_RANGE and film_count < 2*MAX_PER_RANGE:
+                    r = update_ids(year, month, day, sort='alpha', expected=film_count)
+                    save_film_counts(film_counts)
+                else:
+                    r = update_ids(year, month, day, expected=film_count)
+                    save_film_counts(film_counts)
         if days_total != month_total:
             print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total))
     else:
@@ -132,70 +136,7 @@ def parse_cast(string):
         results[rkey] = cast[key].split(', \n')
     return results
 
-def get_films(data):
-    films = []
-    doc = lxml.html.fromstring(data)
-    article = doc.find_class('article')
-    if article:
-        article = article[0]
-    else:
-        return films
-    for content in article.find_class('lister-item-content'):
-        header = content.find_class('lister-item-header')[0]
-        a = header.xpath('.//a')
-        if 'Episode:' in [
-            e.text_content()
-            for e in header.xpath(".//small")
-        ] and len(a) > 1:
-            title = a[0].text_content().strip() + ': '
-            a = a[1]
-        else:
-            title = ''
-            a = a[0]
-        id = re.compile('title/tt(\d+)').findall(a.attrib['href'])[0]
-        title += a.text_content().strip()
-        try:
-            y = header.find_class('lister-item-year')[0].text_content()
-            y = re.sub('\([^\d]+\)', '', y)
-            y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
-            if not y:
-                y = year
-            else:
-                y = int(y)
-        except:
-            print(n)
-            print(header.find_class('lister-item-year')[0].text_content())
-            raise
-
-        text = content.xpath(".//p[contains(@class, 'text-muted')]")
-        plot = text[1].text_content().strip()
-        plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
-        if plot == 'Add a Plot':
-            plot = ''
-        genre = content.find_class('genre')
-        if genre:
-            genre = genre[0].text_content().strip().split(', ')
-        else:
-            genre = []
-        cast = content.xpath(".//p[contains(@class, '')]")
-        cast = [t for t in cast if t.attrib.get('class') == '']
-        if cast:
-            cast = parse_cast(cast[0].text_content())
-
-        film = {
-            'title': title,
-            'year': y
-        }
-        if plot:
-            film['plot'] = plot
-        if genre:
-            film['genre'] = genre
-        if cast:
-            film.update(cast)
-        films.append((id, film))
-    return films
-
 def update_ids(year, month=None, day=None, sort=None, expected=None):
     films = {}
     if day is not None:
         url = get_day(year, month, day)
@@ -206,11 +147,16 @@ def update_ids(year, month=None, day=None, sort=None, expected=None):
     else:
         url = get_year(year)
         key = '%04d' % year
+    if sort == 'alpha':
+        urls = [
+            url.replace('sort=release_date,asc', 'sort=alpha,asc'),
+            url.replace('sort=release_date,asc', 'sort=alpha,desc'),
+        ]
+    else:
+        urls = [url]
+
-
-    urls = [url]
-
     if not expected:
         expected = get_film_count(year, month, day)
 
     for url in urls:
         data = read_url(url, timeout=TIMEOUT)
@@ -225,13 +171,73 @@ def update_ids(year, month=None, day=None, sort=None, expected=None):
             has_after = re.compile(after_link).findall(data)
             if has_next:
                 n = '%s&start=%s' % (url, start)
-            elif has_after:
+            elif sort != 'alpha' and start > MAX_PER_RANGE and has_after:
                 n = '%s%s' % (base_url, has_after[0])
             else:
                 n = False
-            for id, film in get_films(data):
+            doc = lxml.html.fromstring(data)
+            article = doc.find_class('article')
+            if article:
+                article = article[0]
+            else:
+                print('no article on', '%s&start=%s' % (url, start - 2*step))
+                ox.web.imdb.delete_url('%s&start=%s' % (url, start - 2*step))
+                break
+            for content in article.find_class('lister-item-content'):
+                header = content.find_class('lister-item-header')[0]
+                a = header.xpath('.//a')
+                if 'Episode:' in [
+                    e.text_content()
+                    for e in header.xpath(".//small")
+                ] and len(a) > 1:
+                    title = a[0].text_content().strip() + ': '
+                    a = a[1]
+                else:
+                    title = ''
+                    a = a[0]
+                id = re.compile('title/tt(\d+)').findall(a.attrib['href'])[0]
+                title += a.text_content().strip()
+                try:
+                    y = header.find_class('lister-item-year')[0].text_content()
+                    y = re.sub('\([^\d]+\)', '', y)
+                    y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
+                    if not y:
+                        y = year
+                    else:
+                        y = int(y)
+                except:
+                    print(n)
+                    print(header.find_class('lister-item-year')[0].text_content())
+                    raise
+
+                text = content.xpath(".//p[contains(@class, 'text-muted')]")
+                plot = text[1].text_content().strip()
+                plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
+                if plot == 'Add a Plot':
+                    plot = ''
+                genre = content.find_class('genre')
+                if genre:
+                    genre = genre[0].text_content().strip().split(', ')
+                else:
+                    genre = []
+                cast = content.xpath(".//p[contains(@class, '')]")
+                cast = [t for t in cast if t.attrib.get('class') == '']
+                if cast:
+                    cast = parse_cast(cast[0].text_content())
+
                 if id not in films:
-                    films[id] = film
+                    films[id] = {
+                        'title': title,
+                        'year': y
+                    }
+                    if plot:
+                        films[id]['plot'] = plot
+                    if genre:
+                        films[id]['genre'] = genre
+                    if cast:
+                        films[id].update(cast)
+            if expected and len(films) == expected and sort == 'alpha':
+                n = False
             debug('%s: %s of %s films - next: %s' % (key, len(films), expected, n))
             if n:
                 data = read_url(n, timeout=TIMEOUT)
diff --git a/oxdata/movie/management/commands/cache_imdb_ids.py b/oxdata/movie/management/commands/cache_imdb_ids.py
index bbf1edb..f96a73e 100644
--- a/oxdata/movie/management/commands/cache_imdb_ids.py
+++ b/oxdata/movie/management/commands/cache_imdb_ids.py
@@ -3,7 +3,6 @@
 from django.core.management.base import BaseCommand
 
 import movie.models
-import movie.imdbids
 
 class Command(BaseCommand):
     """
@@ -12,12 +11,6 @@ class Command(BaseCommand):
     help = 'load ids from sites that dont support search.'
     args = ''
 
-    def add_arguments(self, parser):
-        parser.add_argument('--debug', action='store_true', dest='debug', default=False, help='print debug info')
-
     def handle(self, **options):
         timeout = 30*24*60*60
-        if options.get('debug'):
-            movie.imdbids.DEBUG = True
-
         movie.models.get_new_ids(timeout)
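
Note, not part of the patch: the sort='alpha' branch added to update_ids() appears to exist because a single release-date-sorted listing stops returning results once MAX_PER_RANGE entries have been paged through, so a day holding more than MAX_PER_RANGE but fewer than 2*MAX_PER_RANGE films is fetched twice, sorted alphabetically ascending and descending, and the two passes are merged by imdb id. The sketch below only illustrates that coverage argument; the limit value, item list, and helper names are made up for illustration and are not oxdata code.

# Illustrative sketch: two truncated, oppositely sorted passes cover a range
# that holds fewer than 2*limit entries, since the passes overlap in the middle.
MAX_PER_RANGE = 200  # assumed value, stands in for the real pagination cap

def truncated_listing(items, limit):
    # stand-in for one paginated listing that silently stops after `limit` entries
    return items[:limit]

def fetch_range(items, limit):
    # mirrors the sort=alpha,asc / sort=alpha,desc URL pair added in update_ids()
    asc = truncated_listing(sorted(items), limit)
    desc = truncated_listing(sorted(items, reverse=True), limit)
    return set(asc) | set(desc)

if __name__ == '__main__':
    titles = ['tt%07d' % i for i in range(350)]  # MAX_PER_RANGE < 350 < 2*MAX_PER_RANGE
    found = fetch_range(titles, MAX_PER_RANGE)
    assert found == set(titles)
    print('recovered %d of %d titles' % (len(found), len(titles)))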