def get_films(data, year=None):
    """Extract the films listed on one result page.

    The page is expected to contain an element with class ``article``
    holding one ``lister-item-content`` element per film (presumably an
    IMDb listing page -- confirm against the caller).

    Args:
        data: HTML of the page, as accepted by ``lxml.html.fromstring``.
            Empty input yields an empty list.
        year: Fallback year used for entries whose header carries no
            parsable year text. Defaults to ``None``.

    Returns:
        A list of ``(id, film)`` tuples in page order. ``id`` is the digit
        string following ``title/tt`` in the entry's link. ``film`` is a dict
        with ``'title'`` and ``'year'``, plus ``'plot'`` and ``'genre'`` when
        present, and whatever keys ``parse_cast`` returns for the entry.

    Raises:
        ValueError: if a non-empty year text cannot be converted to ``int``.
            The entry's id, title and raw year text are printed first.
    """
    films = []
    if not data:
        # lxml refuses to parse an empty document; an empty page has no films.
        return films

    doc = lxml.html.fromstring(data)
    article = doc.find_class('article')
    if not article:
        return films
    article = article[0]

    # Compiled once, outside the per-entry loop.
    id_pattern = re.compile(r'title/tt(\d+)')
    non_numeric_parens = re.compile(r'\([^\d]+\)')

    for content in article.find_class('lister-item-content'):
        header = content.find_class('lister-item-header')[0]
        links = header.xpath('.//a')
        small_texts = [e.text_content() for e in header.xpath('.//small')]
        if 'Episode:' in small_texts and len(links) > 1:
            # Episode entries carry two links: the first becomes a title
            # prefix, the second provides the id and the rest of the title.
            title = links[0].text_content().strip() + ': '
            link = links[1]
        else:
            title = ''
            link = links[0]
        film_id = id_pattern.findall(link.attrib['href'])[0]
        title += link.text_content().strip()

        year_nodes = header.find_class('lister-item-year')
        year_text = year_nodes[0].text_content() if year_nodes else ''
        # Drop parenthesised groups without digits, then keep the leading
        # token of the last parenthesised group (text before any en dash or
        # space).
        y = non_numeric_parens.sub('', year_text)
        y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
        if y:
            try:
                y = int(y)
            except ValueError:
                # Show which entry failed before propagating the error.
                print(film_id, title)
                print(year_text)
                raise
        else:
            y = year

        muted = content.xpath(".//p[contains(@class, 'text-muted')]")
        # The second muted paragraph holds the plot; tolerate its absence.
        plot = muted[1].text_content().strip() if len(muted) > 1 else ''
        plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
        if plot == 'Add a Plot':
            plot = ''

        genre = content.find_class('genre')
        if genre:
            genre = genre[0].text_content().strip().split(', ')
        else:
            genre = []

        # The cast paragraph is the one whose class attribute is present
        # but empty.
        cast = content.xpath(".//p[contains(@class, '')]")
        cast = [t for t in cast if t.attrib.get('class') == '']
        if cast:
            cast = parse_cast(cast[0].text_content())

        film = {
            'title': title,
            'year': y
        }
        if plot:
            film['plot'] = plot
        if genre:
            film['genre'] = genre
        if cast:
            film.update(cast)
        films.append((film_id, film))
    return films
a.text_content().strip() - try: - y = header.find_class('lister-item-year')[0].text_content() - y = re.sub('\([^\d]+\)', '', y) - y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip() - if not y: - y = year - else: - y = int(y) - except: - print(n) - print(header.find_class('lister-item-year')[0].text_content()) - raise - - text = content.xpath(".//p[contains(@class, 'text-muted')]") - plot = text[1].text_content().strip() - plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip() - if plot == 'Add a Plot': - plot = '' - genre = content.find_class('genre') - if genre: - genre = genre[0].text_content().strip().split(', ') - else: - genre = [] - cast = content.xpath(".//p[contains(@class, '')]") - cast = [t for t in cast if t.attrib.get('class') == ''] - if cast: - cast = parse_cast(cast[0].text_content()) - + for id, film in get_films(data): if id not in films: - films[id] = { - 'title': title, - 'year': y - } - if plot: - films[id]['plot'] = plot - if genre: - films[id]['genre'] = genre - if cast: - films[id].update(cast) - if expected and len(films) == expected and sort == 'alpha': - n = False + films[id] = film debug('%s: %s of %s films - next: %s' % (key, len(films), expected, n)) if n: data = read_url(n, timeout=TIMEOUT)