diff --git a/oxdata/lookup/cache.py b/oxdata/lookup/cache.py index becc6e0..f821f77 100644 --- a/oxdata/lookup/cache.py +++ b/oxdata/lookup/cache.py @@ -43,7 +43,7 @@ def get_ids(): print('missing impawards', ox.web.impawards.get_url(id)) for id in ox.web.criterion.get_ids(): - if id in ('626', '835', '1079', '28907'): + if id in ('626', '835'): continue if models.MovieId.objects.all().filter(criterion_id=id).count() == 0: print('criterion', id) diff --git a/oxdata/movie/imdbids.py b/oxdata/movie/imdbids.py index 2cf36b0..b102742 100644 --- a/oxdata/movie/imdbids.py +++ b/oxdata/movie/imdbids.py @@ -46,8 +46,6 @@ def get_film_count(year, month=None, day=None): url = get_year(year) data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT) total = re.compile('50.*?of (.*?) titles', re.DOTALL).findall(data) - if not total: - total = re.compile(' ([\d+,]+) titles\n', re.DOTALL).findall(data) if total: return int(total[0].replace(',', '')) print('no movies', url) @@ -116,19 +114,6 @@ def update_month(year, month, film_counts): print('%s: count %s, got ids %s' % (key, film_counts[key], r)) save_film_counts(film_counts) -def parse_cast(string): - results = {} - for part in string.split('|'): - cast = iter([t.strip() for t in part.split(':\n')]) - cast = dict(zip(cast, cast)) - for key in cast: - rkey = key.lower() - rkey = { - 'director': 'directors', - 'star': 'stars', - }.get(rkey, rkey) - results[rkey] = cast[key].split(', \n') - return results def update_ids(year, month=None, day=None, sort=None): films = {} @@ -164,20 +149,10 @@ def update_ids(year, month=None, day=None, sort=None): else: print('no article on', '%s&page=%s' % (url, page-2)) break - for content in article.find_class('lister-item-content'): - header = content.find_class('lister-item-header')[0] - a = header.xpath('.//a') - if 'Episode:' in [ - e.text_content() - for e in header.xpath(".//small") - ] and len(a) > 1: - title = a[0].text_content().strip() + ': ' - a = a[1] - else: - title = '' - a = a[0] + for header in article.find_class('lister-item-header'): + a = header.xpath('.//a')[0] id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0] - title += a.text_content().strip() + title = a.text_content().strip() try: y = header.find_class('lister-item-year')[0].text_content() y = re.sub('\([^\d]+\)', '', y) @@ -190,34 +165,11 @@ def update_ids(year, month=None, day=None, sort=None): print(n) print(header.find_class('lister-item-year')[0].text_content()) raise - - text = content.xpath(".//p[contains(@class, 'text-muted')]") - plot = text[1].text_content().strip() - plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip() - if plot == 'Add a Plot': - plot = '' - genre = content.find_class('genre') - if genre: - genre = genre[0].text_content().strip().split(', ') - else: - genre = [] - cast = content.xpath(".//p[contains(@class, '')]") - cast = [t for t in cast if t.attrib.get('class') == ''] - if cast: - cast = parse_cast(cast[0].text_content()) - if id not in films: films[id] = { 'title': title, 'year': y } - if plot: - films[id]['plot'] = plot - if genre: - films[id]['genre'] = genre - if cast: - films[id].update(cast) - #print(key, len(films), 'films') if n: #print(n) diff --git a/oxdata/movie/views.py b/oxdata/movie/views.py index 4bbbc52..c6a15f2 100644 --- a/oxdata/movie/views.py +++ b/oxdata/movie/views.py @@ -28,8 +28,6 @@ actions.register(getIds) def getData(request, data): response = json_response() id = data['id'] - if isinstance(id, int): - id = str(id) if len(id) == 7: i, created = models.Imdb.objects.get_or_create(imdb=id) if created: