From ca50d091a8ad4e0927ffb6f477fc8ef4a3d488f3 Mon Sep 17 00:00:00 2001 From: j Date: Thu, 3 May 2018 15:35:02 +0200 Subject: [PATCH 1/4] get episode ids --- oxdata/movie/imdbids.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/oxdata/movie/imdbids.py b/oxdata/movie/imdbids.py index b102742..6a1984e 100644 --- a/oxdata/movie/imdbids.py +++ b/oxdata/movie/imdbids.py @@ -46,6 +46,8 @@ def get_film_count(year, month=None, day=None): url = get_year(year) data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT) total = re.compile('50.*?of (.*?) titles', re.DOTALL).findall(data) + if not total: + total = re.compile(' ([\d+,]+) titles\n', re.DOTALL).findall(data) if total: return int(total[0].replace(',', '')) print('no movies', url) @@ -150,9 +152,18 @@ def update_ids(year, month=None, day=None, sort=None): print('no article on', '%s&page=%s' % (url, page-2)) break for header in article.find_class('lister-item-header'): - a = header.xpath('.//a')[0] + a = header.xpath('.//a') + if 'Episode:' in [ + e.text_content() + for e in header.xpath(".//small") + ] and len(a) > 1: + title = a[0].text_content().strip() + ': ' + a = a[1] + else: + title = '' + a = a[0] id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0] - title = a.text_content().strip() + title += a.text_content().strip() try: y = header.find_class('lister-item-year')[0].text_content() y = re.sub('\([^\d]+\)', '', y) From e22d5c5ad0eae0ec83d20b374ed27dd8d5f9cbcd Mon Sep 17 00:00:00 2001 From: j Date: Sun, 3 Jun 2018 13:58:53 +0200 Subject: [PATCH 2/4] ignore --- oxdata/lookup/cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oxdata/lookup/cache.py b/oxdata/lookup/cache.py index f821f77..becc6e0 100644 --- a/oxdata/lookup/cache.py +++ b/oxdata/lookup/cache.py @@ -43,7 +43,7 @@ def get_ids(): print('missing impawards', ox.web.impawards.get_url(id)) for id in ox.web.criterion.get_ids(): - if id in ('626', '835'): + if id in ('626', '835', '1079', '28907'): continue if models.MovieId.objects.all().filter(criterion_id=id).count() == 0: print('criterion', id) From 844d25008be2b3f451e892dd2f28dc37b5f2b8f3 Mon Sep 17 00:00:00 2001 From: j Date: Sun, 3 Jun 2018 13:59:35 +0200 Subject: [PATCH 3/4] int or str --- oxdata/movie/views.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/oxdata/movie/views.py b/oxdata/movie/views.py index c6a15f2..4bbbc52 100644 --- a/oxdata/movie/views.py +++ b/oxdata/movie/views.py @@ -28,6 +28,8 @@ actions.register(getIds) def getData(request, data): response = json_response() id = data['id'] + if isinstance(id, int): + id = str(id) if len(id) == 7: i, created = models.Imdb.objects.get_or_create(imdb=id) if created: From 501fe8cd3e1bd33b29eac6ffce85b4404eb46b61 Mon Sep 17 00:00:00 2001 From: j Date: Sun, 3 Jun 2018 14:00:03 +0200 Subject: [PATCH 4/4] parse more info from list --- oxdata/movie/imdbids.py | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/oxdata/movie/imdbids.py b/oxdata/movie/imdbids.py index 6a1984e..2cf36b0 100644 --- a/oxdata/movie/imdbids.py +++ b/oxdata/movie/imdbids.py @@ -116,6 +116,19 @@ def update_month(year, month, film_counts): print('%s: count %s, got ids %s' % (key, film_counts[key], r)) save_film_counts(film_counts) +def parse_cast(string): + results = {} + for part in string.split('|'): + cast = iter([t.strip() for t in part.split(':\n')]) + cast = dict(zip(cast, cast)) + for key in cast: + rkey = key.lower() + rkey = { + 'director': 'directors', + 'star': 'stars', + }.get(rkey, rkey) + results[rkey] = cast[key].split(', \n') + return results def update_ids(year, month=None, day=None, sort=None): films = {} @@ -151,7 +164,8 @@ def update_ids(year, month=None, day=None, sort=None): else: print('no article on', '%s&page=%s' % (url, page-2)) break - for header in article.find_class('lister-item-header'): + for content in article.find_class('lister-item-content'): + header = content.find_class('lister-item-header')[0] a = header.xpath('.//a') if 'Episode:' in [ e.text_content() @@ -176,11 +190,34 @@ def update_ids(year, month=None, day=None, sort=None): print(n) print(header.find_class('lister-item-year')[0].text_content()) raise + + text = content.xpath(".//p[contains(@class, 'text-muted')]") + plot = text[1].text_content().strip() + plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip() + if plot == 'Add a Plot': + plot = '' + genre = content.find_class('genre') + if genre: + genre = genre[0].text_content().strip().split(', ') + else: + genre = [] + cast = content.xpath(".//p[contains(@class, '')]") + cast = [t for t in cast if t.attrib.get('class') == ''] + if cast: + cast = parse_cast(cast[0].text_content()) + if id not in films: films[id] = { 'title': title, 'year': y } + if plot: + films[id]['plot'] = plot + if genre: + films[id]['genre'] = genre + if cast: + films[id].update(cast) + #print(key, len(films), 'films') if n: #print(n)