From d21b0b675cea8e32d0c59d8284e5483a98be4c94 Mon Sep 17 00:00:00 2001 From: j Date: Mon, 9 Jul 2018 15:20:00 +0200 Subject: [PATCH 1/2] criterion is https now --- ox/web/criterion.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ox/web/criterion.py b/ox/web/criterion.py index d687b9b..6cef01e 100644 --- a/ox/web/criterion.py +++ b/ox/web/criterion.py @@ -14,7 +14,7 @@ def get_id(url): return url.split("/")[-1] def get_url(id): - return "http://www.criterion.com/films/%s" % id + return "https://www.criterion.com/films/%s" % id def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False): ''' @@ -67,7 +67,7 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False): data["posters"] = [result] else: html_ = read_url(result, unicode=True) - result = find_re(html_, '(.*?)' % id) + result = find_re(html_, '//www.criterion.com/films/%s.*?">(.*?)' % id) result = find_re(result, "src=\"(.*?)\"") if result: data["posters"] = [result.replace("_w100", "")] @@ -102,7 +102,7 @@ def get_ids(page=None): ids += results results = re.compile("boxsets/(.*?)\"").findall(html) for result in results: - html = read_url("http://www.criterion.com/boxsets/" + result, unicode=True) + html = read_url("https://www.criterion.com/boxsets/" + result, unicode=True) results = re.compile("films/(\d+)-").findall(html) ids += results return sorted(set(ids), key=int) From 7041d1b31610a6e9ce37273a83006b376f0f4b2a Mon Sep 17 00:00:00 2001 From: j Date: Mon, 9 Jul 2018 15:20:13 +0200 Subject: [PATCH 2/2] parse lyricist, singer --- ox/web/imdb.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index b28bf2b..57b2423 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -512,14 +512,23 @@ class Imdb(SiteParser): credits = [c for c in credits if c[1]] self['credits'] = [] + self['lyricist'] = [] + self['singer'] = [] for department, crew in credits: department = department.replace('(in alphabetical order)', '').strip() for c in crew: + name = c[0] + roles = c[1] self['credits'].append({ - 'name': c[0], - 'roles': c[1], + 'name': name, + 'roles': roles, 'deparment': department }) + if department == 'Music Department': + if 'lyricist' in roles: + self['lyricist'].append(name) + if 'playback singer' in roles: + self['singer'].append(name) if not self['credits']: del self['credits']