parse lyricist, singer

criterion is https now
2018-07-09 15:20:13 +02:00 · 2018-07-09 15:20:00 +02:00
2 changed files with 14 additions and 5 deletions
--- a/ox/web/criterion.py
+++ b/ox/web/criterion.py
@@ -14,7 +14,7 @@ def get_id(url):
    return url.split("/")[-1]
 def get_url(id):
-    return "http://www.criterion.com/films/%s" % id
+    return "https://www.criterion.com/films/%s" % id
 def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
    '''
@@ -67,7 +67,7 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
        data["posters"] = [result]
    else:
        html_ = read_url(result, unicode=True)
-        result = find_re(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
+        result = find_re(html_, '//www.criterion.com/films/%s.*?">(.*?)</a>' % id)
        result = find_re(result, "src=\"(.*?)\"")
        if result:
            data["posters"] = [result.replace("_w100", "")]
@@ -102,7 +102,7 @@ def get_ids(page=None):
    ids += results
    results = re.compile("boxsets/(.*?)\"").findall(html)
    for result in results:
-        html = read_url("http://www.criterion.com/boxsets/" + result, unicode=True)
+        html = read_url("https://www.criterion.com/boxsets/" + result, unicode=True)
        results = re.compile("films/(\d+)-").findall(html)
        ids += results
    return sorted(set(ids), key=int)
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -512,14 +512,23 @@ class Imdb(SiteParser):
            credits = [c for c in credits if c[1]]
            self['credits'] = []
            self['lyricist'] = []
            self['singer'] = []
            for department, crew in credits:
                department = department.replace('(in alphabetical order)', '').strip()
                for c in crew:
                    name = c[0]
                    roles = c[1]
                    self['credits'].append({
-                        'name': c[0],
+                        'name': name,
-                        'roles': c[1],
+                        'roles': roles,
                        'deparment': department
                    })
                    if department == 'Music Department':
                        if 'lyricist' in roles:
                            self['lyricist'].append(name)
                        if 'playback singer' in roles:
                            self['singer'].append(name)
            if not self['credits']:
                del self['credits']
Author	SHA1	Message	Date
j	7041d1b316	parse lyricist, singer	2018-07-09 15:20:13 +02:00
j	d21b0b675c	criterion is https now	2018-07-09 15:20:00 +02:00