some criterion fixes

This commit is contained in:
j 2018-05-07 09:42:15 +01:00
parent fc7769c4cb
commit 758acfe01b

View file

@ -36,15 +36,24 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
html = ox.cache.read_url(data["url"], timeout=timeout) html = ox.cache.read_url(data["url"], timeout=timeout)
data["number"] = find_re(html, "<li>Spine #(\d+)") data["number"] = find_re(html, "<li>Spine #(\d+)")
data["title"] = decode_html(find_re(html, "<h1 class=\"movietitle\">(.*?)</h1>")) data["title"] = decode_html(find_re(html, "<h1 class=\"header__primarytitle\".*?>(.*?)</h1>"))
data["title"] = data["title"].split(u' \u2014 The Television Version')[0].strip() data["title"] = data["title"].split(u' \u2014 The Television Version')[0].strip()
data["director"] = strip_tags(find_re(html, "<h2 class=\"director\">(.*?)</h2>")) results = find_re(html, '<ul class="film-meta-list">(.*?)</ul>')
results = find_re(html, '<div class="left_column">(.*?)</div>') info = re.compile('<li itemprop="(.*?)".*?>(.*?)</li>', re.DOTALL).findall(results)
results = re.compile("<li>(.*?)</li>").findall(results) info = {k: strip_tags(v).strip() for k, v in info}
data["country"] = results[0] if 'director' in info:
data["year"] = results[1] data['director'] =info['director']
if 'countryOfOrigin' in info:
data['country'] =info['countryOfOrigin']
if 'inLanguage' in info:
data['language'] =info['inLanguage']
for v in re.compile('<li>(.*?)</li>', re.DOTALL).findall(results):
if 'datePublished' in v:
data['year'] = strip_tags(v).strip()
elif 'duration' in v:
data['duration'] = strip_tags(v).strip()
data["synopsis"] = decode_html(strip_tags(find_re(html, data["synopsis"] = decode_html(strip_tags(find_re(html,
"<div class=\"content_block last\">.*?<p>(.*?)</p>"))) "<div class=\"product-summary\".*?>.*?<p>(.*?)</p>")))
result = find_re(html, "<div class=\"purchase\">(.*?)</div>") result = find_re(html, "<div class=\"purchase\">(.*?)</div>")
if 'Blu-Ray' in result or 'Essential Art House DVD' in result: if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
@ -63,6 +72,12 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
else: else:
data["posters"] = [] data["posters"] = []
data['posters'] = [re.sub('(\?\d+)$', '', p) for p in data['posters']] data['posters'] = [re.sub('(\?\d+)$', '', p) for p in data['posters']]
data['posters'] = [p for p in data['posters'] if p]
posters = find_re(html, '<div class="product-box-art".*?>(.*?)</div>')
for poster in re.compile('<img src="(.*?)"').findall(posters):
data['posters'].append(poster)
result = find_re(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"") result = find_re(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
if result: if result:
data["stills"] = [result] data["stills"] = [result]