criterion: decode some html

This commit is contained in:
j 2015-08-02 15:58:59 +02:00
parent 86bffd67b3
commit 77f34143f5

View file

@@ -5,7 +5,7 @@ import re
import ox.cache
from ox.cache import read_url
from ox.html import strip_tags
from ox.html import strip_tags, decode_html
from ox.text import find_re
import imdb
@@ -36,14 +36,15 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
html = ox.cache.read_url(data["url"], timeout=timeout)
data["number"] = find_re(html, "<li>Spine #(\d+)")
data["title"] = find_re(html, "<h1 class=\"movietitle\">(.*?)</h1>")
data["title"] = data["title"].split(u' \u2014 The Television Version')[0]
data["title"] = decode_html(find_re(html, "<h1 class=\"movietitle\">(.*?)</h1>"))
data["title"] = data["title"].split(u' \u2014 The Television Version')[0].strip()
data["director"] = strip_tags(find_re(html, "<h2 class=\"director\">(.*?)</h2>"))
results = find_re(html, '<div class="left_column">(.*?)</div>')
results = re.compile("<li>(.*?)</li>").findall(results)
data["country"] = results[0]
data["year"] = results[1]
data["synopsis"] = strip_tags(find_re(html, "<div class=\"content_block last\">.*?<p>(.*?)</p>"))
data["synopsis"] = decode_html(strip_tags(find_re(html,
"<div class=\"content_block last\">.*?<p>(.*?)</p>")))
result = find_re(html, "<div class=\"purchase\">(.*?)</div>")
if 'Blu-Ray' in result or 'Essential Art House DVD' in result: