criterion: decode some html
This commit is contained in:
parent
86bffd67b3
commit
77f34143f5
1 changed files with 5 additions and 4 deletions
|
@ -5,7 +5,7 @@ import re
|
|||
|
||||
import ox.cache
|
||||
from ox.cache import read_url
|
||||
from ox.html import strip_tags
|
||||
from ox.html import strip_tags, decode_html
|
||||
from ox.text import find_re
|
||||
|
||||
import imdb
|
||||
|
@ -36,14 +36,15 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
|
|||
html = ox.cache.read_url(data["url"], timeout=timeout)
|
||||
data["number"] = find_re(html, "<li>Spine #(\d+)")
|
||||
|
||||
data["title"] = find_re(html, "<h1 class=\"movietitle\">(.*?)</h1>")
|
||||
data["title"] = data["title"].split(u' \u2014 The Television Version')[0]
|
||||
data["title"] = decode_html(find_re(html, "<h1 class=\"movietitle\">(.*?)</h1>"))
|
||||
data["title"] = data["title"].split(u' \u2014 The Television Version')[0].strip()
|
||||
data["director"] = strip_tags(find_re(html, "<h2 class=\"director\">(.*?)</h2>"))
|
||||
results = find_re(html, '<div class="left_column">(.*?)</div>')
|
||||
results = re.compile("<li>(.*?)</li>").findall(results)
|
||||
data["country"] = results[0]
|
||||
data["year"] = results[1]
|
||||
data["synopsis"] = strip_tags(find_re(html, "<div class=\"content_block last\">.*?<p>(.*?)</p>"))
|
||||
data["synopsis"] = decode_html(strip_tags(find_re(html,
|
||||
"<div class=\"content_block last\">.*?<p>(.*?)</p>")))
|
||||
|
||||
result = find_re(html, "<div class=\"purchase\">(.*?)</div>")
|
||||
if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
|
||||
|
|
Loading…
Reference in a new issue