diff --git a/ox/web/metacritic.py b/ox/web/metacritic.py index 34e20a3..adb6100 100644 --- a/ox/web/metacritic.py +++ b/ox/web/metacritic.py @@ -2,10 +2,16 @@ # vi:si:et:sw=4:sts=4:ts=4 import re from urllib import quote +from lxml.html import document_fromstring from ox.cache import readUrl, readUrlUnicode -from ox import findRe, decodeHtml, stripTags +from ox import findRe, stripTags +def getUrlByImdb(imdb): + url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb + data = readUrl(url) + metacritic_url = findRe(data, '"(http://www.metacritic.com/movie/.*?)"') + return metacritic_url or None def getMetacriticShowUrl(title): title = quote(title) @@ -13,33 +19,39 @@ def getMetacriticShowUrl(title): data = readUrl(url) return findRe(data, '(http://www.metacritic.com/tv/shows/.*?)\?') -def getData(title, url=None): - if not url: - url = getMetacriticShowUrl(title) - if not url: - return None - data = readUrlUnicode(url) - score = findRe(data, 'ALT="Metascore: (.*?)"') - if score: - score = int(score) - else: - score = -1 +def getData(url): + data = readUrlUnicode(url) + doc = document_fromstring(data) + score = filter(lambda s: s.attrib.get('property') == 'v:average', + doc.xpath('//span[@class="score_value"]')) + if score: + score = int(score[0].text) + else: + score = -1 + authors = [a.text + for a in doc.xpath('//div[@class="review_content"]//div[@class="author"]//a')] + publications = [d.text + for d in doc.xpath('//div[@class="review_content"]//div[@class="source"]/a')] + reviews = [d.text + for d in doc.xpath('//div[@class="review_content"]//div[@class="review_body"]')] + scores = [int(d.text.strip()) + for d in doc.xpath('//div[@class="review_content"]//div[contains(@class, "critscore")]')] + links = [a.attrib['href'] + for a in doc.xpath('//div[@class="review_content"]//a[contains(@class, "external")]')] - reviews = re.compile( - '