diff --git a/ox/web/metacritic.py b/ox/web/metacritic.py index 34e20a3..adb6100 100644 --- a/ox/web/metacritic.py +++ b/ox/web/metacritic.py @@ -2,10 +2,16 @@ # vi:si:et:sw=4:sts=4:ts=4 import re from urllib import quote +from lxml.html import document_fromstring from ox.cache import readUrl, readUrlUnicode -from ox import findRe, decodeHtml, stripTags +from ox import findRe, stripTags +def getUrlByImdb(imdb): + url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb + data = readUrl(url) + metacritic_url = findRe(data, '"(http://www.metacritic.com/movie/.*?)"') + return metacritic_url or None def getMetacriticShowUrl(title): title = quote(title) @@ -13,33 +19,39 @@ def getMetacriticShowUrl(title): data = readUrl(url) return findRe(data, '(http://www.metacritic.com/tv/shows/.*?)\?') -def getData(title, url=None): - if not url: - url = getMetacriticShowUrl(title) - if not url: - return None - data = readUrlUnicode(url) - score = findRe(data, 'ALT="Metascore: (.*?)"') - if score: - score = int(score) - else: - score = -1 +def getData(url): + data = readUrlUnicode(url) + doc = document_fromstring(data) + score = filter(lambda s: s.attrib.get('property') == 'v:average', + doc.xpath('//span[@class="score_value"]')) + if score: + score = int(score[0].text) + else: + score = -1 + authors = [a.text + for a in doc.xpath('//div[@class="review_content"]//div[@class="author"]//a')] + publications = [d.text + for d in doc.xpath('//div[@class="review_content"]//div[@class="source"]/a')] + reviews = [d.text + for d in doc.xpath('//div[@class="review_content"]//div[@class="review_body"]')] + scores = [int(d.text.strip()) + for d in doc.xpath('//div[@class="review_content"]//div[contains(@class, "critscore")]')] + links = [a.attrib['href'] + for a in doc.xpath('//div[@class="review_content"]//a[contains(@class, "external")]')] - reviews = re.compile( - '
(.*?)
' - '.*?(.*?)' - '.*?(.*?)
' - '.*?
(.*?)
' - '.*?(.*?)') if '(' in r['title']: r['year'] = findRe(r['title'], '\((\d*?)\)') - r['title'] = re.sub('\((\d*?)\)', '', r['title']).strip() - r['synopsis'] = findRe(data, '(.*?)') - r['average rating'] = findRe(data, '
(.*?)
').strip() + r['title'] = stripTags(re.sub('\((\d*?)\)', '', r['title'])).strip() + r['summary'] = stripTags(findRe(data, '

(.*?)

')).strip() + r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace(' ', ' ').replace(' ', ' ') + if not r['summary']: + r['summary'] = get_og(data, 'description') + + meter = re.compile('(.*?)').findall(data) + meter = filter(lambda m: m[1].isdigit(), meter) + if meter: + r['tomatometer'] = meter[0][1] + r['rating'] = findRe(data, 'Average Rating: ([\d.]+)/10') + r['user_score'] = findRe(data, '(\d+)') + r['user_rating'] = findRe(data, 'Average Rating: ([\d.]+)/5') + poster = get_og(data, 'image') + if poster and not 'poster_default.gif' in poster: + r['posters'] = [poster] + for key in r.keys(): + if not r[key]: + del r[key] return r