scrapeit/scrapeit/metacritic.py

60 lines
1.8 KiB
Python

# -*- coding: utf-8 -*-
from utils import read_url, read_url_utf8
import re
from urllib import quote
def getMetacriticShowUrl(title):
    """Return the Metacritic URL of the first TV show matching title.

    Runs a TV-show title search on metacritic.com through read_url and
    returns the first ``http://www.metacritic.com/tv/shows/...`` link found
    in the result page, cut off before its ``?`` query string.

    Returns '' when the result page contains no such link.
    """
    # quote() raises KeyError for non-ASCII unicode text on Python 2, so
    # always hand it UTF-8 encoded bytes (a plain str is already bytes there).
    if not isinstance(title, bytes):
        title = title.encode('utf-8')
    title = quote(title)
    search_url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
    search = read_url(search_url)
    # Raw string: '\?' must reach the regex engine as an escaped question
    # mark, and the dots of the host name are matched literally.
    urls = re.compile(r'(http://www\.metacritic\.com/tv/shows/.*?)\?').findall(search)
    if urls:
        return urls[0]
    return ''
def _metacriticScore(text):
    """Convert scraped score text to an int, or -1 when it is not a number."""
    try:
        return int(text.strip())
    except ValueError:
        return -1


def scrapeMetacritic(title, url=None):
    """Scrape the Metascore and the critic reviews of a TV show.

    title -- show title; only used to look up the page when url is not given
    url   -- address of the show's Metacritic page (optional)

    Returns None when no url is given and getMetacriticShowUrl finds none.
    Otherwise returns a dict with the keys:
      score   -- the Metascore as int, -1 when missing or not a number
      critics -- list of dicts, one per review, with the keys
                 score (int, -1 when not a number), publication, critic,
                 quote and link ('' when the review has no link)
      url     -- the page the data was read from
    """
    if not url:
        url = getMetacriticShowUrl(title)
    if not url:
        return None
    data = read_url(url)

    found = re.compile('''ALT="Metascore: (.*?)"''').findall(data)
    score = _metacriticScore(found[0]) if found else -1

    # Take everything from the first review block onwards, with newlines
    # removed so that '.' in the patterns can span former line breaks.
    # A page without any review block simply yields an empty critics list.
    sections = re.compile('(<DIV CLASS="scoreandreview">.*)').findall(data.replace('\n', ''))
    reviews = []
    if sections:
        reg = ('<DIV CLASS="scoreandreview"><DIV CLASS="criticscore">(.*?)</DIV>.*?'
               '<SPAN CLASS="publication">(.*?)'
               '<DIV CLASS="quote">(.*?)'
               '</DIV>.*?</DIV>.*?</DIV>')
        reviews = re.compile(reg).findall(sections[0])

    link_re = re.compile('<A HREF="(.*?)" TARGET="_blank"><IMG SRC="/_images/readreview.gif"')
    metacritics = []
    for r in reviews:
        critic = {}
        critic['score'] = _metacriticScore(r[0])

        # The publication group may be followed by a criticname SPAN.
        publication = r[1].split('</SPAN>')
        criticname = ''
        if len(publication) > 1:
            criticname = publication[1].replace('<SPAN CLASS="criticname">', '').strip()
        critic['publication'] = publication[0]
        critic['critic'] = criticname

        # The quote text comes first; an optional "read review" link
        # follows after a <BR>.
        quote_parts = r[2].split('<BR>')
        link = ''
        if len(quote_parts) > 1:
            links = link_re.findall(quote_parts[1])
            if links:
                link = links[0]
        critic['quote'] = quote_parts[0].strip()
        critic['link'] = link
        metacritics.append(critic)
    return dict(score=score, critics=metacritics, url=url)