# scrapeit/scrapeit/metacritic.py

# -*- coding: utf-8 -*-

from utils  import read_url, read_url_utf8
import re
from urllib import quote

def getMetacriticShowUrl(title):
  """Look up the Metacritic page URL for a TV show title.

  The title is URL-quoted, substituted into the Metacritic search URL, and
  the first show link found in the returned page is used.

  Args:
    title: show title to search for.

  Returns:
    The first ``http://www.metacritic.com/tv/shows/...`` link in the search
    result page, cut off just before its ``?``. Only links that are followed
    by a ``?`` are recognised. Returns '' when no such link is present.
  """
  search_url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % quote(title)
  search = read_url(search_url)
  # Raw string so that ``\?`` is a regex escape instead of an invalid string
  # escape; the dots are escaped so that only the literal hostname matches.
  match = re.search(r'(http://www\.metacritic\.com/tv/shows/.*?)\?', search)
  if match:
    return match.group(1)
  return ''

# One critic review: captures (score, publication markup, quote markup).
# Applied to the page with newlines removed, so ``.`` can span what were
# originally separate lines.
_METACRITIC_REVIEW_RE = re.compile(
  r'<DIV CLASS="scoreandreview"><DIV CLASS="criticscore">(.*?)</DIV>.*?'
  r'<SPAN CLASS="publication">(.*?)'
  r'<DIV CLASS="quote">(.*?)'
  r'</DIV>.*?</DIV>.*?</DIV>')

# Link wrapped around the "read review" image that follows a quote.
_METACRITIC_LINK_RE = re.compile(
  r'<A HREF="(.*?)" TARGET="_blank"><IMG SRC="/_images/readreview.gif"')


def _metacriticScore(text, default=-1):
  """Convert scraped score text to an int, or `default` if it is not a number."""
  try:
    return int(text.strip())
  except ValueError:
    return default


def _parseMetacriticReview(fields):
  """Build a critic dict from one (score, publication, quote) regex match."""
  score_html, publication_html, quote_html = fields

  # The publication capture may carry a trailing critic-name span:
  #   publication</SPAN><SPAN CLASS="criticname">name
  publication_parts = publication_html.split('</SPAN>')
  critic_name = ''
  if len(publication_parts) > 1:
    critic_name = publication_parts[1].replace('<SPAN CLASS="criticname">', '').strip()

  # The quote text comes first; anything after <BR> may hold the review link.
  quote_parts = quote_html.split('<BR>')
  link = ''
  if len(quote_parts) > 1:
    link_match = _METACRITIC_LINK_RE.search(quote_parts[1])
    # A <BR> without a "read review" link leaves the link empty rather than
    # raising IndexError.
    if link_match:
      link = link_match.group(1)

  return {
    'score': _metacriticScore(score_html),
    'publication': publication_parts[0],
    'critic': critic_name,
    'quote': quote_parts[0].strip(),
    'link': link,
  }


def scrapeMetacritic(title, url=None):
  """Scrape the Metascore and critic reviews for a TV show from Metacritic.

  Args:
    title: show title; used to search for the show page when `url` is not
      given.
    url: optional URL of the show page. When empty or None it is looked up
      with getMetacriticShowUrl(title).

  Returns:
    None when no show URL is available. Otherwise a dict with:
      score: the Metascore as an int, or -1 when the page has none or it is
        not a number.
      critics: list of dicts with the keys 'score', 'publication', 'critic',
        'quote' and 'link'. It is empty when the page contains no reviews.
        A critic score that is not a number is reported as -1.
      url: the URL that was scraped.
  """
  if not url:
    url = getMetacriticShowUrl(title)
  if not url:
    return None
  data = read_url(url)

  score = -1
  score_match = re.search(r'ALT="Metascore: (.*?)"', data)
  if score_match:
    score = _metacriticScore(score_match.group(1))

  # Every review match starts with the "scoreandreview" marker, so searching
  # the flattened page directly finds the same reviews as first slicing it
  # from the first marker onwards, and a page without any review no longer
  # raises IndexError.
  flat_page = data.replace('\n', '')
  critics = [_parseMetacriticReview(fields)
             for fields in _METACRITIC_REVIEW_RE.findall(flat_page)]

  return dict(score=score, critics=critics, url=url)