cleanup
This commit is contained in:
parent
fccc9006d4
commit
73ec7e7aeb
3 changed files with 63 additions and 227 deletions
60
scrapeit/metacritic.py
Normal file
60
scrapeit/metacritic.py
Normal file
|
@ -0,0 +1,60 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from utils import read_url, read_url_utf8
|
||||||
|
import re
|
||||||
|
from urllib import quote
|
||||||
|
|
||||||
|
def getMetacriticShowUrl(title):
    """Search metacritic.com for a TV show and return the first show-page
    URL found, or '' when the search yields no match.

    The search result page is scanned for links of the form
    http://www.metacritic.com/tv/shows/<slug>?...; the query string is
    dropped by the capturing group.
    """
    quoted_title = quote(title)
    search_url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % quoted_title
    page = read_url(search_url)
    matches = re.compile('(http://www.metacritic.com/tv/shows/.*?)\?').findall(page)
    if not matches:
        return ''
    return matches[0]
|
||||||
|
|
||||||
|
def scrapeMetacritic(title, url=None):
    """Scrape metacritic data for a TV show.

    Returns a dict with keys 'score' (int metascore, -1 when not found),
    'critics' (list of per-review dicts with 'score', 'publication',
    'critic', 'quote', 'link') and 'url' (the show page used), or None
    when no show page can be located.
    """
    # resolve the show page via search when the caller did not supply one
    if not url:
        url = getMetacriticShowUrl(title)
    if not url:
        return None
    data = read_url(url)
    # the overall metascore is embedded in an image ALT attribute
    score = re.compile('''ALT="Metascore: (.*?)"''').findall(data)
    if score:
        score = int(score[0])
    else:
        score = -1

    # collapse newlines so the multi-line review markup matches a single regex pass;
    # IndexError here would mean the page layout changed - TODO confirm acceptable
    reviews = re.compile('(<DIV CLASS="scoreandreview">.*)').findall(data.replace('\n',''))[0]

    # pattern written across lines for readability, then flattened;
    # NOTE(review): assumes the continuation lines carry no leading
    # whitespace that would survive into the flattened pattern - confirm
    reg = '''<DIV CLASS="scoreandreview"><DIV CLASS="criticscore">(.*?)</DIV>.*?
<SPAN CLASS="publication">(.*?)
<DIV CLASS="quote">(.*?)
</DIV>.*?</DIV>.*?</DIV>
'''.replace('\n','')
    reviews = re.compile(reg).findall(reviews)

    metacritics = []
    for r in reviews:
        critic ={}
        # r = (criticscore, publication markup, quote markup)
        critic['score'] = int(r[0].strip())
        publication = r[1].split('</SPAN>')
        criticname = ''
        # the critic's name follows the publication in its own SPAN, when present
        if len(publication) > 1:
            criticname = publication[1].replace('<SPAN CLASS="criticname">','').strip()
        publication = publication[0]
        critic['publication'] = publication
        critic['critic'] = criticname
        # the quote may be followed by a "read review" link after a <BR>
        # (this local deliberately shadows urllib.quote within the loop)
        quote = r[2].split('<BR>')
        link = ''
        if len(quote) > 1:
            link = re.compile('<A HREF="(.*?)" TARGET="_blank"><IMG SRC="/_images/readreview.gif"').findall(quote[1])[0]
        quote = quote[0].strip()
        critic['quote'] = quote
        critic['link'] = link
        metacritics.append(critic)

    return dict(score = score, critics = metacritics, url = url)
|
||||||
|
|
||||||
|
|
|
@ -2,79 +2,13 @@
|
||||||
# -*- Mode: Python; -*-
|
# -*- Mode: Python; -*-
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# vi:si:et:sw=2:sts=2:ts=2
|
# vi:si:et:sw=2:sts=2:ts=2
|
||||||
|
|
||||||
from os.path import *
|
|
||||||
import sys
|
|
||||||
import datetime
|
|
||||||
import time
|
|
||||||
import re
|
|
||||||
from urllib2 import urlopen
|
|
||||||
from urllib import quote
|
from urllib import quote
|
||||||
import Image
|
|
||||||
import StringIO
|
|
||||||
|
|
||||||
import feedparser
|
import feedparser
|
||||||
|
|
||||||
from utils import read_url
|
from utils import read_url
|
||||||
|
|
||||||
|
|
||||||
hr_hdtv = re.compile('HR HDTV')
|
|
||||||
hdtv = re.compile('HDTV')
|
|
||||||
|
|
||||||
def get_url(title):
    """Turn a show title into its URL slug: spaces and slashes become
    underscores, everything lowercased."""
    slug = title.replace(' ', '_')
    slug = slug.replace('/', '_')
    return slug.lower()
|
|
||||||
|
|
||||||
def get_show(string):
    """Extract the show name from a summary like 'Show Name: X; ...':
    the value after the colon in the first ';'-separated field."""
    first_field = string.split(';')[0]
    value = first_field.split(':')[1]
    return value.strip()
|
|
||||||
|
|
||||||
def get_title(string):
    """Episode title from the second ';'-separated field of a summary;
    returns '' when the feed reports 'n/a'."""
    second_field = string.split(';')[1]
    value = second_field.split(':')[1].strip()
    return '' if value == 'n/a' else value
|
|
||||||
|
|
||||||
def get_season(string):
    """Parse the season number from the third ';'-separated field of a
    summary (e.g. '...; Season: 3; ...').

    Returns the season as an int, or None when the field is missing or
    not numeric.
    """
    try:
        season = int(string.split(';')[2].split(':')[1].strip())
    # was a bare except; narrowed so unrelated errors are not swallowed:
    # IndexError = field missing, ValueError = non-numeric value
    except (IndexError, ValueError):
        return None
    return season
|
|
||||||
|
|
||||||
def get_episode(string):
    """Parse the episode number from the fourth ';'-separated field of a
    summary (e.g. '...; Episode: 4').

    Returns the episode as an int, or None when the field is missing or
    not numeric.
    """
    try:
        episode = int(string.split(';')[3].split(':')[1].strip())
    # was a bare except; narrowed so unrelated errors are not swallowed:
    # IndexError = field missing, ValueError = non-numeric value
    except (IndexError, ValueError):
        return None
    return episode
|
|
||||||
|
|
||||||
def get_episodedate(string):
    """Return the stripped text after a single 'Episode Date:' marker, or
    None when the marker is absent (or appears more than once)."""
    parts = string.split('Episode Date:')
    if len(parts) != 2:
        return None
    return parts[1].strip()
|
|
||||||
|
|
||||||
def get_episode_string(string):
    """Build a canonical episode identifier from a feed summary:
    'SxxEyy' when both season and episode are known, otherwise the air
    date, otherwise None."""
    episode = get_episode(string)
    season = get_season(string)
    episodedate = get_episodedate(string)
    if season and episode:
        return "S%02dE%02d" % (season, episode)
    if episodedate:
        return episodedate
    return None
|
|
||||||
|
|
||||||
def choose_item(old, new):
    """Decide whether feed item *new* should replace *old* for the same
    episode.

    Returns False when both carry the same torrent link; otherwise True
    only when the stored item is not an HDTV release but the new one is.
    """
    if old['link'] == new['link']:
        return False
    # upgrade path: prefer an HDTV release over a non-HDTV one
    if not hdtv.search(old['title']):
        if hdtv.search(new['title']):
            # NOTE(review): display_item and log are not defined anywhere
            # in this view - confirm they exist at runtime
            display_item(new)
            log.debug("vs.")
            display_item(old)
            return True
    return False
|
|
||||||
|
|
||||||
def get_episodes(show_title):
|
def get_episodes(show_title):
|
||||||
search_url = "http://tvrss.net/search/index.php?distribution_group=combined&show_name=%s&show_name_exact=true&filename=&date=&quality=&release_group=&mode=rss" % quote(show_title)
|
search_url = "http://tvrss.net/search/index.php?distribution_group=combined&show_name=%s&show_name_exact=true&filename=&date=&quality=&release_group=&mode=rss" % quote(show_title)
|
||||||
data = read_url(search_url)
|
data = read_url(search_url)
|
||||||
|
@ -84,162 +18,3 @@ def get_episodes(show_title):
|
||||||
episode = get_episode_string(t['summary'])
|
episode = get_episode_string(t['summary'])
|
||||||
episodes[episode] = t['enclosures'][0]['href']
|
episodes[episode] = t['enclosures'][0]['href']
|
||||||
return episodes
|
return episodes
|
||||||
|
|
||||||
def get_thumbnail(url):
    """Download an image and return a 100x100 JPEG thumbnail as a byte
    string, or None when downloading or decoding fails (best-effort)."""
    try:
        thumbnail = read_url(url)
        im = Image.open(StringIO.StringIO(thumbnail))
        out = StringIO.StringIO()
        # scale to 100px wide preserving aspect ratio, then crop a square
        width = 100
        height = int((100.0 / im.size[0]) * im.size[1])
        im = im.resize((width, height))
        im.crop((0,0,100,100)).convert().save(out, 'JPEG')
        thumbnail = out.getvalue()
    except:
        # deliberate best-effort: any network/PIL failure yields no thumbnail
        thumbnail = None
    return thumbnail
|
|
||||||
|
|
||||||
def get_imdbdata(imdbid):
    """Fetch IMDb data for *imdbid*.

    Returns a tuple (imdb_result, description, thumbnail). The description
    falls back through plot -> plot_outline -> tagline; the thumbnail is
    fetched from the poster URL when one exists. When parsing fails the
    result is (falsy parse result, '', None).
    """
    thumbnail = None
    description=''
    i = imdb.IMDb(imdbid).parse()
    if i:
        poster = i['poster']
        # IMDb serves this placeholder image when no real poster exists
        if poster != 'http://i.imdb.com/Heads/npa.gif':
            log.debug("getting poster %s" % poster)
            thumbnail = get_thumbnail(poster)
        if i['plot']:
            description=i['plot']
        elif i['plot_outline']:
            description=i['plot_outline']
        else:
            description=i['tagline']

        return (i, description, thumbnail)
    else:
        return(i, '', None)
|
|
||||||
|
|
||||||
|
|
||||||
def load():
    """Pull the combined tvrss feed, group items by show and episode, and
    persist new shows/episodes via the model layer.

    Side effects: creates model.Shows / model.Episodes rows, attaches
    metacritic reviews, and commits through model.hub.
    """
    log.debug("getting new shows from tvrss...")
    feed = feedparser.parse('http://tvrss.net/feed/combined/')
    # shows maps show name -> {episode string -> best feed item seen}
    shows = {}
    for item in feed['entries']:
        show = get_show(item['description'])
        estring = get_episode_string(item['description'])
        if estring:
            # skip HR HDTV releases entirely
            if show and not hr_hdtv.search(item['title']):
                if shows.has_key(show):
                    if shows[show].has_key(estring):
                        # keep whichever release choose_item prefers
                        if choose_item(shows[show][estring], item):
                            shows[show][estring] = item
                    else:
                        shows[show][estring] = item
                else:
                    shows[show] = {}
                    shows[show][estring] = item
    for show in shows:
        imdb = None
        # FIX: imdbid was previously unbound when IMDb.guess raised below,
        # producing a NameError at model.Shows(imdb=imdbid)
        imdbid = None
        try:
            # a hit here means the show is blacklisted; skip it
            model.ShowsBlacklist.byShowUrl(get_url(show))
            log.debug("ignoring blacklisted show %s" % show)
            continue
        except:
            pass
        s = None
        try:
            s = model.Shows.byUrl(get_url(show))
        except SQLObjectNotFound:
            # not stored under its own URL; try the alias table
            try:
                alias = model.ShowsAlias.byAlias(get_url(show))
                s = alias.show
            except SQLObjectNotFound:
                s = None
        if not s:
            # unknown show: enrich with IMDb + metacritic data and store it
            log.debug("about to add %s" % show)
            thumbnail = None
            description=''
            ur = '-'
            try:
                imdbid = IMDb.guess(show)
                if imdbid:
                    imdb, description, thumbnail = get_imdbdata(imdbid)
                    if imdb:
                        ur = imdb['rating']
            except:
                import traceback
                # FIX: was 'print ptraceback.print_exc()' - the typo raised
                # NameError inside this handler instead of logging the error
                traceback.print_exc()
            s= model.Shows(
                title = show,
                url = get_url(show),
                description = description,
                imdb = imdbid,
                imdbUserRating = ur
            )
            s.thumbnail = thumbnail
            meta = metacritic.scrapeMetacritic(s.title, s.metacriticUrl)
            if meta:
                s.metacriticUrl = meta['url']
                s.metacriticScore = "%s" % meta['score']
                for review in meta['critics']:
                    model.addReview(s, review)
            model.hub.commit()
            log.debug('added %s' % show)
        for episode in shows[show]:
            episode_title = get_title(shows[show][episode]['description'])
            episode_description = ''
            episode_imdb = ''
            q = model.Episodes.select(AND(
                model.Episodes.q.showID == s.id,
                model.Episodes.q.episode == episode))
            if q.count() == 0:
                # lazily parse full IMDb data the first time it is needed
                if not imdb:
                    try:
                        imdbid = IMDb.guess(show)
                        if imdbid:
                            imdb = IMDb.parse(imdbid)
                    except:
                        pass
                if imdb and imdb['episodes'].has_key(episode):
                    episode_title = imdb['episodes'][episode]['title']
                    episode_description = imdb['episodes'][episode]['description']
                    episode_imdb = imdb['episodes'][episode]['imdb']
                # fall back to tv.com for whatever IMDb did not provide
                if not episode_description or not episode_title:
                    tvcom_data = tvcom.get(show, episode)
                    if not episode_description:
                        episode_description = tvcom_data['description']
                    if not episode_title:
                        episode_title = tvcom_data['title']
                e = model.Episodes(
                    showID = s.id,
                    title = episode_title,
                    episode = episode,
                    torrent = shows[show][episode]['enclosures'][0]['href'],
                    description = episode_description,
                    imdb = episode_imdb,
                    thumbnail = None,
                    pubDate = datetime.datetime.fromtimestamp(time.mktime(shows[show][episode]['updated_parsed']))
                )
                s.lastUpdate = datetime.datetime.now()
                model.hub.commit()
                log.debug("from tvrss add %s %s" %(episode, show))
    log.debug("updating tvrss done.")
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # first look on the command line for a desired config file,
    # if it's not on the command line, then
    # look for setup.py in this directory. If it's not there, this script is
    # probably installed
    # NOTE(review): turbogears and exists/join/dirname (from os.path import *)
    # come from module-level imports not all visible in this view - confirm
    if len(sys.argv) > 1:
        turbogears.update_config(configfile=sys.argv[1],
            modulename="btvcr.config")
    elif exists(join(dirname(__file__), "setup.py")):
        # development checkout: use the dev config next to setup.py
        turbogears.update_config(configfile="dev.cfg",
            modulename="btvcr.config")
    else:
        # installed: fall back to the production config
        turbogears.update_config(configfile="prod.cfg",
            modulename="btvcr.config")

    # imported for its side effects on the turbogears app before loading
    from btvcr.controllers import Root
    load()
|
|
||||||
|
|
|
@ -126,7 +126,8 @@ def html_entity_decode(s, encoding = 'utf-8'):
|
||||||
|
|
||||||
def stripTags(s):
    """Decode HTML entities in *s* and strip all HTML tags, returning the
    stripped text; returns u'' for falsy input."""
    if s:
        s = htmldecode(s)

        return djangohtml.strip_tags(s).strip()
    return u''

# alias kept for callers using the underscore-style name
strip_tags=stripTags
|
||||||
|
@ -140,7 +141,7 @@ charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
|
||||||
def htmldecode(text):
|
def htmldecode(text):
|
||||||
"""Decode HTML entities in the given text."""
|
"""Decode HTML entities in the given text."""
|
||||||
if type(text) != unicode:
|
if type(text) != unicode:
|
||||||
text = unicode(text)
|
text = unicode(text)[:]
|
||||||
if type(text) is unicode:
|
if type(text) is unicode:
|
||||||
uchr = unichr
|
uchr = unichr
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in a new issue