j 2007-11-29 21:25:19 +00:00
parent fccc9006d4
commit 73ec7e7aeb
3 changed files with 63 additions and 227 deletions

scrapeit/metacritic.py Normal file

@@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
from utils import read_url, read_url_utf8
import re
from urllib import quote

def getMetacriticShowUrl(title):
    title = quote(title)
    search_url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
    search = read_url(search_url)
    urls = re.compile('(http://www.metacritic.com/tv/shows/.*?)\?').findall(search)
    if urls:
        return urls[0]
    else:
        return ''

def scrapeMetacritic(title, url=None):
    if not url:
        url = getMetacriticShowUrl(title)
    if not url:
        return None
    data = read_url(url)
    score = re.compile('''ALT="Metascore: (.*?)"''').findall(data)
    if score:
        score = int(score[0])
    else:
        score = -1
    reviews = re.compile('(<DIV CLASS="scoreandreview">.*)').findall(data.replace('\n',''))[0]
    reg = '''<DIV CLASS="scoreandreview"><DIV CLASS="criticscore">(.*?)</DIV>.*?
<SPAN CLASS="publication">(.*?)
<DIV CLASS="quote">(.*?)
</DIV>.*?</DIV>.*?</DIV>
'''.replace('\n','')
    reviews = re.compile(reg).findall(reviews)
    metacritics = []
    for r in reviews:
        critic = {}
        critic['score'] = int(r[0].strip())
        publication = r[1].split('</SPAN>')
        criticname = ''
        if len(publication) > 1:
            criticname = publication[1].replace('<SPAN CLASS="criticname">','').strip()
        publication = publication[0]
        critic['publication'] = publication
        critic['critic'] = criticname
        quote = r[2].split('<BR>')
        link = ''
        if len(quote) > 1:
            link = re.compile('<A HREF="(.*?)" TARGET="_blank"><IMG SRC="/_images/readreview.gif"').findall(quote[1])[0]
        quote = quote[0].strip()
        critic['quote'] = quote
        critic['link'] = link
        metacritics.append(critic)
    return dict(score = score, critics = metacritics, url = url)
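
A minimal usage sketch for the new module (not part of the commit): it assumes scrapeit is importable as a package and uses a placeholder show title; scrapeMetacritic returns None when no Metacritic page can be found.

from scrapeit import metacritic

meta = metacritic.scrapeMetacritic('Some Show')   # 'Some Show' is only a placeholder
if meta:
    print meta['url'], meta['score']
    for critic in meta['critics']:
        print critic['publication'], critic['score']
        print critic['quote'], critic['link']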


@@ -2,79 +2,13 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
from os.path import *
import sys
import datetime
import time
import re
from urllib2 import urlopen
from urllib import quote
import Image
import StringIO
import feedparser
from utils import read_url
hr_hdtv = re.compile('HR HDTV')
hdtv = re.compile('HDTV')
def get_url(title):
  return title.replace(' ','_').replace('/', '_').lower()

def get_show(string):
  return string.split(';')[0].split(':')[1].strip()

def get_title(string):
  title = string.split(';')[1].split(':')[1].strip()
  if title != 'n/a':
    return title
  return ''

def get_season(string):
  try:
    season = int(string.split(';')[2].split(':')[1].strip())
  except:
    return None
  return season

def get_episode(string):
  try:
    episode = int(string.split(';')[3].split(':')[1].strip())
  except:
    return None
  return episode

def get_episodedate(string):
  s = string.split('Episode Date:')
  if len(s) == 2:
    return s[1].strip()
  return None

def get_episode_string(string):
  episode = get_episode(string)
  season = get_season(string)
  episodedate = get_episodedate(string)
  estring = None
  if season and episode:
    estring = "S%02dE%02d" % (season, episode)
  elif episodedate:
    estring = episodedate
  return estring
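
The helpers above all slice the tvrss summary field on ';' and ':'. The sample string below is made up to illustrate the expected layout (the field names are inferred from the parsing code, not taken from a real feed item):

summary = "Show Name: Some Show; Episode Title: n/a; Season: 3; Episode: 7"
get_show(summary)            # 'Some Show'
get_title(summary)           # '' (titles given as 'n/a' are dropped)
get_season(summary)          # 3
get_episode(summary)         # 7
get_episode_string(summary)  # 'S03E07'
get_episode_string("Show Name: Some Show; Episode Date: 2007-11-28")  # '2007-11-28' (date fallback when season/episode are missing)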
def choose_item(old, new):
  if old['link'] == new['link']:
    return False
  if not hdtv.search(old['title']):
    if hdtv.search(new['title']):
      display_item(new)
      log.debug("vs.")
      display_item(old)
      return True
  return False

def get_episodes(show_title):
  search_url = "http://tvrss.net/search/index.php?distribution_group=combined&show_name=%s&show_name_exact=true&filename=&date=&quality=&release_group=&mode=rss" % quote(show_title)
  data = read_url(search_url)
@@ -84,162 +18,3 @@ def get_episodes(show_title):
    episode = get_episode_string(t['summary'])
    episodes[episode] = t['enclosures'][0]['href']
  return episodes
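
Judging from the visible tail of get_episodes, the function collects one torrent enclosure per episode string; a hypothetical result would look like this (the URLs are invented):

get_episodes('Some Show')
# {'S03E07': 'http://example.com/some.show.s03e07.torrent',
#  'S03E08': 'http://example.com/some.show.s03e08.torrent'}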
def get_thumbnail(url):
  try:
    thumbnail = read_url(url)
    im = Image.open(StringIO.StringIO(thumbnail))
    out = StringIO.StringIO()
    width = 100
    height = int((100.0 / im.size[0]) * im.size[1])
    im = im.resize((width, height))
    im.crop((0,0,100,100)).convert().save(out, 'JPEG')
    thumbnail = out.getvalue()
  except:
    thumbnail = None
  return thumbnail

def get_imdbdata(imdbid):
  thumbnail = None
  description = ''
  i = imdb.IMDb(imdbid).parse()
  if i:
    poster = i['poster']
    if poster != 'http://i.imdb.com/Heads/npa.gif':
      log.debug("getting poster %s" % poster)
      thumbnail = get_thumbnail(poster)
    if i['plot']:
      description = i['plot']
    elif i['plot_outline']:
      description = i['plot_outline']
    else:
      description = i['tagline']
    return (i, description, thumbnail)
  else:
    return (i, '', None)

def load():
  log.debug("getting new shows from tvrss...")
  feed = feedparser.parse('http://tvrss.net/feed/combined/')
  shows = {}
  for item in feed['entries']:
    show = get_show(item['description'])
    estring = get_episode_string(item['description'])
    if estring:
      if show and not hr_hdtv.search(item['title']):
        if shows.has_key(show):
          if shows[show].has_key(estring):
            if choose_item(shows[show][estring], item):
              shows[show][estring] = item
          else:
            shows[show][estring] = item
        else:
          shows[show] = {}
          shows[show][estring] = item
  for show in shows:
    imdb = None
    try:
      model.ShowsBlacklist.byShowUrl(get_url(show))
      log.debug("ignoring blacklisted show %s" % show)
      continue
    except:
      pass
    s = None
    try:
      s = model.Shows.byUrl(get_url(show))
    except SQLObjectNotFound:
      try:
        alias = model.ShowsAlias.byAlias(get_url(show))
        s = alias.show
      except SQLObjectNotFound:
        s = None
    if not s:
      log.debug("about to add %s" % show)
      thumbnail = None
      description = ''
      ur = '-'
      try:
        imdbid = IMDb.guess(show)
        if imdbid:
          imdb, description, thumbnail = get_imdbdata(imdbid)
          if imdb:
            ur = imdb['rating']
      except:
        import traceback
        traceback.print_exc()
        pass
      s = model.Shows(
        title = show,
        url = get_url(show),
        description = description,
        imdb = imdbid,
        imdbUserRating = ur
      )
      s.thumbnail = thumbnail
      meta = metacritic.scrapeMetacritic(s.title, s.metacriticUrl)
      if meta:
        s.metacriticUrl = meta['url']
        s.metacriticScore = "%s" % meta['score']
        for review in meta['critics']:
          model.addReview(s, review)
      model.hub.commit()
      log.debug('added %s' % show)
    for episode in shows[show]:
      episode_title = get_title(shows[show][episode]['description'])
      episode_description = ''
      episode_imdb = ''
      q = model.Episodes.select(AND(
        model.Episodes.q.showID == s.id,
        model.Episodes.q.episode == episode))
      if q.count() == 0:
        if not imdb:
          try:
            imdbid = IMDb.guess(show)
            if imdbid:
              imdb = IMDb.parse(imdbid)
          except:
            pass
        if imdb and imdb['episodes'].has_key(episode):
          episode_title = imdb['episodes'][episode]['title']
          episode_description = imdb['episodes'][episode]['description']
          episode_imdb = imdb['episodes'][episode]['imdb']
        if not episode_description or not episode_title:
          tvcom_data = tvcom.get(show, episode)
          if not episode_description:
            episode_description = tvcom_data['description']
          if not episode_title:
            episode_title = tvcom_data['title']
        e = model.Episodes(
          showID = s.id,
          title = episode_title,
          episode = episode,
          torrent = shows[show][episode]['enclosures'][0]['href'],
          description = episode_description,
          imdb = episode_imdb,
          thumbnail = None,
          pubDate = datetime.datetime.fromtimestamp(time.mktime(shows[show][episode]['updated_parsed']))
        )
        s.lastUpdate = datetime.datetime.now()
        model.hub.commit()
        log.debug("from tvrss add %s %s" % (episode, show))
  log.debug("updating tvrss done.")

if __name__ == '__main__':
  # first look on the command line for a desired config file,
  # if it's not on the command line, then
  # look for setup.py in this directory. If it's not there, this script is
  # probably installed
  if len(sys.argv) > 1:
    turbogears.update_config(configfile=sys.argv[1],
        modulename="btvcr.config")
  elif exists(join(dirname(__file__), "setup.py")):
    turbogears.update_config(configfile="dev.cfg",
        modulename="btvcr.config")
  else:
    turbogears.update_config(configfile="prod.cfg",
        modulename="btvcr.config")
  from btvcr.controllers import Root
  load()


@@ -126,7 +126,8 @@ def html_entity_decode(s, encoding = 'utf-8'):
def stripTags(s):
    if s:
        return djangohtml.strip_tags(htmldecode(s)).strip()
        s = htmldecode(s)
        return djangohtml.strip_tags(s).strip()
    return u''
strip_tags=stripTags
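
A quick sketch of stripTags behavior, which this change does not alter, assuming djangohtml.strip_tags works like Django's strip_tags (the sample markup is invented):

stripTags(u'<b>Tom &amp; Jerry</b>')   # u'Tom & Jerry' -- entities decoded, then tags stripped
stripTags(None)                        # u''
stripTags('')                          # u''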
@@ -140,7 +141,7 @@ charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
def htmldecode(text):
    """Decode HTML entities in the given text."""
    if type(text) != unicode:
        text = unicode(text)
        text = unicode(text)[:]
    if type(text) is unicode:
        uchr = unichr
    else: