From 73ec7e7aebd502284c9d648f0136a28cc7aecd80 Mon Sep 17 00:00:00 2001
From: j <0x006A@0x2620.org>
Date: Thu, 29 Nov 2007 21:25:19 +0000
Subject: [PATCH] cleanup
---
scrapeit/metacritic.py | 60 +++++++++++
scrapeit/tvrss.py | 225 -----------------------------------------
scrapeit/utils.py | 5 +-
3 files changed, 63 insertions(+), 227 deletions(-)
create mode 100644 scrapeit/metacritic.py
diff --git a/scrapeit/metacritic.py b/scrapeit/metacritic.py
new file mode 100644
index 0000000..4a8f69f
--- /dev/null
+++ b/scrapeit/metacritic.py
@@ -0,0 +1,60 @@
+# -*- coding: utf-8 -*-
+
+from utils import read_url, read_url_utf8
+import re
+from urllib import quote
+
+def getMetacriticShowUrl(title):
+    """Return the first Metacritic TV show URL found for title, or ''."""
+    search_url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % quote(title)
+    search = read_url(search_url)
+    # Raw string with escaped dots: match the literal host, stop at the '?'.
+    urls = re.compile(r'(http://www\.metacritic\.com/tv/shows/.*?)\?').findall(search)
+    if urls:
+        return urls[0]
+    return ''
+
+def scrapeMetacritic(title, url=None):
+ if not url:
+ url = getMetacriticShowUrl(title)
+ if not url:
+ return None
+ data = read_url(url)
+ score = re.compile('''ALT="Metascore: (.*?)"''').findall(data)
+ if score:
+ score = int(score[0])
+ else:
+ score = -1
+
+ reviews = re.compile('(
.*)').findall(data.replace('\n',''))[0]
+
+ reg = '''
(.*?)
.*?
+
(.*?)
+(.*?)
+
.*?.*?
+'''.replace('\n','')
+ reviews = re.compile(reg).findall(reviews)
+
+ metacritics = []
+ for r in reviews:
+ critic ={}
+ critic['score'] = int(r[0].strip())
+ publication = r[1].split('')
+ criticname = ''
+ if len(publication) > 1:
+ criticname = publication[1].replace('','').strip()
+ publication = publication[0]
+ critic['publication'] = publication
+ critic['critic'] = criticname
+ quote = r[2].split('
')
+ link = ''
+ if len(quote) > 1:
+ link = re.compile(' 1:
- turbogears.update_config(configfile=sys.argv[1],
- modulename="btvcr.config")
- elif exists(join(dirname(__file__), "setup.py")):
- turbogears.update_config(configfile="dev.cfg",
- modulename="btvcr.config")
- else:
- turbogears.update_config(configfile="prod.cfg",
- modulename="btvcr.config")
-
- from btvcr.controllers import Root
- load()
diff --git a/scrapeit/utils.py b/scrapeit/utils.py
index 3e53e22..732085c 100644
--- a/scrapeit/utils.py
+++ b/scrapeit/utils.py
@@ -126,7 +126,8 @@ def html_entity_decode(s, encoding = 'utf-8'):
def stripTags(s):
if s:
- return djangohtml.strip_tags(htmldecode(s)).strip()
+ s = htmldecode(s)
+ return djangohtml.strip_tags(s).strip()
return u''
strip_tags=stripTags
@@ -140,7 +141,7 @@ charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
def htmldecode(text):
"""Decode HTML entities in the given text."""
if type(text) != unicode:
- text = unicode(text)
+ text = unicode(text)[:]
if type(text) is unicode:
uchr = unichr
else: