From 73ec7e7aebd502284c9d648f0136a28cc7aecd80 Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Thu, 29 Nov 2007 21:25:19 +0000 Subject: [PATCH] cleanup --- scrapeit/metacritic.py | 60 +++++++++++ scrapeit/tvrss.py | 225 ----------------------------------------- scrapeit/utils.py | 5 +- 3 files changed, 63 insertions(+), 227 deletions(-) create mode 100644 scrapeit/metacritic.py diff --git a/scrapeit/metacritic.py b/scrapeit/metacritic.py new file mode 100644 index 0000000..4a8f69f --- /dev/null +++ b/scrapeit/metacritic.py @@ -0,0 +1,60 @@ +# -*- coding: utf-8 -*- + +from utils import read_url, read_url_utf8 +import re +from urllib import quote + +def getMetacriticShowUrl(title): + title = quote(title) + search_url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title + search = read_url(search_url) + urls = re.compile('(http://www.metacritic.com/tv/shows/.*?)\?').findall(search) + if urls: + return urls[0] + else: + return '' + +def scrapeMetacritic(title, url=None): + if not url: + url = getMetacriticShowUrl(title) + if not url: + return None + data = read_url(url) + score = re.compile('''ALT="Metascore: (.*?)"''').findall(data) + if score: + score = int(score[0]) + else: + score = -1 + + reviews = re.compile('(
.*)').findall(data.replace('\n',''))[0] + + reg = '''
(.*?)
.*? +(.*?) +
(.*?) +
.*?
.*?
+'''.replace('\n','') + reviews = re.compile(reg).findall(reviews) + + metacritics = [] + for r in reviews: + critic ={} + critic['score'] = int(r[0].strip()) + publication = r[1].split('') + criticname = '' + if len(publication) > 1: + criticname = publication[1].replace('','').strip() + publication = publication[0] + critic['publication'] = publication + critic['critic'] = criticname + quote = r[2].split('
') + link = '' + if len(quote) > 1: + link = re.compile(' 1: - turbogears.update_config(configfile=sys.argv[1], - modulename="btvcr.config") - elif exists(join(dirname(__file__), "setup.py")): - turbogears.update_config(configfile="dev.cfg", - modulename="btvcr.config") - else: - turbogears.update_config(configfile="prod.cfg", - modulename="btvcr.config") - - from btvcr.controllers import Root - load() diff --git a/scrapeit/utils.py b/scrapeit/utils.py index 3e53e22..732085c 100644 --- a/scrapeit/utils.py +++ b/scrapeit/utils.py @@ -126,7 +126,8 @@ def html_entity_decode(s, encoding = 'utf-8'): def stripTags(s): if s: - return djangohtml.strip_tags(htmldecode(s)).strip() + s = htmldecode(s) + return djangohtml.strip_tags(s).strip() return u'' strip_tags=stripTags @@ -140,7 +141,7 @@ charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?') def htmldecode(text): """Decode HTML entities in the given text.""" if type(text) != unicode: - text = unicode(text) + text = unicode(text)[:] if type(text) is unicode: uchr = unichr else: