j 2007-11-29 21:25:19 +00:00
parent fccc9006d4
commit 73ec7e7aeb
3 changed files with 63 additions and 227 deletions

scrapeit/metacritic.py Normal file

@@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
from utils import read_url, read_url_utf8
import re
from urllib import quote

def getMetacriticShowUrl(title):
    title = quote(title)
    search_url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
    search = read_url(search_url)
    urls = re.compile('(http://www.metacritic.com/tv/shows/.*?)\?').findall(search)
    if urls:
        return urls[0]
    else:
        return ''

def scrapeMetacritic(title, url=None):
    if not url:
        url = getMetacriticShowUrl(title)
    if not url:
        return None
    data = read_url(url)
    score = re.compile('''ALT="Metascore: (.*?)"''').findall(data)
    if score:
        score = int(score[0])
    else:
        score = -1
    reviews = re.compile('(<DIV CLASS="scoreandreview">.*)').findall(data.replace('\n',''))[0]
    reg = '''<DIV CLASS="scoreandreview"><DIV CLASS="criticscore">(.*?)</DIV>.*?
<SPAN CLASS="publication">(.*?)
<DIV CLASS="quote">(.*?)
</DIV>.*?</DIV>.*?</DIV>
'''.replace('\n','')
    reviews = re.compile(reg).findall(reviews)
    metacritics = []
    for r in reviews:
        critic = {}
        critic['score'] = int(r[0].strip())
        publication = r[1].split('</SPAN>')
        criticname = ''
        if len(publication) > 1:
            criticname = publication[1].replace('<SPAN CLASS="criticname">','').strip()
        publication = publication[0]
        critic['publication'] = publication
        critic['critic'] = criticname
        quote = r[2].split('<BR>')
        link = ''
        if len(quote) > 1:
            link = re.compile('<A HREF="(.*?)" TARGET="_blank"><IMG SRC="/_images/readreview.gif"').findall(quote[1])[0]
        quote = quote[0].strip()
        critic['quote'] = quote
        critic['link'] = link
        metacritics.append(critic)
    return dict(score = score, critics = metacritics, url = url)
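
A minimal usage sketch for the new module (not part of the commit): it assumes scrapeit is importable as a package and uses a placeholder show title; scrapeMetacritic returns None when no Metacritic page can be found.

from scrapeit import metacritic

meta = metacritic.scrapeMetacritic('Some Show')   # 'Some Show' is only a placeholder
if meta:
    print meta['url'], meta['score']
    for critic in meta['critics']:
        print critic['publication'], critic['score']
        print critic['quote'], critic['link']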


@@ -2,79 +2,13 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
from os.path import *
import sys
import datetime
import time
import re
from urllib2 import urlopen
from urllib import quote
import Image
import StringIO
import feedparser
from utils import read_url
hr_hdtv = re.compile('HR HDTV')
hdtv = re.compile('HDTV')
def get_url(title):
  return title.replace(' ','_').replace('/', '_').lower()

def get_show(string):
  return string.split(';')[0].split(':')[1].strip()

def get_title(string):
  title = string.split(';')[1].split(':')[1].strip()
  if title != 'n/a':
    return title
  return ''

def get_season(string):
  try:
    season = int(string.split(';')[2].split(':')[1].strip())
  except:
    return None
  return season

def get_episode(string):
  try:
    episode = int(string.split(';')[3].split(':')[1].strip())
  except:
    return None
  return episode

def get_episodedate(string):
  s = string.split('Episode Date:')
  if len(s) == 2:
    return s[1].strip()
  return None

def get_episode_string(string):
  episode = get_episode(string)
  season = get_season(string)
  episodedate = get_episodedate(string)
  estring = None
  if season and episode:
    estring = "S%02dE%02d" % (season, episode)
  elif episodedate:
    estring = episodedate
  return estring
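
The helpers above all slice the tvrss summary field on ';' and ':'. The sample string below is made up to illustrate the expected layout (the field names are inferred from the parsing code, not taken from a real feed item):

summary = "Show Name: Some Show; Episode Title: n/a; Season: 3; Episode: 7"
get_show(summary)            # 'Some Show'
get_title(summary)           # '' (titles given as 'n/a' are dropped)
get_season(summary)          # 3
get_episode(summary)         # 7
get_episode_string(summary)  # 'S03E07'
get_episode_string("Show Name: Some Show; Episode Date: 2007-11-28")  # '2007-11-28' (date fallback when season/episode are missing)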
def choose_item(old, new):
  if old['link'] == new['link']:
    return False
  if not hdtv.search(old['title']):
    if hdtv.search(new['title']):
      display_item(new)
      log.debug("vs.")
      display_item(old)
      return True
  return False

def get_episodes(show_title):
  search_url = "http://tvrss.net/search/index.php?distribution_group=combined&show_name=%s&show_name_exact=true&filename=&date=&quality=&release_group=&mode=rss" % quote(show_title)
  data = read_url(search_url)
@@ -84,162 +18,3 @@ def get_episodes(show_title):
    episode = get_episode_string(t['summary'])
    episodes[episode] = t['enclosures'][0]['href']
  return episodes
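
Judging from the visible tail of get_episodes, the function collects one torrent enclosure per episode string; a hypothetical result would look like this (the URLs are invented):

get_episodes('Some Show')
# {'S03E07': 'http://example.com/some.show.s03e07.torrent',
#  'S03E08': 'http://example.com/some.show.s03e08.torrent'}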
def get_thumbnail(url):
  try:
    thumbnail = read_url(url)
    im = Image.open(StringIO.StringIO(thumbnail))
    out = StringIO.StringIO()
    width = 100
    height = int((100.0 / im.size[0]) * im.size[1])
    im = im.resize((width, height))
    im.crop((0,0,100,100)).convert().save(out, 'JPEG')
    thumbnail = out.getvalue()
  except:
    thumbnail = None
  return thumbnail

def get_imdbdata(imdbid):
  thumbnail = None
  description = ''
  i = imdb.IMDb(imdbid).parse()
  if i:
    poster = i['poster']
    if poster != 'http://i.imdb.com/Heads/npa.gif':
      log.debug("getting poster %s" % poster)
      thumbnail = get_thumbnail(poster)
    if i['plot']:
      description = i['plot']
    elif i['plot_outline']:
      description = i['plot_outline']
    else:
      description = i['tagline']
    return (i, description, thumbnail)
  else:
    return (i, '', None)

def load():
  log.debug("getting new shows from tvrss...")
  feed = feedparser.parse('http://tvrss.net/feed/combined/')
  shows = {}
  for item in feed['entries']:
    show = get_show(item['description'])
    estring = get_episode_string(item['description'])
    if estring:
      if show and not hr_hdtv.search(item['title']):
        if shows.has_key(show):
          if shows[show].has_key(estring):
            if choose_item(shows[show][estring], item):
              shows[show][estring] = item
          else:
            shows[show][estring] = item
        else:
          shows[show] = {}
          shows[show][estring] = item
  for show in shows:
    imdb = None
    try:
      model.ShowsBlacklist.byShowUrl(get_url(show))
      log.debug("ignoring blacklisted show %s" % show)
      continue
    except:
      pass
    s = None
    try:
      s = model.Shows.byUrl(get_url(show))
    except SQLObjectNotFound:
      try:
        alias = model.ShowsAlias.byAlias(get_url(show))
        s = alias.show
      except SQLObjectNotFound:
        s = None
    if not s:
      log.debug("about to add %s" % show)
      thumbnail = None
      description = ''
      ur = '-'
      try:
        imdbid = IMDb.guess(show)
        if imdbid:
          imdb, description, thumbnail = get_imdbdata(imdbid)
          if imdb:
            ur = imdb['rating']
      except:
        import traceback
        traceback.print_exc()
        pass
      s = model.Shows(
        title = show,
        url = get_url(show),
        description = description,
        imdb = imdbid,
        imdbUserRating = ur
      )
      s.thumbnail = thumbnail
      meta = metacritic.scrapeMetacritic(s.title, s.metacriticUrl)
      if meta:
        s.metacriticUrl = meta['url']
        s.metacriticScore = "%s" % meta['score']
        for review in meta['critics']:
          model.addReview(s, review)
      model.hub.commit()
      log.debug('added %s' % show)
    for episode in shows[show]:
      episode_title = get_title(shows[show][episode]['description'])
      episode_description = ''
      episode_imdb = ''
      q = model.Episodes.select(AND(
        model.Episodes.q.showID == s.id,
        model.Episodes.q.episode == episode))
      if q.count() == 0:
        if not imdb:
          try:
            imdbid = IMDb.guess(show)
            if imdbid:
              imdb = IMDb.parse(imdbid)
          except:
            pass
        if imdb and imdb['episodes'].has_key(episode):
          episode_title = imdb['episodes'][episode]['title']
          episode_description = imdb['episodes'][episode]['description']
          episode_imdb = imdb['episodes'][episode]['imdb']
        if not episode_description or not episode_title:
          tvcom_data = tvcom.get(show, episode)
          if not episode_description:
            episode_description = tvcom_data['description']
          if not episode_title:
            episode_title = tvcom_data['title']
        e = model.Episodes(
          showID = s.id,
          title = episode_title,
          episode = episode,
          torrent = shows[show][episode]['enclosures'][0]['href'],
          description = episode_description,
          imdb = episode_imdb,
          thumbnail = None,
          pubDate = datetime.datetime.fromtimestamp(time.mktime(shows[show][episode]['updated_parsed']))
        )
        s.lastUpdate = datetime.datetime.now()
        model.hub.commit()
        log.debug("from tvrss add %s %s" % (episode, show))
  log.debug("updating tvrss done.")

if __name__ == '__main__':
  # first look on the command line for a desired config file,
  # if it's not on the command line, then
  # look for setup.py in this directory. If it's not there, this script is
  # probably installed
  if len(sys.argv) > 1:
    turbogears.update_config(configfile=sys.argv[1],
        modulename="btvcr.config")
  elif exists(join(dirname(__file__), "setup.py")):
    turbogears.update_config(configfile="dev.cfg",
        modulename="btvcr.config")
  else:
    turbogears.update_config(configfile="prod.cfg",
        modulename="btvcr.config")
  from btvcr.controllers import Root
  load()


@@ -126,7 +126,8 @@ def html_entity_decode(s, encoding = 'utf-8'):
def stripTags(s):
    if s:
        return djangohtml.strip_tags(htmldecode(s)).strip()
        s = htmldecode(s)
        return djangohtml.strip_tags(s).strip()
    return u''
strip_tags=stripTags
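
A quick sketch of stripTags behavior, which this change does not alter, assuming djangohtml.strip_tags works like Django's strip_tags (the sample markup is invented):

stripTags(u'<b>Tom &amp; Jerry</b>')   # u'Tom & Jerry' -- entities decoded, then tags stripped
stripTags(None)                        # u''
stripTags('')                          # u''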
@@ -140,7 +141,7 @@ charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
def htmldecode(text):
    """Decode HTML entities in the given text."""
    if type(text) != unicode:
        text = unicode(text)
        text = unicode(text)[:]
    if type(text) is unicode:
        uchr = unichr
    else: