This commit is contained in:
j 2007-11-29 21:25:19 +00:00
parent fccc9006d4
commit 73ec7e7aeb
3 changed files with 63 additions and 227 deletions

60
scrapeit/metacritic.py Normal file
View file

@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
from utils import read_url, read_url_utf8
import re
from urllib import quote
def getMetacriticShowUrl(title):
  """Find the Metacritic show page URL for a TV show title.

  The URL-quoted title is sent to Metacritic's search (restricted to
  tvshow_title) and the first show link in the result page is returned
  without its query string. Returns '' when the page has no show link.
  """
  query = quote(title)
  search_url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % query
  page = read_url(search_url)
  # Only the first hit matters, so stop scanning at the first match.
  first_hit = re.search(r'(http://www.metacritic.com/tv/shows/.*?)\?', page)
  if first_hit is None:
    return ''
  return first_hit.group(1)
def scrapeMetacritic(title, url=None):
  """Scrape the Metascore and the critic reviews of a TV show.

  title -- show title; only used to search for the page when url is empty
  url   -- Metacritic show page URL; looked up with getMetacriticShowUrl()
           when not given

  Returns None when no page URL can be determined. Otherwise returns a
  dict with:
    score   -- Metascore as int, -1 when the page has no readable score
    critics -- list of dicts with the keys 'score', 'publication',
               'critic', 'quote' and 'link' (empty when the page has no
               review section)
    url     -- the URL that was scraped

  Raises ValueError when an individual critic score is not numeric.
  """
  if not url:
    url = getMetacriticShowUrl(title)
  if not url:
    return None
  data = read_url(url)
  return dict(score = _parseMetascore(data),
              critics = _parseCriticReviews(data),
              url = url)


def _parseMetascore(data):
  """Return the Metascore found in the page as int, or -1 if unreadable."""
  found = re.search('ALT="Metascore: (.*?)"', data)
  if not found:
    return -1
  try:
    return int(found.group(1))
  except ValueError:
    # Non-numeric placeholder in the ALT text: report it like a missing score.
    return -1


def _parseCriticReviews(data):
  """Return one dict per critic review block found in the page markup."""
  review_re = re.compile(
    '<DIV CLASS="scoreandreview"><DIV CLASS="criticscore">(.*?)</DIV>.*?'
    '<SPAN CLASS="publication">(.*?)'
    '<DIV CLASS="quote">(.*?)'
    '</DIV>.*?</DIV>.*?</DIV>')
  link_re = re.compile(
    '<A HREF="(.*?)" TARGET="_blank"><IMG SRC="/_images/readreview.gif"')

  critics = []
  # Newlines are dropped so that '.' in the pattern can span the original
  # line breaks. Every match starts at a "scoreandreview" DIV, so a page
  # without a review section simply yields no matches.
  for score, publication, text in review_re.findall(data.replace('\n', '')):
    # "publication</SPAN><SPAN CLASS="criticname">name" -> two parts
    byline = publication.split('</SPAN>')
    criticname = ''
    if len(byline) > 1:
      criticname = byline[1].replace('<SPAN CLASS="criticname">', '').strip()

    # The quote text comes first; the optional "read review" link follows
    # the first <BR>.
    parts = text.split('<BR>')
    link = ''
    if len(parts) > 1:
      found = link_re.search(parts[1])
      if found:
        link = found.group(1)

    critics.append({
      'score': int(score.strip()),
      'publication': byline[0],
      'critic': criticname,
      'quote': parts[0].strip(),
      'link': link,
    })
  return critics

View file

@ -2,79 +2,13 @@
# -*- Mode: Python; -*- # -*- Mode: Python; -*-
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2 # vi:si:et:sw=2:sts=2:ts=2
from os.path import *
import sys
import datetime
import time
import re
from urllib2 import urlopen
from urllib import quote from urllib import quote
import Image
import StringIO
import feedparser import feedparser
from utils import read_url from utils import read_url
hr_hdtv = re.compile('HR HDTV')
hdtv = re.compile('HDTV')
def get_url(title):
  """Build the URL form of a show title.

  Spaces and slashes become underscores and the result is lower-cased.
  """
  slug = title
  for separator in (' ', '/'):
    slug = slug.replace(separator, '_')
  return slug.lower()
def get_show(string):
  """Return the show name from a feed item description.

  The description is a ';'-separated list of "Label: value" fields; the
  show name is the value of the first field. Returns '' when the first
  field has no "label: value" form.
  """
  first_field = string.split(';')[0]
  # Split on the first colon only, so names that contain a colon themselves
  # ("CSI: Miami") are kept whole.
  parts = first_field.split(':', 1)
  if len(parts) < 2:
    return ''
  return parts[1].strip()
def get_title(string):
  """Return the episode title from a feed item description.

  The title is the value of the second ';'-separated "Label: value" field.
  Returns '' when the field is missing, malformed, or holds the feed's
  'n/a' placeholder.
  """
  fields = string.split(';')
  if len(fields) < 2:
    return ''
  # Split on the first colon only, so titles containing a colon stay intact.
  parts = fields[1].split(':', 1)
  if len(parts) < 2:
    return ''
  title = parts[1].strip()
  if title == 'n/a':
    return ''
  return title
def get_season(string):
  """Return the season number from a feed item description.

  The season is the integer value of the third ';'-separated
  "Label: value" field. Returns None when that field is missing or its
  value is not an integer.
  """
  try:
    return int(string.split(';')[2].split(':')[1].strip())
  except (IndexError, ValueError):
    # IndexError: field or colon missing; ValueError: value is not a number.
    return None
def get_episode(string):
  """Return the episode number from a feed item description.

  The episode is the integer value of the fourth ';'-separated
  "Label: value" field. Returns None when that field is missing or its
  value is not an integer.
  """
  try:
    return int(string.split(';')[3].split(':')[1].strip())
  except (IndexError, ValueError):
    # IndexError: field or colon missing; ValueError: value is not a number.
    return None
def get_episodedate(string):
  """Return the text following 'Episode Date:' in a description.

  Returns None unless the marker occurs exactly once.
  """
  pieces = string.split('Episode Date:')
  if len(pieces) != 2:
    return None
  return pieces[1].strip()
def get_episode_string(string):
  """Return the identifier used for an episode, or None.

  "SxxEyy" is used when both season and episode number are set (non-zero);
  otherwise the episode date is used when there is one.
  """
  episode = get_episode(string)
  season = get_season(string)
  episodedate = get_episodedate(string)
  if season and episode:
    return "S%02dE%02d" % (season, episode)
  if episodedate:
    return episodedate
  return None
def choose_item(old, new):
  """Tell whether feed item `new` should replace `old` for an episode.

  Returns True only when the two items have different links, the old title
  does not match `hdtv` and the new title does. Both items are passed to
  display_item() in that case.
  """
  if old['link'] == new['link']:
    return False
  if hdtv.search(old['title']):
    return False
  if not hdtv.search(new['title']):
    return False
  display_item(new)
  log.debug("vs.")
  display_item(old)
  return True
def get_episodes(show_title): def get_episodes(show_title):
search_url = "http://tvrss.net/search/index.php?distribution_group=combined&show_name=%s&show_name_exact=true&filename=&date=&quality=&release_group=&mode=rss" % quote(show_title) search_url = "http://tvrss.net/search/index.php?distribution_group=combined&show_name=%s&show_name_exact=true&filename=&date=&quality=&release_group=&mode=rss" % quote(show_title)
data = read_url(search_url) data = read_url(search_url)
@ -84,162 +18,3 @@ def get_episodes(show_title):
episode = get_episode_string(t['summary']) episode = get_episode_string(t['summary'])
episodes[episode] = t['enclosures'][0]['href'] episodes[episode] = t['enclosures'][0]['href']
return episodes return episodes
def get_thumbnail(url):
  """Download an image and return it as JPEG thumbnail data.

  The image is scaled to a width of 100 pixels (height scaled by the same
  factor), cropped to the box (0, 0, 100, 100) and encoded as JPEG.
  Returns the encoded data, or None when downloading or converting fails.
  """
  try:
    raw = read_url(url)
    im = Image.open(StringIO.StringIO(raw))
    out = StringIO.StringIO()
    width = 100
    height = int((100.0 / im.size[0]) * im.size[1])
    im = im.resize((width, height))
    im.crop((0,0,100,100)).convert().save(out, 'JPEG')
    thumbnail = out.getvalue()
  except Exception:
    # Best effort: any download/decode problem just means "no thumbnail",
    # but KeyboardInterrupt and SystemExit are no longer swallowed.
    thumbnail = None
  return thumbnail
def get_imdbdata(imdbid):
  """Fetch IMDb data for an id.

  Returns a tuple (data, description, thumbnail):
    data        -- result of imdb.IMDb(imdbid).parse()
    description -- plot, else plot outline, else tagline ('' without data)
    thumbnail   -- result of get_thumbnail() for the poster, or None when
                   there is no data or the poster is IMDb's npa.gif image
  """
  data = imdb.IMDb(imdbid).parse()
  if not data:
    return (data, '', None)

  thumbnail = None
  poster = data['poster']
  if poster != 'http://i.imdb.com/Heads/npa.gif':
    log.debug("getting poster %s" % poster)
    thumbnail = get_thumbnail(poster)

  # First non-empty text wins; the tagline is the last resort.
  description = data['plot'] or data['plot_outline'] or data['tagline']
  return (data, description, thumbnail)
def load():
  """Import new shows and episodes from the tvrss.net combined feed.

  Feed entries are grouped by show name and episode string. Entries without
  an episode string or show name, and entries whose title matches hr_hdtv,
  are ignored; choose_item() decides between two entries for the same
  episode. Shows found in model.ShowsBlacklist are skipped. A show that is
  not known by URL or alias is created, using IMDb and Metacritic data where
  available. Every episode that is not stored yet for its show is added;
  missing titles/descriptions are filled in from IMDb and tvcom.
  """
  log.debug("getting new shows from tvrss...")
  feed = feedparser.parse('http://tvrss.net/feed/combined/')

  # shows: {show name: {episode string: feed item}}
  shows = {}
  for item in feed['entries']:
    show = get_show(item['description'])
    estring = get_episode_string(item['description'])
    if not estring:
      continue
    if not show or hr_hdtv.search(item['title']):
      continue
    candidates = shows.setdefault(show, {})
    if estring not in candidates or choose_item(candidates[estring], item):
      candidates[estring] = item

  for show in shows:
    imdb = None
    try:
      model.ShowsBlacklist.byShowUrl(get_url(show))
      log.debug("ignoring blacklisted show %s" % show)
      continue
    except:
      # NOTE(review): presumably meant to catch SQLObjectNotFound (show is
      # not blacklisted); left broad to keep the best-effort behaviour.
      pass

    s = None
    try:
      s = model.Shows.byUrl(get_url(show))
    except SQLObjectNotFound:
      try:
        alias = model.ShowsAlias.byAlias(get_url(show))
        s = alias.show
      except SQLObjectNotFound:
        s = None

    if not s:
      log.debug("about to add %s" % show)
      thumbnail = None
      description=''
      ur = '-'
      # Bound up front: if IMDb.guess() raises, the show must not be created
      # with an unbound name or with the id left over from the previous show.
      imdbid = ''
      try:
        imdbid = IMDb.guess(show)
        if imdbid:
          imdb, description, thumbnail = get_imdbdata(imdbid)
          if imdb:
            ur = imdb['rating']
      except:
        # IMDb data is optional: report the problem and add the show anyway.
        import traceback
        traceback.print_exc()
      s= model.Shows(
        title = show,
        url = get_url(show),
        description = description,
        imdb = imdbid,
        imdbUserRating = ur
      )
      s.thumbnail = thumbnail
      meta = metacritic.scrapeMetacritic(s.title, s.metacriticUrl)
      if meta:
        s.metacriticUrl = meta['url']
        s.metacriticScore = "%s" % meta['score']
        for review in meta['critics']:
          model.addReview(s, review)
      model.hub.commit()
      log.debug('added %s' % show)

    for episode in shows[show]:
      item = shows[show][episode]
      episode_title = get_title(item['description'])
      episode_description = ''
      episode_imdb = ''
      q = model.Episodes.select(AND(
        model.Episodes.q.showID == s.id,
        model.Episodes.q.episode == episode))
      if q.count() == 0:
        # IMDb data is fetched at most once per show and reused for all of
        # its episodes.
        if not imdb:
          try:
            imdbid = IMDb.guess(show)
            if imdbid:
              imdb = IMDb.parse(imdbid)
          except:
            pass
        if imdb and imdb['episodes'].has_key(episode):
          episode_title = imdb['episodes'][episode]['title']
          episode_description = imdb['episodes'][episode]['description']
          episode_imdb = imdb['episodes'][episode]['imdb']
        if not episode_description or not episode_title:
          tvcom_data = tvcom.get(show, episode)
          if not episode_description:
            episode_description = tvcom_data['description']
          if not episode_title:
            episode_title = tvcom_data['title']
        model.Episodes(
          showID = s.id,
          title = episode_title,
          episode = episode,
          torrent = item['enclosures'][0]['href'],
          description = episode_description,
          imdb = episode_imdb,
          thumbnail = None,
          pubDate = datetime.datetime.fromtimestamp(time.mktime(item['updated_parsed']))
        )
        s.lastUpdate = datetime.datetime.now()
        model.hub.commit()
        log.debug("from tvrss add %s %s" %(episode, show))
  log.debug("updating tvrss done.")
if __name__ == '__main__':
  # Pick the config file: one named on the command line wins; otherwise a
  # setup.py next to this script selects dev.cfg; without it the script is
  # probably installed, so prod.cfg is used.
  if len(sys.argv) > 1:
    configfile = sys.argv[1]
  elif exists(join(dirname(__file__), "setup.py")):
    configfile = "dev.cfg"
  else:
    configfile = "prod.cfg"
  turbogears.update_config(configfile=configfile,
    modulename="btvcr.config")

  from btvcr.controllers import Root
  load()

View file

@ -126,7 +126,8 @@ def html_entity_decode(s, encoding = 'utf-8'):
def stripTags(s): def stripTags(s):
if s: if s:
return djangohtml.strip_tags(htmldecode(s)).strip() s = htmldecode(s)
return djangohtml.strip_tags(s).strip()
return u'' return u''
strip_tags=stripTags strip_tags=stripTags
@ -140,7 +141,7 @@ charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
def htmldecode(text): def htmldecode(text):
"""Decode HTML entities in the given text.""" """Decode HTML entities in the given text."""
if type(text) != unicode: if type(text) != unicode:
text = unicode(text) text = unicode(text)[:]
if type(text) is unicode: if type(text) is unicode:
uchr = unichr uchr = unichr
else: else: