scrapeit/scrapeit/tvrss.py
2007-03-02 20:44:43 +00:00

245 lines
6.9 KiB
Python
Executable file

#!/usr/bin/env python
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
from os.path import *
import sys
import datetime
import time
import re
from urllib2 import urlopen
from urllib import quote
import Image
import StringIO
import feedparser
from utils import read_url
# Matches release titles tagged "HR HDTV"; load() skips feed entries whose
# title matches this pattern.
hr_hdtv = re.compile('HR HDTV')
# Matches any title containing "HDTV" (which includes "HR HDTV" titles);
# choose_item() uses it to prefer an HDTV release over a non-HDTV one.
hdtv = re.compile('HDTV')
def get_url(title):
  """Return the URL slug for *title*.

  Spaces and forward slashes become underscores and the result is
  lower-cased.
  """
  slug = title
  for separator in (' ', '/'):
    slug = slug.replace(separator, '_')
  return slug.lower()
def get_show(string):
  """Return the show name from a feed item description.

  The description is treated as ';'-separated fields of the form
  ``label: value``; the show name is the value of the first field.
  The field is split on the first colon only, so show names that contain
  a colon themselves (e.g. "CSI: Miami") are returned in full.

  Raises IndexError if the first field contains no colon.
  """
  first_field = string.split(';')[0]
  return first_field.split(':', 1)[1].strip()
def get_title(string):
  """Return the episode title from a feed item description.

  The title is the value of the second ';'-separated ``label: value``
  field. The field is split on the first colon only, so titles that
  contain a colon are returned in full. The placeholder 'n/a' is mapped
  to an empty string.

  Raises IndexError if there is no second field or it contains no colon.
  """
  title = string.split(';')[1].split(':', 1)[1].strip()
  if title == 'n/a':
    return ''
  return title
def get_season(string):
  """Return the season number from a feed item description, or None.

  The season is the integer value of the third ';'-separated
  ``label: value`` field. None is returned when that field is missing,
  has no colon, or does not hold an integer.
  """
  try:
    return int(string.split(';')[2].split(':')[1].strip())
  except (AttributeError, IndexError, ValueError):
    # Only parse failures are swallowed; KeyboardInterrupt and SystemExit
    # propagate, unlike with a bare except.
    return None
def get_episode(string):
  """Return the episode number from a feed item description, or None.

  The episode is the integer value of the fourth ';'-separated
  ``label: value`` field. None is returned when that field is missing,
  has no colon, or does not hold an integer.
  """
  try:
    return int(string.split(';')[3].split(':')[1].strip())
  except (AttributeError, IndexError, ValueError):
    # Only parse failures are swallowed; KeyboardInterrupt and SystemExit
    # propagate, unlike with a bare except.
    return None
def get_episodedate(string):
  """Return the text following 'Episode Date:' in *string*, or None.

  A value is returned only when the marker occurs exactly once; the
  text after it is stripped of surrounding whitespace.
  """
  pieces = string.split('Episode Date:')
  if len(pieces) != 2:
    return None
  return pieces[1].strip()
def get_episode_string(string):
  """Return an identifier for the episode described by *string*.

  Yields "SxxEyy" when both a season and an episode number are found
  (and are non-zero), otherwise the episode date when one is present
  and non-empty, otherwise None.
  """
  episode_number = get_episode(string)
  season_number = get_season(string)
  air_date = get_episodedate(string)
  if season_number and episode_number:
    return "S%02dE%02d" % (season_number, episode_number)
  if air_date:
    return air_date
  return None
def choose_item(old, new):
  """Return True when *new* should replace *old* for the same episode.

  Items with the same link are never replaced. Otherwise *new* wins only
  when its title matches the ``hdtv`` pattern and the title of *old*
  does not; in that case both items are displayed before returning.
  """
  # NOTE(review): display_item and log are not defined in the visible part
  # of this file -- confirm they are provided elsewhere.
  if old['link'] == new['link']:
    return False
  is_upgrade = not hdtv.search(old['title']) and hdtv.search(new['title'])
  if not is_upgrade:
    return False
  display_item(new)
  log.debug("vs.")
  display_item(old)
  return True
def get_episodes(show_title):
  """Return ``{episode_string: torrent_url}`` for *show_title* from tvrss.

  The tvrss search feed for the exact show name is fetched once through
  read_url() and the downloaded document is handed to feedparser, instead
  of letting feedparser fetch the same URL a second time. Entries whose
  summary yields no episode string are skipped rather than being stored
  under a None key (load() ignores such entries as well).
  """
  search_url = "http://tvrss.net/search/index.php?distribution_group=combined&show_name=%s&show_name_exact=true&filename=&date=&quality=&release_group=&mode=rss" % quote(show_title)
  data = read_url(search_url)
  fd = feedparser.parse(data)
  episodes = {}
  for entry in fd.entries:
    episode = get_episode_string(entry['summary'])
    if episode is None:
      continue
    # The first enclosure's href is used as the download link.
    episodes[episode] = entry['enclosures'][0]['href']
  return episodes
def get_thumbnail(url):
  """Return JPEG data for a thumbnail of the image at *url*, or None.

  The image is scaled to a width of 100 pixels (height scaled by the same
  factor), cropped to the box (0, 0, 100, 100) and saved as JPEG.
  This is best-effort: any ordinary error while fetching or processing
  the image yields None. KeyboardInterrupt and SystemExit are no longer
  swallowed.
  """
  try:
    im = Image.open(StringIO.StringIO(read_url(url)))
    width = 100
    height = int((100.0 / im.size[0]) * im.size[1])
    out = StringIO.StringIO()
    im = im.resize((width, height))
    im.crop((0, 0, 100, 100)).convert().save(out, 'JPEG')
    return out.getvalue()
  except Exception:
    return None
def get_imdbdata(imdbid):
  """Return ``(data, description, thumbnail)`` for the given IMDb id.

  *data* is whatever ``imdb.IMDb(imdbid).parse()`` returns. When it is
  falsy the result is ``(data, '', None)``. Otherwise the description is
  the first non-empty of plot and plot outline, falling back to the
  tagline, and the thumbnail is built from the poster unless the poster
  is IMDb's "no picture available" placeholder.
  """
  info = imdb.IMDb(imdbid).parse()
  if not info:
    return (info, '', None)

  thumbnail = None
  poster = info['poster']
  if poster != 'http://i.imdb.com/Heads/npa.gif':
    log.debug("getting poster %s" % poster)
    thumbnail = get_thumbnail(poster)

  description = info['plot'] or info['plot_outline'] or info['tagline']
  return (info, description, thumbnail)
def _collect_feed_items(feed):
  """Group feed entries as ``{show: {episode_string: entry}}``.

  Entries without a show name or episode string, and entries whose title
  matches ``hr_hdtv``, are dropped. When two entries describe the same
  episode, choose_item() decides whether the later one replaces the
  earlier one.
  """
  shows = {}
  for item in feed['entries']:
    show = get_show(item['description'])
    estring = get_episode_string(item['description'])
    if not estring or not show or hr_hdtv.search(item['title']):
      continue
    episodes = shows.setdefault(show, {})
    if estring not in episodes or choose_item(episodes[estring], item):
      episodes[estring] = item
  return shows


def _is_blacklisted(show):
  """Return True when the show's URL slug is found in ShowsBlacklist."""
  try:
    model.ShowsBlacklist.byShowUrl(get_url(show))
  except SQLObjectNotFound:
    return False
  return True


def _find_show(show):
  """Return the stored show for *show*, looked up by URL then alias, or None."""
  url = get_url(show)
  try:
    return model.Shows.byUrl(url)
  except SQLObjectNotFound:
    pass
  try:
    return model.ShowsAlias.byAlias(url).show
  except SQLObjectNotFound:
    return None


def _create_show(show):
  """Create and commit a new show record; return ``(record, imdb_data)``.

  IMDb data is looked up on a best-effort basis: a failure is printed as
  a traceback and the show is still created with default values.
  """
  log.debug("about to add %s" % show)
  imdb_data = None
  # Bound up front so a failing IMDb.guess() cannot leave imdbid undefined
  # (or carrying a value from a previously processed show).
  imdbid = None
  thumbnail = None
  description = ''
  user_rating = '-'
  try:
    imdbid = IMDb.guess(show)
    if imdbid:
      imdb_data, description, thumbnail = get_imdbdata(imdbid)
      if imdb_data:
        user_rating = imdb_data['rating']
  except Exception:
    import traceback
    traceback.print_exc()
  s = model.Shows(
    title = show,
    url = get_url(show),
    description = description,
    imdb = imdbid,
    imdbUserRating = user_rating
  )
  s.thumbnail = thumbnail
  meta = metacritic.scrapeMetacritic(s.title, s.metacriticUrl)
  if meta:
    s.metacriticUrl = meta['url']
    s.metacriticScore = "%s" % meta['score']
    for review in meta['critics']:
      model.addReview(s, review)
  model.hub.commit()
  log.debug('added %s' % show)
  return s, imdb_data


def _add_new_episodes(s, show, items, imdb_data):
  """Store every episode in *items* that show record *s* does not have yet.

  *items* maps episode strings to feed entries. Title and description
  come from IMDb data when available, then from tvcom for whatever is
  still missing; the title parsed from the feed is the starting value.
  """
  for episode in items:
    item = items[episode]
    existing = model.Episodes.select(AND(
      model.Episodes.q.showID == s.id,
      model.Episodes.q.episode == episode))
    if existing.count() != 0:
      continue
    if not imdb_data:
      # Existing shows arrive without IMDb data; fetch it lazily and keep
      # it for the remaining episodes of this show.
      # NOTE(review): this uses IMDb.parse(imdbid) while get_imdbdata()
      # uses imdb.IMDb(imdbid).parse() -- confirm both APIs exist.
      try:
        imdbid = IMDb.guess(show)
        if imdbid:
          imdb_data = IMDb.parse(imdbid)
      except Exception:
        log.debug("imdb lookup failed for %s" % show)
    episode_title = get_title(item['description'])
    episode_description = ''
    episode_imdb = ''
    if imdb_data and episode in imdb_data['episodes']:
      imdb_episode = imdb_data['episodes'][episode]
      episode_title = imdb_episode['title']
      episode_description = imdb_episode['description']
      episode_imdb = imdb_episode['imdb']
    if not episode_description or not episode_title:
      tvcom_data = tvcom.get(show, episode)
      if not episode_description:
        episode_description = tvcom_data['description']
      if not episode_title:
        episode_title = tvcom_data['title']
    model.Episodes(
      showID = s.id,
      title = episode_title,
      episode = episode,
      torrent = item['enclosures'][0]['href'],
      description = episode_description,
      imdb = episode_imdb,
      thumbnail = None,
      pubDate = datetime.datetime.fromtimestamp(time.mktime(item['updated_parsed']))
    )
    s.lastUpdate = datetime.datetime.now()
    model.hub.commit()
    log.debug("from tvrss add %s %s" % (episode, show))


def load():
  """Import new shows and episodes from the tvrss combined feed.

  Blacklisted shows are skipped. Shows that are not stored yet (by URL or
  by alias) are created, then every episode from the feed that is not
  stored yet is added and committed.
  """
  log.debug("getting new shows from tvrss...")
  feed = feedparser.parse('http://tvrss.net/feed/combined/')
  shows = _collect_feed_items(feed)
  for show in shows:
    if _is_blacklisted(show):
      log.debug("ignoring blacklisted show %s" % show)
      continue
    imdb_data = None
    s = _find_show(show)
    if not s:
      s, imdb_data = _create_show(show)
    _add_new_episodes(s, show, shows[show], imdb_data)
  log.debug("updating tvrss done.")
if __name__ == '__main__':
  # Config file selection: an explicit path on the command line wins;
  # otherwise a setup.py next to this script selects dev.cfg; without
  # one the script is probably installed, so prod.cfg is used.
  # NOTE(review): turbogears is not imported in the visible part of this
  # file -- confirm it is brought into scope elsewhere.
  if len(sys.argv) > 1:
    config_file = sys.argv[1]
  elif exists(join(dirname(__file__), "setup.py")):
    config_file = "dev.cfg"
  else:
    config_file = "prod.cfg"
  turbogears.update_config(configfile=config_file,
    modulename="btvcr.config")
  from btvcr.controllers import Root
  load()