cleanup
This commit is contained in:
parent
fccc9006d4
commit
73ec7e7aeb
3 changed files with 63 additions and 227 deletions
60
scrapeit/metacritic.py
Normal file
60
scrapeit/metacritic.py
Normal file
|
@ -0,0 +1,60 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
from utils import read_url, read_url_utf8
|
||||
import re
|
||||
from urllib import quote
|
||||
|
||||
def getMetacriticShowUrl(title):
    """Search metacritic for a TV show title.

    Returns the first matching show URL (http://www.metacritic.com/tv/shows/...)
    or '' when the search returns no show links.
    """
    # ty=6 + tfs=tvshow_title restricts the metacritic search to TV show titles.
    search_url = (
        "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title"
        "&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e="
        % quote(title)
    )
    search = read_url(search_url)
    # FIX: escape the literal dots in the hostname so '.' cannot match
    # arbitrary characters; capture stops at the first '?' (query string).
    urls = re.findall(r'(http://www\.metacritic\.com/tv/shows/.*?)\?', search)
    if urls:
        return urls[0]
    return ''
|
||||
|
||||
def scrapeMetacritic(title, url=None):
    """Scrape the metacritic page of a TV show.

    Looks the show URL up via getMetacriticShowUrl() when *url* is not given.
    Returns None when no page could be found, otherwise a dict with:
      score   -- the integer metascore, or -1 when the page shows none
      critics -- list of dicts with keys score/publication/critic/quote/link
      url     -- the metacritic page URL that was scraped
    """
    if not url:
        url = getMetacriticShowUrl(title)
    if not url:
        return None
    data = read_url(url)

    score = re.findall('ALT="Metascore: (.*?)"', data)
    if score:
        score = int(score[0])
    else:
        score = -1

    # All reviews follow the first scoreandreview DIV; flatten newlines so
    # '.' in the patterns below can match across the original line breaks.
    # FIX: guard the [0] index so an unexpected page layout yields an empty
    # critics list instead of an IndexError.
    flat = data.replace('\n', '')
    blocks = re.findall('(<DIV CLASS="scoreandreview">.*)', flat)
    reviews_html = blocks[0] if blocks else ''

    reg = ('<DIV CLASS="scoreandreview"><DIV CLASS="criticscore">(.*?)</DIV>.*?'
           '<SPAN CLASS="publication">(.*?)'
           '<DIV CLASS="quote">(.*?)'
           '</DIV>.*?</DIV>.*?</DIV>')
    # Compiled once here instead of once per review iteration.
    link_re = re.compile('<A HREF="(.*?)" TARGET="_blank"><IMG SRC="/_images/readreview.gif"')

    metacritics = []
    for r in re.findall(reg, reviews_html):
        critic = {}
        critic['score'] = int(r[0].strip())

        # The publication span may contain a nested criticname span.
        publication = r[1].split('</SPAN>')
        criticname = ''
        if len(publication) > 1:
            criticname = publication[1].replace('<SPAN CLASS="criticname">', '').strip()
        critic['publication'] = publication[0]
        critic['critic'] = criticname

        # FIX: renamed from `quote` -- that shadowed urllib's quote() import.
        quote_parts = r[2].split('<BR>')
        link = ''
        if len(quote_parts) > 1:
            # The part after <BR> carries the "read review" link, when present.
            found = link_re.findall(quote_parts[1])
            if found:
                link = found[0]
        critic['quote'] = quote_parts[0].strip()
        critic['link'] = link
        metacritics.append(critic)

    return dict(score=score, critics=metacritics, url=url)
|
||||
|
||||
|
|
@ -2,79 +2,13 @@
|
|||
# -*- Mode: Python; -*-
|
||||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=2:sts=2:ts=2
|
||||
|
||||
from os.path import *
|
||||
import sys
|
||||
import datetime
|
||||
import time
|
||||
import re
|
||||
from urllib2 import urlopen
|
||||
from urllib import quote
|
||||
import Image
|
||||
import StringIO
|
||||
|
||||
import feedparser
|
||||
|
||||
from utils import read_url
|
||||
|
||||
|
||||
hr_hdtv = re.compile('HR HDTV')
|
||||
hdtv = re.compile('HDTV')
|
||||
|
||||
def get_url(title):
    """Build the canonical URL slug for a show title.

    Spaces and slashes become underscores and the result is lowercased.
    """
    slug = title.replace(' ', '_')
    slug = slug.replace('/', '_')
    return slug.lower()
|
||||
|
||||
def get_show(string):
    """Extract the show name from a tvrss summary.

    The summary looks like 'Show Name: X; Episode Title: Y; ...'; the show
    name is the value of the first ';'-separated field.
    """
    first_field = string.split(';')[0]
    return first_field.split(':')[1].strip()
|
||||
|
||||
def get_title(string):
    """Extract the episode title (second ';'-separated field) from a tvrss summary.

    Returns '' when tvrss reports the title as 'n/a'.
    """
    value = string.split(';')[1].split(':')[1].strip()
    return value if value != 'n/a' else ''
|
||||
|
||||
def get_season(string):
    """Parse the season number (third ';'-separated field) from a tvrss summary.

    Returns the season as an int, or None when the field is missing or not
    numeric (e.g. date-based shows).
    """
    try:
        return int(string.split(';')[2].split(':')[1].strip())
    except (IndexError, ValueError):
        # FIX: was a bare `except:` -- catch only the two failure modes the
        # parse can actually produce (missing field / non-numeric value).
        return None
|
||||
|
||||
def get_episode(string):
    """Parse the episode number (fourth ';'-separated field) from a tvrss summary.

    Returns the episode as an int, or None when the field is missing or not
    numeric.
    """
    try:
        return int(string.split(';')[3].split(':')[1].strip())
    except (IndexError, ValueError):
        # FIX: was a bare `except:` -- catch only the two failure modes the
        # parse can actually produce (missing field / non-numeric value).
        return None
|
||||
|
||||
def get_episodedate(string):
    """Return the text following 'Episode Date:' in a tvrss summary.

    Returns None when the marker is absent (or, as in the original, appears
    more than once).
    """
    parts = string.split('Episode Date:')
    return parts[1].strip() if len(parts) == 2 else None
|
||||
|
||||
def get_episode_string(string):
    """Build a canonical episode identifier from a tvrss summary.

    Returns 'SxxEyy' when both season and episode numbers are present,
    otherwise the episode date string when present, otherwise None.
    """
    season = get_season(string)
    episode = get_episode(string)
    if season is not None and episode is not None:
        # FIX: explicit None checks instead of truthiness -- a legitimate
        # zero (e.g. specials listed as episode 0) previously fell through
        # to the date/None branch.
        return "S%02dE%02d" % (season, episode)
    episodedate = get_episodedate(string)
    if episodedate:
        return episodedate
    return None
|
||||
|
||||
def choose_item(old, new):
    """Decide whether feed item *new* should replace *old* for the same episode.

    Prefers an HDTV release over a non-HDTV one; identical links never
    replace each other. Logs both items (via display_item) when replacing.
    """
    # Same torrent link: nothing to choose.
    if old['link'] == new['link']:
        return False
    # Only upgrade when the old title is NOT HDTV and the new one IS.
    if hdtv.search(old['title']):
        return False
    if not hdtv.search(new['title']):
        return False
    display_item(new)
    log.debug("vs.")
    display_item(old)
    return True
|
||||
|
||||
def get_episodes(show_title):
|
||||
search_url = "http://tvrss.net/search/index.php?distribution_group=combined&show_name=%s&show_name_exact=true&filename=&date=&quality=&release_group=&mode=rss" % quote(show_title)
|
||||
data = read_url(search_url)
|
||||
|
@ -84,162 +18,3 @@ def get_episodes(show_title):
|
|||
episode = get_episode_string(t['summary'])
|
||||
episodes[episode] = t['enclosures'][0]['href']
|
||||
return episodes
|
||||
|
||||
def get_thumbnail(url):
    """Download an image and return a 100x100 JPEG thumbnail as a byte string.

    Returns None when the download or image processing fails for any reason.
    """
    try:
        thumbnail = read_url(url)
        im = Image.open(StringIO.StringIO(thumbnail))
        out = StringIO.StringIO()
        width = 100
        # Scale height to keep the aspect ratio at 100px width.
        height = int((100.0 / im.size[0]) * im.size[1])
        im = im.resize((width, height))
        # Crop to a 100x100 square from the top of the resized image and
        # re-encode as JPEG into the in-memory buffer.
        im.crop((0,0,100,100)).convert().save(out, 'JPEG')
        thumbnail = out.getvalue()
    except:
        # NOTE(review): bare except deliberately treats any failure
        # (network, corrupt image, PIL error) as "no thumbnail".
        thumbnail = None
    return thumbnail
|
||||
|
||||
def get_imdbdata(imdbid):
    """Fetch imdb metadata for *imdbid*.

    Returns a tuple (imdb_dict, description, thumbnail):
      - imdb_dict: the parsed imdb record (falsy when parsing failed)
      - description: plot, plot outline, or tagline -- first non-empty wins
      - thumbnail: poster thumbnail bytes, or None when no usable poster
    """
    thumbnail = None
    description=''
    i = imdb.IMDb(imdbid).parse()
    if i:
        poster = i['poster']
        # npa.gif is imdb's "no poster available" placeholder -- skip it.
        if poster != 'http://i.imdb.com/Heads/npa.gif':
            log.debug("getting poster %s" % poster)
            thumbnail = get_thumbnail(poster)
        # Prefer the full plot, then the outline, then the tagline.
        if i['plot']:
            description=i['plot']
        elif i['plot_outline']:
            description=i['plot_outline']
        else:
            description=i['tagline']

        return (i, description, thumbnail)
    else:
        return(i, '', None)
|
||||
|
||||
|
||||
def load():
    """Poll the tvrss combined feed and add new shows/episodes to the database.

    Two passes: first bucket the feed entries per show/episode (keeping the
    preferred release via choose_item), then create any missing Shows and
    Episodes rows, enriching them from imdb, metacritic, and tv.com.

    NOTE(review): relies on module-level names provided elsewhere in this
    file/project (model, IMDb, metacritic, tvcom, log, SQLObjectNotFound, AND).
    """
    log.debug("getting new shows from tvrss...")
    feed = feedparser.parse('http://tvrss.net/feed/combined/')

    # shows[show_title][episode_string] -> best feed item for that episode.
    shows = {}
    for item in feed['entries']:
        show = get_show(item['description'])
        estring = get_episode_string(item['description'])
        if estring:
            # Skip "HR HDTV" releases entirely.
            if show and not hr_hdtv.search(item['title']):
                if show in shows:
                    if estring in shows[show]:
                        if choose_item(shows[show][estring], item):
                            shows[show][estring] = item
                    else:
                        shows[show][estring] = item
                else:
                    shows[show] = {}
                    shows[show][estring] = item

    for show in shows:
        imdb = None
        # byShowUrl raises when the show is NOT blacklisted -- that is the
        # normal path; a successful lookup means skip the show.
        try:
            model.ShowsBlacklist.byShowUrl(get_url(show))
            log.debug("ignoring blacklisted show %s" % show)
            continue
        except Exception:
            pass

        # Resolve the show row: by url first, then by alias.
        s = None
        try:
            s = model.Shows.byUrl(get_url(show))
        except SQLObjectNotFound:
            try:
                alias = model.ShowsAlias.byAlias(get_url(show))
                s = alias.show
            except SQLObjectNotFound:
                s = None

        if not s:
            log.debug("about to add %s" % show)
            thumbnail = None
            description = ''
            ur = '-'
            # FIX: initialize imdbid -- it was unbound at model.Shows(...)
            # below whenever IMDb.guess() raised.
            imdbid = None
            try:
                imdbid = IMDb.guess(show)
                if imdbid:
                    imdb, description, thumbnail = get_imdbdata(imdbid)
                    if imdb:
                        ur = imdb['rating']
            except Exception:
                import traceback
                # FIX: was `print ptraceback.print_exc()` -- `ptraceback` is
                # undefined (NameError inside the handler); print_exc()
                # already writes the traceback to stderr itself.
                traceback.print_exc()
            s = model.Shows(
                title = show,
                url = get_url(show),
                description = description,
                imdb = imdbid,
                imdbUserRating = ur
            )
            s.thumbnail = thumbnail
            meta = metacritic.scrapeMetacritic(s.title, s.metacriticUrl)
            if meta:
                s.metacriticUrl = meta['url']
                s.metacriticScore = "%s" % meta['score']
                for review in meta['critics']:
                    model.addReview(s, review)
            model.hub.commit()
            log.debug('added %s' % show)

        for episode in shows[show]:
            episode_title = get_title(shows[show][episode]['description'])
            episode_description = ''
            episode_imdb = ''
            q = model.Episodes.select(AND(
                model.Episodes.q.showID == s.id,
                model.Episodes.q.episode == episode))
            if q.count() == 0:
                # Enrich the episode from imdb when possible...
                if not imdb:
                    try:
                        imdbid = IMDb.guess(show)
                        if imdbid:
                            imdb = IMDb.parse(imdbid)
                    except Exception:
                        pass
                if imdb and episode in imdb['episodes']:
                    episode_title = imdb['episodes'][episode]['title']
                    episode_description = imdb['episodes'][episode]['description']
                    episode_imdb = imdb['episodes'][episode]['imdb']
                # ...and fall back to tv.com for anything still missing.
                if not episode_description or not episode_title:
                    tvcom_data = tvcom.get(show, episode)
                    if not episode_description:
                        episode_description = tvcom_data['description']
                    if not episode_title:
                        episode_title = tvcom_data['title']
                e = model.Episodes(
                    showID = s.id,
                    title = episode_title,
                    episode = episode,
                    torrent = shows[show][episode]['enclosures'][0]['href'],
                    description = episode_description,
                    imdb = episode_imdb,
                    thumbnail = None,
                    pubDate = datetime.datetime.fromtimestamp(time.mktime(shows[show][episode]['updated_parsed']))
                )
                s.lastUpdate = datetime.datetime.now()
                model.hub.commit()
                log.debug("from tvrss add %s %s" % (episode, show))
    log.debug("updating tvrss done.")
|
||||
|
||||
if __name__ == '__main__':
    # Choose the TurboGears config file: the first command-line argument when
    # given; otherwise dev.cfg when running from a source checkout (detected
    # by setup.py next to this script); otherwise the installed prod.cfg.
    # NOTE(review): `turbogears` is not imported in the visible import block --
    # confirm it is imported elsewhere in this file.
    if len(sys.argv) > 1:
        turbogears.update_config(configfile=sys.argv[1],
            modulename="btvcr.config")
    elif exists(join(dirname(__file__), "setup.py")):
        turbogears.update_config(configfile="dev.cfg",
            modulename="btvcr.config")
    else:
        turbogears.update_config(configfile="prod.cfg",
            modulename="btvcr.config")

    # Imported after the config is loaded so model bindings are configured.
    from btvcr.controllers import Root
    load()
|
||||
|
|
|
@ -126,7 +126,8 @@ def html_entity_decode(s, encoding = 'utf-8'):
|
|||
|
||||
def stripTags(s):
    """Return *s* with HTML entities decoded and all tags removed.

    Returns an empty unicode string for falsy input (None, '').
    """
    if s:
        return djangohtml.strip_tags(htmldecode(s)).strip()
        # FIX: removed two unreachable statements that followed this return
        # (leftover pre-refactor code: `s = htmldecode(s)` and a duplicate
        # strip_tags return).
    return u''
|
||||
|
||||
strip_tags=stripTags
|
||||
|
@ -140,7 +141,7 @@ charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
|
|||
def htmldecode(text):
|
||||
"""Decode HTML entities in the given text."""
|
||||
if type(text) != unicode:
|
||||
text = unicode(text)
|
||||
text = unicode(text)[:]
|
||||
if type(text) is unicode:
|
||||
uchr = unichr
|
||||
else:
|
||||
|
|
Loading…
Reference in a new issue