cleanup
This commit is contained in:
parent
fccc9006d4
commit
73ec7e7aeb
3 changed files with 63 additions and 227 deletions
60
scrapeit/metacritic.py
Normal file
60
scrapeit/metacritic.py
Normal file
|
@ -0,0 +1,60 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from utils import read_url, read_url_utf8
|
||||||
|
import re
|
||||||
|
from urllib import quote
|
||||||
|
|
||||||
|
def getMetacriticShowUrl(title):
    """Search metacritic.com for a TV show and return the first show-page
    URL found, or '' when the search yields no match.

    The search result page is scanned for links of the form
    http://www.metacritic.com/tv/shows/<slug>?...; the query string is
    dropped by the capturing group.
    """
    quoted_title = quote(title)
    search_url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % quoted_title
    page = read_url(search_url)
    matches = re.compile('(http://www.metacritic.com/tv/shows/.*?)\?').findall(page)
    if not matches:
        return ''
    return matches[0]
|
||||||
|
|
||||||
|
def scrapeMetacritic(title, url=None):
    """Scrape metacritic data for a TV show.

    Returns a dict with keys 'score' (int metascore, -1 when not found),
    'critics' (list of per-review dicts with 'score', 'publication',
    'critic', 'quote', 'link') and 'url' (the show page used), or None
    when no show page can be located.
    """
    # resolve the show page via search when the caller did not supply one
    if not url:
        url = getMetacriticShowUrl(title)
    if not url:
        return None
    data = read_url(url)
    # the overall metascore is embedded in an image ALT attribute
    score = re.compile('''ALT="Metascore: (.*?)"''').findall(data)
    if score:
        score = int(score[0])
    else:
        score = -1

    # collapse newlines so the multi-line review markup matches a single regex pass;
    # IndexError here would mean the page layout changed - TODO confirm acceptable
    reviews = re.compile('(<DIV CLASS="scoreandreview">.*)').findall(data.replace('\n',''))[0]

    # pattern written across lines for readability, then flattened;
    # NOTE(review): assumes the continuation lines carry no leading
    # whitespace that would survive into the flattened pattern - confirm
    reg = '''<DIV CLASS="scoreandreview"><DIV CLASS="criticscore">(.*?)</DIV>.*?
<SPAN CLASS="publication">(.*?)
<DIV CLASS="quote">(.*?)
</DIV>.*?</DIV>.*?</DIV>
'''.replace('\n','')
    reviews = re.compile(reg).findall(reviews)

    metacritics = []
    for r in reviews:
        critic ={}
        # r = (criticscore, publication markup, quote markup)
        critic['score'] = int(r[0].strip())
        publication = r[1].split('</SPAN>')
        criticname = ''
        # the critic's name follows the publication in its own SPAN, when present
        if len(publication) > 1:
            criticname = publication[1].replace('<SPAN CLASS="criticname">','').strip()
        publication = publication[0]
        critic['publication'] = publication
        critic['critic'] = criticname
        # the quote may be followed by a "read review" link after a <BR>
        # (this local deliberately shadows urllib.quote within the loop)
        quote = r[2].split('<BR>')
        link = ''
        if len(quote) > 1:
            link = re.compile('<A HREF="(.*?)" TARGET="_blank"><IMG SRC="/_images/readreview.gif"').findall(quote[1])[0]
        quote = quote[0].strip()
        critic['quote'] = quote
        critic['link'] = link
        metacritics.append(critic)

    return dict(score = score, critics = metacritics, url = url)
|
||||||
|
|
||||||
|
|
|
@ -2,79 +2,13 @@
|
||||||
# -*- Mode: Python; -*-
|
# -*- Mode: Python; -*-
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# vi:si:et:sw=2:sts=2:ts=2
|
# vi:si:et:sw=2:sts=2:ts=2
|
||||||
|
|
||||||
from os.path import *
|
|
||||||
import sys
|
|
||||||
import datetime
|
|
||||||
import time
|
|
||||||
import re
|
|
||||||
from urllib2 import urlopen
|
|
||||||
from urllib import quote
|
from urllib import quote
|
||||||
import Image
|
|
||||||
import StringIO
|
|
||||||
|
|
||||||
import feedparser
|
import feedparser
|
||||||
|
|
||||||
from utils import read_url
|
from utils import read_url
|
||||||
|
|
||||||
|
|
||||||
hr_hdtv = re.compile('HR HDTV')
|
|
||||||
hdtv = re.compile('HDTV')
|
|
||||||
|
|
||||||
def get_url(title):
    """Turn a show title into its URL slug: spaces and slashes become
    underscores, everything lowercased."""
    slug = title.replace(' ', '_')
    slug = slug.replace('/', '_')
    return slug.lower()
|
|
||||||
|
|
||||||
def get_show(string):
    """Extract the show name from a summary like 'Show Name: X; ...':
    the value after the colon in the first ';'-separated field."""
    first_field = string.split(';')[0]
    value = first_field.split(':')[1]
    return value.strip()
|
|
||||||
|
|
||||||
def get_title(string):
    """Episode title from the second ';'-separated field of a summary;
    returns '' when the feed reports 'n/a'."""
    second_field = string.split(';')[1]
    value = second_field.split(':')[1].strip()
    return '' if value == 'n/a' else value
|
|
||||||
|
|
||||||
def get_season(string):
    """Parse the season number from the third ';'-separated field of a
    summary (e.g. '...; Season: 3; ...').

    Returns the season as an int, or None when the field is missing or
    not numeric.
    """
    try:
        season = int(string.split(';')[2].split(':')[1].strip())
    # was a bare except; narrowed so unrelated errors are not swallowed:
    # IndexError = field missing, ValueError = non-numeric value
    except (IndexError, ValueError):
        return None
    return season
|
|
||||||
|
|
||||||
def get_episode(string):
    """Parse the episode number from the fourth ';'-separated field of a
    summary (e.g. '...; Episode: 4').

    Returns the episode as an int, or None when the field is missing or
    not numeric.
    """
    try:
        episode = int(string.split(';')[3].split(':')[1].strip())
    # was a bare except; narrowed so unrelated errors are not swallowed:
    # IndexError = field missing, ValueError = non-numeric value
    except (IndexError, ValueError):
        return None
    return episode
|
|
||||||
|
|
||||||
def get_episodedate(string):
    """Return the stripped text after a single 'Episode Date:' marker, or
    None when the marker is absent (or appears more than once)."""
    parts = string.split('Episode Date:')
    if len(parts) != 2:
        return None
    return parts[1].strip()
|
|
||||||
|
|
||||||
def get_episode_string(string):
    """Build a canonical episode identifier from a feed summary:
    'SxxEyy' when both season and episode are known, otherwise the air
    date, otherwise None."""
    episode = get_episode(string)
    season = get_season(string)
    episodedate = get_episodedate(string)
    if season and episode:
        return "S%02dE%02d" % (season, episode)
    if episodedate:
        return episodedate
    return None
|
|
||||||
|
|
||||||
def choose_item(old, new):
    """Decide whether feed item *new* should replace *old* for the same
    episode.

    Returns False when both carry the same torrent link; otherwise True
    only when the stored item is not an HDTV release but the new one is.
    """
    if old['link'] == new['link']:
        return False
    # upgrade path: prefer an HDTV release over a non-HDTV one
    if not hdtv.search(old['title']):
        if hdtv.search(new['title']):
            # NOTE(review): display_item and log are not defined anywhere
            # in this view - confirm they exist at runtime
            display_item(new)
            log.debug("vs.")
            display_item(old)
            return True
    return False
|
|
||||||
|
|
||||||
def get_episodes(show_title):
|
def get_episodes(show_title):
|
||||||
search_url = "http://tvrss.net/search/index.php?distribution_group=combined&show_name=%s&show_name_exact=true&filename=&date=&quality=&release_group=&mode=rss" % quote(show_title)
|
search_url = "http://tvrss.net/search/index.php?distribution_group=combined&show_name=%s&show_name_exact=true&filename=&date=&quality=&release_group=&mode=rss" % quote(show_title)
|
||||||
data = read_url(search_url)
|
data = read_url(search_url)
|
||||||
|
@ -84,162 +18,3 @@ def get_episodes(show_title):
|
||||||
episode = get_episode_string(t['summary'])
|
episode = get_episode_string(t['summary'])
|
||||||
episodes[episode] = t['enclosures'][0]['href']
|
episodes[episode] = t['enclosures'][0]['href']
|
||||||
return episodes
|
return episodes
|
||||||
|
|
||||||
def get_thumbnail(url):
    """Download an image and return a 100x100 JPEG thumbnail as a byte
    string, or None when downloading or decoding fails (best-effort)."""
    try:
        thumbnail = read_url(url)
        im = Image.open(StringIO.StringIO(thumbnail))
        out = StringIO.StringIO()
        # scale to 100px wide preserving aspect ratio, then crop a square
        width = 100
        height = int((100.0 / im.size[0]) * im.size[1])
        im = im.resize((width, height))
        im.crop((0,0,100,100)).convert().save(out, 'JPEG')
        thumbnail = out.getvalue()
    except:
        # deliberate best-effort: any network/PIL failure yields no thumbnail
        thumbnail = None
    return thumbnail
|
|
||||||
|
|
||||||
def get_imdbdata(imdbid):
    """Fetch IMDb data for *imdbid*.

    Returns a tuple (imdb_result, description, thumbnail). The description
    falls back through plot -> plot_outline -> tagline; the thumbnail is
    fetched from the poster URL when one exists. When parsing fails the
    result is (falsy parse result, '', None).
    """
    thumbnail = None
    description=''
    i = imdb.IMDb(imdbid).parse()
    if i:
        poster = i['poster']
        # IMDb serves this placeholder image when no real poster exists
        if poster != 'http://i.imdb.com/Heads/npa.gif':
            log.debug("getting poster %s" % poster)
            thumbnail = get_thumbnail(poster)
        if i['plot']:
            description=i['plot']
        elif i['plot_outline']:
            description=i['plot_outline']
        else:
            description=i['tagline']

        return (i, description, thumbnail)
    else:
        return(i, '', None)
|
|
||||||
|
|
||||||
|
|
||||||
def load():
    """Pull the combined tvrss feed, group items by show and episode, and
    persist new shows/episodes via the model layer.

    Side effects: creates model.Shows / model.Episodes rows, attaches
    metacritic reviews, and commits through model.hub.
    """
    log.debug("getting new shows from tvrss...")
    feed = feedparser.parse('http://tvrss.net/feed/combined/')
    # shows maps show name -> {episode string -> best feed item seen}
    shows = {}
    for item in feed['entries']:
        show = get_show(item['description'])
        estring = get_episode_string(item['description'])
        if estring:
            # skip HR HDTV releases entirely
            if show and not hr_hdtv.search(item['title']):
                if shows.has_key(show):
                    if shows[show].has_key(estring):
                        # keep whichever release choose_item prefers
                        if choose_item(shows[show][estring], item):
                            shows[show][estring] = item
                    else:
                        shows[show][estring] = item
                else:
                    shows[show] = {}
                    shows[show][estring] = item
    for show in shows:
        imdb = None
        # FIX: imdbid was previously unbound when IMDb.guess raised below,
        # producing a NameError at model.Shows(imdb=imdbid)
        imdbid = None
        try:
            # a hit here means the show is blacklisted; skip it
            model.ShowsBlacklist.byShowUrl(get_url(show))
            log.debug("ignoring blacklisted show %s" % show)
            continue
        except:
            pass
        s = None
        try:
            s = model.Shows.byUrl(get_url(show))
        except SQLObjectNotFound:
            # not stored under its own URL; try the alias table
            try:
                alias = model.ShowsAlias.byAlias(get_url(show))
                s = alias.show
            except SQLObjectNotFound:
                s = None
        if not s:
            # unknown show: enrich with IMDb + metacritic data and store it
            log.debug("about to add %s" % show)
            thumbnail = None
            description=''
            ur = '-'
            try:
                imdbid = IMDb.guess(show)
                if imdbid:
                    imdb, description, thumbnail = get_imdbdata(imdbid)
                    if imdb:
                        ur = imdb['rating']
            except:
                import traceback
                # FIX: was 'print ptraceback.print_exc()' - the typo raised
                # NameError inside this handler instead of logging the error
                traceback.print_exc()
            s= model.Shows(
                title = show,
                url = get_url(show),
                description = description,
                imdb = imdbid,
                imdbUserRating = ur
            )
            s.thumbnail = thumbnail
            meta = metacritic.scrapeMetacritic(s.title, s.metacriticUrl)
            if meta:
                s.metacriticUrl = meta['url']
                s.metacriticScore = "%s" % meta['score']
                for review in meta['critics']:
                    model.addReview(s, review)
            model.hub.commit()
            log.debug('added %s' % show)
        for episode in shows[show]:
            episode_title = get_title(shows[show][episode]['description'])
            episode_description = ''
            episode_imdb = ''
            q = model.Episodes.select(AND(
                model.Episodes.q.showID == s.id,
                model.Episodes.q.episode == episode))
            if q.count() == 0:
                # lazily parse full IMDb data the first time it is needed
                if not imdb:
                    try:
                        imdbid = IMDb.guess(show)
                        if imdbid:
                            imdb = IMDb.parse(imdbid)
                    except:
                        pass
                if imdb and imdb['episodes'].has_key(episode):
                    episode_title = imdb['episodes'][episode]['title']
                    episode_description = imdb['episodes'][episode]['description']
                    episode_imdb = imdb['episodes'][episode]['imdb']
                # fall back to tv.com for whatever IMDb did not provide
                if not episode_description or not episode_title:
                    tvcom_data = tvcom.get(show, episode)
                    if not episode_description:
                        episode_description = tvcom_data['description']
                    if not episode_title:
                        episode_title = tvcom_data['title']
                e = model.Episodes(
                    showID = s.id,
                    title = episode_title,
                    episode = episode,
                    torrent = shows[show][episode]['enclosures'][0]['href'],
                    description = episode_description,
                    imdb = episode_imdb,
                    thumbnail = None,
                    pubDate = datetime.datetime.fromtimestamp(time.mktime(shows[show][episode]['updated_parsed']))
                )
                s.lastUpdate = datetime.datetime.now()
                model.hub.commit()
                log.debug("from tvrss add %s %s" %(episode, show))
    log.debug("updating tvrss done.")
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # first look on the command line for a desired config file,
    # if it's not on the command line, then
    # look for setup.py in this directory. If it's not there, this script is
    # probably installed
    # NOTE(review): turbogears and exists/join/dirname (from os.path import *)
    # come from module-level imports not all visible in this view - confirm
    if len(sys.argv) > 1:
        turbogears.update_config(configfile=sys.argv[1],
            modulename="btvcr.config")
    elif exists(join(dirname(__file__), "setup.py")):
        # development checkout: use the dev config next to setup.py
        turbogears.update_config(configfile="dev.cfg",
            modulename="btvcr.config")
    else:
        # installed: fall back to the production config
        turbogears.update_config(configfile="prod.cfg",
            modulename="btvcr.config")

    # imported for its side effects on the turbogears app before loading
    from btvcr.controllers import Root
    load()
|
|
||||||
|
|
|
@ -126,7 +126,8 @@ def html_entity_decode(s, encoding = 'utf-8'):
|
||||||
|
|
||||||
def stripTags(s):
    """Decode HTML entities in *s* and strip all HTML tags, returning the
    stripped text; returns u'' for falsy input."""
    if s:
        s = htmldecode(s)

        return djangohtml.strip_tags(s).strip()
    return u''

# alias kept for callers using the underscore-style name
strip_tags=stripTags
|
||||||
|
@ -140,7 +141,7 @@ charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
|
||||||
def htmldecode(text):
|
def htmldecode(text):
|
||||||
"""Decode HTML entities in the given text."""
|
"""Decode HTML entities in the given text."""
|
||||||
if type(text) != unicode:
|
if type(text) != unicode:
|
||||||
text = unicode(text)
|
text = unicode(text)[:]
|
||||||
if type(text) is unicode:
|
if type(text) is unicode:
|
||||||
uchr = unichr
|
uchr = unichr
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in a new issue