scrapeit/scrapeit/epguides.py

# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import re
from BeautifulSoup import BeautifulSoup
from google import google
from utils import read_url, read_url_utf8, stripTags
import tvcom
import imdb

def epguidesUrl(title):
  '''
  Search the epguides.com url for a show by its title.
  Google is used to find the url; epguides.com itself relies on the same search.
  '''
  for (name, url, desc) in google('allintitle: site:epguides.com %s' % title, 1):
    if url.startswith('http://epguides.com'):
      # escape the title, it may contain regex metacharacters (e.g. 'M*A*S*H')
      if re.search(re.escape(title), name):
        return url
  return None
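
# Usage sketch (illustrative only; depends on live Google results):
#   url = epguidesUrl('The Simpsons')
#   # url would be something like 'http://epguides.com/Simpsons/' or None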

def getShowImdb(title):
  '''
  Return the IMDb id of a show as a zero padded seven digit string.
  The id is taken from the IMDb link on the show's epguides page,
  falling back to imdb.guess() if no link is found.
  '''
  imdbid = None
  url = epguidesUrl(title)
  if url:
    data = read_url(url)
    soup = BeautifulSoup(data)
    links = soup('a', {'href': re.compile('imdb.com/title/tt')})
    if links:
      link = links[0].get('href')
      imdbid = "%07d" % int(re.compile(r'title/tt(\d*)').findall(link)[0])
  if not imdbid:
    imdbid = imdb.guess(title)
  return imdbid
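
# Usage sketch (illustrative only; needs network access):
#   imdbid = getShowImdb('The Simpsons')
#   # imdbid is a seven digit string such as u'0096697', or whatever
#   # imdb.guess() returns when epguides has no IMDb link for the show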

def getEpisodeData(title, episode, show_url = None):
  '''
  Collect information about an episode.
  episode is expected in the form 'S01E02'.
  Returns a dict with title, show, description and episode.
  '''
  episodeData = {
    'title': u'',
    'show': title,
    'description': u'',
    'episode': episode,
  }
  data = u''
  if not show_url:
    show_url = epguidesUrl(title)
  if show_url:
    data = read_url_utf8(show_url)
  else:
    # no epguides page found, fall back to imdb
    return imdb.getEpisodeData(title, episode)
  # convert 'S01E02' into the season-episode form used in epguides listings,
  # e.g. 'S01E02' -> u'1- 2' and 'S01E12' -> u'1-12'
  season, epnum = episode.replace('S', '', 1).split('E')
  estring = u'%d-%2d' % (int(season), int(epnum))
  for line in data.split('\n'):
    a = line.split(estring)
    if len(a) == 2:
      soup = BeautifulSoup(line)
      episodeData['title'] = soup('a')[0].contents[0]
      tvcom_url = soup('a')[0].get('href')
      episodeData['description'] = tvcom.getEpisodeData(tvcom_url)['description']
      break
  return episodeData
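
if __name__ == '__main__':
  # Minimal manual test, a sketch only: assumes network access and that the
  # bundled google, tvcom, imdb and utils modules are importable.
  data = getEpisodeData('The Simpsons', 'S01E01')
  print data['show'], data['episode'], data['title']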