
# Source-viewer header (not part of the module): 482 lines, 15 KiB; blame timestamp 2007-03-01 15:11:35 +00:00.
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import urllib2
from urllib import quote
import re, time
import os
from elementtree.ElementTree import parse, tostring
from BeautifulSoup import BeautifulSoup
from google import google
from utils import stripTags, read_url_utf8, htmldecode
import utils
def read_url(url):
  """Return the content of url, going through an on-disk cache.

  The cache file lives under /var/cache/scrapeit/cache/ at a path derived
  from the URL with its 'http://' prefix removed.  When the file exists its
  content is returned; otherwise the page is fetched with utils.read_url,
  stored in the cache and returned.
  """
  base = "/var/cache/scrapeit/cache/"
  path = os.path.join(base, url.replace('http://',''))
  # URLs ending in '/' (or mapping onto an existing directory) are stored as
  # an index.html file inside that directory.
  if path.endswith('/'):
    path = "%sindex.html" % path
  if os.path.isdir(path):
    path = "%s/index.html" % path
  if os.path.exists(path):
    f = open(path)
    try:
      data = f.read()
    finally:
      f.close()
    return data
  data = utils.read_url(url)
  folder = os.path.dirname(path)
  if not os.path.exists(folder):
    # NOTE(review): the body of this branch was lost in extraction; creating
    # the missing cache directory is the reconstruction -- confirm.
    os.makedirs(folder)
  f = open(path, 'w')
  try:
    f.write(data)
  finally:
    f.close()
  return data
def _get_data(url):
  """Return the content of url via read_url, or None if reading fails.

  A failure is reported on stdout instead of being raised.
  """
  data = None
  try:
    data = read_url(url)
  # NOTE(review): the original handler line was lost in extraction; I/O
  # errors are assumed to be what it caught -- confirm.
  except (IOError, OSError):
    print("error reading data from %s" % (url,))
  return data
def get_image(url):
  """Return the content found at url; a thin wrapper around read_url."""
  return read_url(url)
def _castList(data, regexp):
soup = re.compile(regexp).findall(data)
if soup:
soup = BeautifulSoup(soup[0])
names = []
for i in soup('a', {'href': re.compile('/name/nm')}):
if i.string:
cast = stripTags(i.string)
if cast not in names:
return names
return []
def _getTerm(data, regexp):
term = ''
reg = re.compile(regexp, re.IGNORECASE)
m =
if m:
term = stripTags(
print "waring, parsing failed for", regexp
return term.encode('utf8')
class IMDb:
  """Scraper for the pages belonging to one title id.

  Each sub-page (credits, plot summary, episodes, ...) has a URL derived from
  the main page URL, a lazily fetched source cached on the instance, and a
  parse* method that extracts data from that source.
  """

  def __init__(self, imdb):
    """Store the title id and derive the URL of every sub-page.

    imdb -- title id interpolated into the main page URL.
    """
    self.imdb = imdb
    self.pageSource = None
    # NOTE(review): this URL literal was lost in extraction.  It is
    # reconstructed from guess(), which slices the id out of URLs at
    # [28:35], and from the sub-page URLs below, which need a trailing
    # slash -- confirm against the original.
    self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb

    self.businessSource = None
    self.businessUrl = "%sbusiness" % self.pageUrl
    self.connectionsSource = None
    self.connectionsUrl = "%smovieconnections" % self.pageUrl
    self.creditsSource = None
    self.creditsUrl = "%sfullcredits" % self.pageUrl
    self.episodesSource = None
    self.episodesUrl = "%sepisodes" % self.pageUrl
    self.keywordSource = None
    self.keywordUrl = "%skeywords" % self.pageUrl
    self.plotSource = None
    self.plotUrl = "%splotsummary" % self.pageUrl
    self.releaseinfoSource = None
    self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
    self.triviaSource = None
    self.triviaUrl = "%strivia" % self.pageUrl

  def getPage(self, forcereload = False):
    """Return the main page source, fetching it on first use or on forcereload."""
    if forcereload or not self.pageSource:
      self.pageSource = read_url_utf8(self.pageUrl)
    return self.pageSource

  def parse_raw_value(self, key, value):
    """Convert the raw HTML of one info block into its parsed value.

    key   -- normalised heading of the info block (e.g. 'runtime', 'genre').
    value -- raw HTML that followed the heading.

    Returns an int number of seconds for 'runtime', a list for 'country',
    'language' and 'genre', and a string for the other known keys.  An
    unknown key is printed and returned unchanged.
    """
    if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
      # These keys are plain text; the remaining ones are matched against
      # the raw markup below and must keep their tags.
      value = unicode(value, 'utf-8')
      value = stripTags(value).strip()
    if key == 'runtime':
      minutes = _getTerm(value, '(.*?) min')
      minutes = _getTerm(minutes, '([0-9]+)')
      if minutes:
        parsed_value = int(minutes) * 60
      else:
        # No minute count found: fall back to a value given in seconds.
        seconds = _getTerm(value, '(.*?) sec')
        seconds = _getTerm(seconds, '([0-9]+)')
        if seconds:
          parsed_value = int(seconds)
        else:
          parsed_value = 0
    elif key in ('country', 'language'):
      parsed_value = value.split(' / ')
    elif key == 'genre':
      parsed_value = value.replace('more', '').strip().split(' / ')
    elif key == 'tagline':
      parsed_value = value.replace('more', '').strip()
    elif key == 'plot_outline':
      parsed_value = value.replace('(view trailer)', '').strip()
      if parsed_value.endswith('more'):
        parsed_value = parsed_value[:-4].strip()
    elif key == 'tv_series':
      m = re.compile('<a href="/title/tt(.*?)/">(.*?)</a>').findall(value)
      if m:
        parsed_value = m[0][0]
      else:
        parsed_value = ''
    elif key == 'also_known_as':
      parsed_value = ''
      m = re.compile(r'(.*) \(International: English title').findall(value)
      if m:
        parsed_value = m[0]
      else:
        # NOTE(review): an 'else:' is assumed to have been lost here, making
        # the USA title a fallback for the international English title.  If
        # the original had no 'else', the USA title would take precedence --
        # confirm against the original.
        m = re.compile(r'(.*) \(USA').findall(value)
        if m:
          parsed_value = m[0]
      # Keep only the last <br />-separated entry, without its parenthesised note.
      parsed_value = parsed_value.split('<br />')[-1].split('(')[0].strip()
    else:
      print(value)
      parsed_value = value
    return parsed_value

  def _titleText(self):
    """Return the tag-stripped title markup of the main page, or '' if none.

    The 'tn15title' div is preferred; the <title> element is the fallback.
    """
    soup = BeautifulSoup(self.getPage())
    html_title = soup('div', {'id': 'tn15title'})
    if not html_title:
      html_title = soup('title')
    if not html_title:
      return ''
    return stripTags(str(html_title[0]))

  def parseTitle(self):
    """Return the title with the year and type markers removed."""
    title = self._titleText()
    title = re.sub(r'\(\d\d\d\d\)', '', title)
    title = re.sub(r'\(\d\d\d\d/I\)', '', title)
    for t in ('TV-Series', '(mini)', '(VG)', '(V)', '(TV)'):
      title = title.replace(t, '')
    title = title.strip()
    if title.startswith('"') and title.endswith('"'):
      title = title[1:-1]
    return title

  def parseYear(self):
    """Return the four-digit year from the title markup, or '' if absent."""
    html_title = self._titleText()
    year = re.compile(r'\((\d\d\d\d)\)').findall(html_title)
    if not year:
      # Years may also appear in the form '(1999/I)'.
      year = re.compile(r'\((\d\d\d\d)/').findall(html_title)
    if year:
      return year[0]
    return ''

  def parse(self):
    """Scrape all pages and return the collected data as one dict.

    The dict is also stored on the instance as self.IMDbDict.
    """
    data = self.getPage()
    IMDbDict = {}
    IMDbDict['poster'] = _getTerm(data, 'name="poster".*?<img .*?src="(.*?)"')
    if not IMDbDict['poster']:
      IMDbDict['poster'] = ''
    #Title, Year
    IMDbDict['year'] = self.parseYear()
    IMDbDict['title'] = self.parseTitle()

    # Rating is stored as an int scaled by 1000; -1 means not found.
    m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
    if m:
      IMDbDict['rating'] = int(float(m.group(1)) * 1000)
    else:
      IMDbDict['rating'] = -1

    # Vote count with thousands separators removed; -1 means not found.
    m = re.compile(r'<small>\(<a href="ratings">(.*?) votes</a>\)</small>', re.IGNORECASE).findall(data)
    if m:
      IMDbDict['votes'] = int(m[0].replace(',', ''))
    else:
      IMDbDict['votes'] = -1

    data = data.replace('\n',' ')
    #some values
    keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
    for key in keys:
      IMDbDict[key] = ''
    IMDbDict['runtime'] = 0
    soup = BeautifulSoup(data)
    for info in soup('div', {'class': 'info'}):
      key = str(info).split('</h5>')[0].split('<h5>')
      if len(key) > 1:
        raw_value = str(info).split('</h5>')[1]
        # Heading text minus its last character, lower-cased, spaces -> '_'.
        key = key[1][:-1].lower().replace(' ', '_')
        if key in keys:
          IMDbDict[key] = self.parse_raw_value(key, raw_value)
    # NOTE(review): 'also_known_as' is always present at this point, so the
    # default passed to pop() is never used and title_english can be ''.
    IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title'])
    #is episode
    IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')

    IMDbDict['episodes'] = self.parseEpisodes()
    IMDbDict['credits'] = self.parseCredits()
    IMDbDict['plot'] = self.parsePlot()
    IMDbDict['keywords'] = self.parseKeywords()
    IMDbDict['trivia'] = self.parseTrivia()
    IMDbDict['connections'] = self.parseConnections()
    IMDbDict['release_date'] = self.parseReleaseinfo()
    IMDbDict['business'] = self.parseBusiness()
    self.IMDbDict = IMDbDict
    return self.IMDbDict

  def getCredits(self, forcereload = False):
    """Return the credits page source, fetching it on first use or on forcereload."""
    if forcereload or not self.creditsSource:
      self.creditsSource = read_url_utf8(self.creditsUrl)
    return self.creditsSource

  def parseCredits(self):
    """Return a dict with 'director', 'writer', 'producer' and 'cast' lists.

    'cast' holds (name, role) tuples; the other entries hold names.  The
    result is also stored as self.credits.
    """
    data = self.getCredits()
    credits = {}
    credits['director'] = _castList(data, 'Directed by.*?(<tr>.*?)</table>')
    credits['writer'] = _castList(data, 'Writing credits.*?(<tr>.*?)</table>')
    credits['producer'] = _castList(data, 'Produced by.*?(<tr>.*?)</table>')
    credits['cast'] = []
    soup = BeautifulSoup(data)
    cast = soup('table', {'class': 'cast'})
    if cast:
      cast = str(cast[0])
      names = re.compile('<a href="/name/nm.*?/">(.*?)</a>.*?</td><td class="char">(.*?)</td></tr>').findall(cast)
      for real_name, role_name in names:
        if role_name:
          # Drop parenthesised notes and the '/ ...' continuation marker.
          role_name = role_name.split('(')[0].replace('/ ...','').strip()
        credits['cast'].append((stripTags(real_name), stripTags(role_name)))
    self.credits = credits
    return self.credits

  def getPlot(self, forcereload = False):
    """Return the plot summary page source, fetching it on first use or on forcereload."""
    if forcereload or not self.plotSource:
      self.plotSource = read_url_utf8(self.plotUrl)
    return self.plotSource

  def parsePlot(self):
    """Return the first plot paragraph as text, or u'' if there is none.

    Text from the first <i> tag onwards is dropped.  The result is also
    stored as self.plot.
    """
    soup = BeautifulSoup(self.getPlot())
    plot = soup('p', {'class':'plotpar'})
    if plot:
      plot = unicode(plot[0]).split('<i>')[0]
    else:
      plot = u''
    plot = stripTags(plot).strip()
    self.plot = plot
    return plot

  def getEpisodes(self, forcereload = False):
    """Return the episodes page source, fetching it on first use or on forcereload."""
    if forcereload or not self.episodesSource:
      self.episodesSource = read_url_utf8(self.episodesUrl)
    return self.episodesSource

  def parseEpisodes(self):
    """Return a dict mapping 'SxxEyy' to a dict with imdb, title and description.

    An entry that fails to parse is skipped after printing the traceback.
    The result is also stored as self.episodes.
    """
    episodes = {}
    cdata = self.getEpisodes().replace('\r\n',' ')
    regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>.*?</b><br>(.*?)<br/>'''
    reg = re.compile(regexp, re.IGNORECASE)
    for match in reg.findall(cdata):
      try:
        episode = "S%02dE%02d" % (int(match[0]), int(match[1]))
        description = htmldecode(match[4])
        description = stripTags(description.split('Next US airings:')[0])
        # Store the entry only once it is complete, so a failure above
        # cannot leave a half-filled episode behind.
        episodes[episode] = {
          'imdb': match[2],
          'title': match[3].strip(),
          'description': description,
        }
      except Exception:
        import traceback
        # print_exc() writes the traceback itself and returns None, so its
        # result must not be printed.
        traceback.print_exc()
    self.episodes = episodes
    return self.episodes

  def getKeywords(self, forcereload = False):
    """Return the keywords page source, fetching it on first use or on forcereload."""
    if forcereload or not self.keywordSource:
      self.keywordSource = read_url_utf8(self.keywordUrl)
    return self.keywordSource

  def parseKeywords(self):
    """Return the list of keywords linked from the keywords page.

    The result is also stored as self.keywords.
    """
    soup = BeautifulSoup(self.getKeywords())
    keywords = []
    for key in soup('a', {'href': re.compile('^/keyword/')}):
      # NOTE(review): the loop body was lost in extraction; collecting the
      # decoded link text is a reconstruction -- confirm against the original.
      if key.string:
        keywords.append(htmldecode(key.string))
    self.keywords = keywords
    return self.keywords

  def getTrivia(self, forcereload = False):
    """Return the trivia page source, fetching it on first use or on forcereload."""
    if forcereload or not self.triviaSource:
      self.triviaSource = read_url_utf8(self.triviaUrl)
    return self.triviaSource

  def parseTrivia(self):
    """Return the list of trivia items, each as the inner HTML of its <li>.

    The result is also stored as self.trivia.
    """
    trivia = []
    soup = BeautifulSoup(self.getTrivia())
    for i in soup('ul', {'class': "trivia"}):
      for t in i('li'):
        t = str(t).replace('<br />', '').strip()
        if t.startswith('<li>') and t.endswith('</li>'):
          t = t[4:-5].strip()
          # NOTE(review): the append was lost in extraction; placing it
          # inside this condition is a reconstruction -- confirm.
          trivia.append(t)
    self.trivia = trivia
    return self.trivia

  def getConnections(self, forcereload = False):
    """Return the connections page source, fetching it on first use or on forcereload."""
    if forcereload or not self.connectionsSource:
      self.connectionsSource = read_url_utf8(self.connectionsUrl)
    return self.connectionsSource

  def parseConnections(self):
    """Return a dict mapping each relation heading to a list of title ids.

    Returns {} when the page has no 'tn15content' div.
    """
    connections = {}
    soup = BeautifulSoup(self.getConnections())
    content = soup('div', {'id': 'tn15content'})
    if not content:
      return connections
    blocks = str(content[0]).split('<h5>')[1:]
    for c in blocks:
      connection = c.split('</h5>')[0]
      cs = BeautifulSoup(c)
      if connection:
        #relation -> list of imdb ids
        # href[-8:-1] is the 7 characters before the trailing slash.
        connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})]
    return connections

  def getReleaseinfo(self, forcereload = False):
    """Return the release info page source, fetching it on first use or on forcereload."""
    if forcereload or not self.releaseinfoSource:
      self.releaseinfoSource = read_url_utf8(self.releaseinfoUrl)
    return self.releaseinfoSource

  def parseReleaseinfo(self):
    """Return the first fully specified release date as 'YYYY-MM-DD', or None."""
    soup = BeautifulSoup(self.getReleaseinfo())
    tables = soup('table',{'border': '0', 'cellpadding':'2'})
    if not tables:
      return None
    for row in tables[0]('tr'):
      d = row('td', {'align':'right'})
      if not d:
        continue
      possible_date = stripTags(str(d[0])).strip()
      try:
        rdate = time.strptime(possible_date, "%d %B %Y")
      # NOTE(review): the original error handling was lost in extraction;
      # skipping cells that are not a 'day month year' date is assumed.
      except ValueError:
        continue
      return time.strftime('%Y-%m-%d', rdate)
    return None

  def getBusiness(self, forcereload = False):
    """Return the business page source, fetching it on first use or on forcereload."""
    if forcereload or not self.businessSource:
      self.businessSource = read_url_utf8(self.businessUrl)
    return self.businessSource

  def parseBusiness(self):
    """Return a dict with integer 'budget', 'gross' and 'profit' values.

    Budget and gross are the largest dollar amount listed under the
    respective heading (0 if none); profit is gross minus budget and is
    only computed when both are non-zero.
    """
    soup = BeautifulSoup(self.getBusiness())
    business = {'budget': 0, 'gross': 0, 'profit': 0}
    content = soup('div', {'id': 'tn15content'})
    if not content:
      return business
    blocks = str(content[0]).split('<h5>')[1:]
    for c in blocks:
      line = c.split('</h5>')
      # split() always yields at least one element, so the presence of a
      # body after the heading has to be checked explicitly.
      if len(line) < 2:
        continue
      title = line[0]
      if title not in ['Budget', 'Gross']:
        continue
      amounts = [value.replace(',','') for value in re.compile(r'\$(.*?) ').findall(line[1])]
      # Ignore anything that is not a plain integer amount.
      values = [int(value) for value in amounts if value.isdigit()]
      if values:
        business[title.lower()] = max(values)
    if business['budget'] and business['gross']:
      business['profit'] = business['gross'] - business['budget']
    return business
# NOTE(review): the URL literals used by guess() were lost in extraction.
# The title prefix is reconstructed from the [28:35] slices that pull a
# 7-character id out of matching URLs (the prefix is 28 characters long); the
# search URL is reconstructed around the surviving ';s=tt;site=aka' suffix.
# Confirm both against the original.
_TITLE_URL_PREFIX = 'http://www.imdb.com/title/tt'
_FIND_URL = 'http://www.imdb.com/find?q=%s'

def _fetch_page(url):
  """Fetch url with the project's default headers; return (body, final_url)."""
  req = urllib2.Request(url, None, utils.DEFAULT_HEADERS)
  u = urllib2.urlopen(req)
  try:
    return u.read(), u.url
  finally:
    u.close()

def _id_from_title_url(url):
  """Return the 7-character id following the title prefix, or None."""
  if url.startswith(_TITLE_URL_PREFIX):
    start = len(_TITLE_URL_PREFIX)
    return url[start:start + 7]
  return None

def guess(title, director=''):
  """Guess the title id for a (file) title; return None if nothing is found.

  title    -- unicode title; everything from the first '-', '(' or '.' on
              is discarded before searching.
  director -- optional director name used to narrow the web search.

  Lookup order: the first page of google results, then the site's own
  search (following a redirect to a title page or taking the first popular
  result), then the same search including alternative titles.
  """
  #FIXME: proper file -> title
  title = title.split('-')[0]
  title = title.split('(')[0]
  title = title.split('.')[0]
  title = title.strip()
  quoted_title = quote(title.encode('utf-8'))

  #lest first try google
  # NOTE(review): both search strings start with a space, which suggests a
  # prefix (possibly a site restriction) was lost in extraction; the
  # literals are kept exactly as they appear.
  if director:
    search = ' %s "%s"' % (director, title)
  else:
    search = ' "%s"' % title
  for (name, url, desc) in google(search, 1):
    imdb_id = _id_from_title_url(url)
    if imdb_id:
      return imdb_id

  try:
    data, return_url = _fetch_page(_FIND_URL % quoted_title)
  # Best effort: a failed search request means "no guess".
  except Exception:
    return None
  imdb_id = _id_from_title_url(return_url)
  if imdb_id:
    return imdb_id
  if data:
    imdb_id = _getTerm(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
    if imdb_id:
      return imdb_id

  # Last resort: repeat the search with alternative titles included.
  data, return_url = _fetch_page((_FIND_URL + ';s=tt;site=aka') % quoted_title)
  return _id_from_title_url(return_url)
def getEpisodeData(title, episode, show_url = None):
  """Collect information about an episode.

  title    -- title of the show; used to guess the show id when show_url
              is not given.
  episode  -- episode key as produced by IMDb.parseEpisodes ('SxxEyy').
  show_url -- optional URL of the show's page containing 'title/tt<digits>'.

  Returns dict with title, show, description and episode, plus 'imdb' when
  the episode was found.
  """
  episodeData = {
    'title': u'',
    'show': title,
    'description': u'',
    'episode': episode,
  }
  if not show_url:
    imdbid = guess(title)
  else:
    # The original read an undefined name ('link') here; the id has to come
    # from show_url.
    m = re.compile(r'title/tt(\d+)').search(show_url)
    if m:
      imdbid = "%07d" % int(m.group(1))
    else:
      imdbid = None
  if imdbid:
    i = IMDb(imdbid).parse()
    found = i['episodes'].get(episode)
    if found:
      episodeData['title'] = found['title']
      episodeData['description'] = found['description']
      episodeData['imdb'] = found['imdb']
  return episodeData
# Command-line entry point: print the id guessed for the title given as the
# first argument.
if __name__ == '__main__':
  import sys
  print("imdb: %s" % (guess(sys.argv[1]),))