# scrapeit/scrapeit/imdb.py
# 2007-07-29 20:46:03 +00:00
# 511 lines, 16 KiB, Python
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import urllib2
from urllib import quote
import re, time
import os
from elementtree.ElementTree import parse, tostring
from BeautifulSoup import BeautifulSoup
from google import google
from utils import stripTags, htmldecode
import utils
import chardet
cache_base = "/var/cache/scrapeit/cache/"
def read_url_utf8(url):
  """Fetch *url* through the disk cache and decode the bytes to unicode.

  The character set is guessed with chardet; when detection fails,
  latin-1 is used as a fallback (it can decode any byte sequence).
  """
  raw = read_url(url)
  guessed = chardet.detect(raw)['encoding']
  if not guessed:
    guessed = 'latin-1'
  return unicode(raw, guessed)
def read_url(url):
  """Return the body of *url*, memoized on disk under cache_base.

  The cache path mirrors the URL (minus the http:// prefix); a trailing
  slash or an existing directory maps to an index.html inside it.  On a
  cache miss the page is fetched with utils.read_url and stored.
  """
  cache_path = os.path.join(cache_base, url.replace('http://', ''))
  if cache_path.endswith('/'):
    cache_path = "%sindex.html" % cache_path
  if os.path.isdir(cache_path):
    cache_path = "%s/index.html" % cache_path
  # cache hit: serve the stored copy
  if os.path.exists(cache_path):
    f = open(cache_path)
    cached = f.read()
    f.close()
    return cached
  # cache miss: fetch, then persist for next time
  fetched = utils.read_url(url)
  folder = os.path.dirname(cache_path)
  if not os.path.exists(folder):
    os.makedirs(folder)
  f = open(cache_path, 'w')
  f.write(fetched)
  f.close()
  return fetched
def _get_data(url):
data = None
try:
data = read_url(url)
except:
print "error reading data from", url
return data
def get_image(url):
  """Download an image; thin alias for the cached read_url fetch."""
  image_data = read_url(url)
  return image_data
def _castList(data, regexp):
  """Extract unique person names from the HTML section matched by
  *regexp*, by collecting the text of /name/nm... links.

  Returns a list of names (order preserved, duplicates dropped), or an
  empty list when the section is not found.
  """
  section = re.compile(regexp).findall(data)
  if not section:
    return []
  parsed = BeautifulSoup(section[0])
  names = []
  for link in parsed('a', {'href': re.compile('/name/nm')}):
    if not link.string:
      continue
    person = stripTags(link.string)
    if person not in names:
      names.append(person)
  return names
def _getTerm(data, regexp):
term = ''
try:
reg = re.compile(regexp, re.IGNORECASE)
m = reg.search(data)
if m:
term = stripTags(m.group(1)).strip()
except:
print "waring, parsing failed for", regexp
return term.encode('utf8')
class IMDb:
def __init__(self, imdb):
self.imdb = imdb
self.pageSource = None
self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
self.businessSource = None
self.businessUrl = "%sbusiness" % self.pageUrl
self.connectionsSource = None
self.connectionsUrl = "%smovieconnections" % self.pageUrl
self.creditsSource = None
self.creditsUrl = "%sfullcredits" % self.pageUrl
self.episodesSource = None
self.episodesUrl = "%sepisodes" % self.pageUrl
self.keywordSource = None
self.keywordUrl = "%skeywords" % self.pageUrl
self.plotSource = None
self.plotUrl = "%splotsummary" % self.pageUrl
self.releaseinfoSource = None
self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
self.triviaSource = None
self.triviaUrl = "%strivia" % self.pageUrl
self.locationSource = None
self.locationUrl = "%slocations" % self.pageUrl
def getPage(self, forcereload = False):
if forcereload or not self.pageSource:
self.pageSource = read_url_utf8(self.pageUrl)
return self.pageSource
def parse_raw_value(self, key, value):
if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
value = unicode(value, 'utf-8')
value = stripTags(value).strip()
if key == 'runtime':
parsed_value = _getTerm(value, '(.*?) min')
parsed_value = _getTerm(parsed_value, '([0-9]+)')
if not parsed_value:
parsed_value = _getTerm(value, '(.*?) sec')
parsed_value = _getTerm(parsed_value, '([0-9]+)')
if not parsed_value:
parsed_value = 0
else:
parsed_value = int(parsed_value)
else:
parsed_value = int(parsed_value) * 60
elif key in ('country', 'language'):
parsed_value = value.split(' / ')
elif key == 'genre':
parsed_value = value.replace('more', '').strip().split(' / ')
elif key == 'tagline':
parsed_value = value.replace('more', '').strip()
elif key == 'plot_outline':
parsed_value = value.replace('(view trailer)', '').strip()
if parsed_value.endswith('more'):
parsed_value = parsed_value[:-4].strip()
elif key == 'tv_series':
m = re.compile('<a href="/title/tt(.*?)/">(.*?)</a>').findall(value)
if m:
parsed_value = m[0][0]
else:
parsed_value = ''
elif key == 'also_known_as':
parsed_value = ''
m = re.compile('(.*) \(International: English title').findall(value)
if m:
parsed_value = m[0]
else:
m = re.compile('(.*) \(USA').findall(value)
if m:
parsed_value = m[0]
parsed_value = parsed_value.split('<br />')[-1].split('(')[0].strip()
else:
print value
parsed_value = value
return parsed_value
def parseTitle(self):
title = ''
data = self.getPage()
soup = BeautifulSoup(data)
html_title = soup('div', {'id': 'tn15title'})
if not html_title:
html_title = soup('title')
if html_title:
html_title = str(html_title[0])
html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
title = stripTags(html_title)
title = re.sub('\(\d\d\d\d\)', '', title)
title = re.sub('\(\d\d\d\d/I*\)', '', title)
for t in ('TV-Series', '(mini)', '(VG)', '(V)', '(TV)'):
title = title.replace(t, '')
if title.find(u'\xa0') > -1:
title = title[:title.find(u'\xa0')]
title = title.strip()
if title.startswith('"') and title.endswith('"'):
title = title[1:-1]
return title
def parseYear(self):
year = ''
data = self.getPage()
soup = BeautifulSoup(data)
html_title = soup('div', {'id': 'tn15title'})
if not html_title:
html_title = soup('title')
if html_title:
html_title = str(html_title[0])
html_title = stripTags(html_title)
year = re.compile('\((\d\d\d\d)\)').findall(html_title)
if not year:
year = re.compile('\((\d\d\d\d)/').findall(html_title)
if year:
year = year[0]
else: year = ''
return year
def parse(self):
data = self.getPage()
IMDbDict ={}
#Poster
IMDbDict['poster'] = _getTerm(data, 'name="poster".*?<img .*?src="(.*?)"')
if not IMDbDict['poster']:
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
#Title, Year
IMDbDict['year'] = self.parseYear()
IMDbDict['title'] = self.parseTitle()
#Rating
m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
if m:
IMDbDict['rating'] = int(float(m.group(1)) * 1000)
else:
IMDbDict['rating'] = -1
#Votes
m = re.compile('<small>\(<a href="ratings">(.*?) votes</a>\)</small>', re.IGNORECASE).findall(data)
if m:
IMDbDict['votes'] = int(m[0].replace(',', ''))
else:
IMDbDict['votes'] = -1
data = data.replace('\n',' ')
#some values
keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
for key in keys:
IMDbDict[key] = ''
IMDbDict['runtime'] = 0
soup = BeautifulSoup(data)
for info in soup('div', {'class': 'info'}):
key = str(info).split('</h5>')[0].split('<h5>')
if len(key) > 1:
raw_value = str(info).split('</h5>')[1]
key = key[1][:-1].lower().replace(' ', '_')
if key in keys:
IMDbDict[key] = self.parse_raw_value(key, raw_value)
IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title'])
#is episode
IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')
IMDbDict['episodes'] = self.parseEpisodes()
IMDbDict['credits'] = self.parseCredits()
IMDbDict['plot'] = self.parsePlot()
IMDbDict['keywords'] = self.parseKeywords()
IMDbDict['trivia'] = self.parseTrivia()
IMDbDict['connections'] = self.parseConnections()
IMDbDict['locations'] = self.parseLocations()
IMDbDict['release_date'] = self.parseReleaseinfo()
IMDbDict['business'] = self.parseBusiness()
self.IMDbDict = IMDbDict
return self.IMDbDict
def getCredits(self, forcereload = False):
if forcereload or not self.creditsSource:
self.creditsSource = read_url_utf8(self.creditsUrl)
return self.creditsSource
def parseCredits(self):
data = self.getCredits()
credits = {}
credits['director'] = _castList(data, 'Directed by.*?(<tr>.*?)</table>')
credits['writer'] = _castList(data, 'Writing credits.*?(<tr>.*?)</table>')
credits['producer'] = _castList(data, 'Produced by.*?(<tr>.*?)</table>')
#credits['cast'] = _castList(data, 'Cast</b>.*?(<tr.*?)</table>')
credits['cast'] = []
soup = re.compile('Cast</b>.*?(<tr.*?)</table>').findall(data)
soup = BeautifulSoup(data)
cast = soup('table', {'class': 'cast'})
if cast:
cast = str(cast[0])
names = re.compile('<a href="/name/nm.*?/">(.*?)</a>.*?</td><td class="char">(.*?)</td></tr>').findall(cast)
for name in names:
real_name = name[0]
role_name = name[1]
if role_name:
role_name = role_name.split('(')[0].replace('/ ...','').strip()
credits['cast'].append((stripTags(real_name), stripTags(role_name)))
self.credits = credits
return self.credits
def getPlot(self, forcereload = False):
if forcereload or not self.plotSource:
self.plotSource = read_url_utf8(self.plotUrl)
return self.plotSource
def parsePlot(self):
soup = BeautifulSoup(self.getPlot())
plot = soup('p', {'class':'plotpar'})
if plot:
plot = unicode(plot[0]).split('<i>')[0]
else:
plot = u''
plot = stripTags(plot).strip()
self.plot = plot
return plot
def getEpisodes(self, forcereload = False):
if forcereload or not self.episodesSource:
self.episodesSource = read_url_utf8(self.episodesUrl)
return self.episodesSource
def parseEpisodes(self):
episodes = {}
cdata = self.getEpisodes().replace('\r\n',' ')
regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>.*?</b><br>(.*?)<br/>'''
#regexp = r'''Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></span><br>.*?<br>(.*?)</td>'''
reg = re.compile(regexp, re.IGNORECASE)
m = reg.findall(cdata)
for match in m:
try:
episode = "S%02dE%02d" % (int(match[0]), int(match[1]))
episodes[episode] = {}
episodes[episode]['imdb'] = match[2]
episodes[episode]['title'] = match[3].strip()
description = htmldecode(match[4])
description = stripTags(description.split('Next US airings:')[0])
episodes[episode]['description'] = description
except:
import traceback
print traceback.print_exc()
pass
self.episodes = episodes
return self.episodes
def getLocations(self, forcereload = False):
if forcereload or not self.locationSource:
self.keywordSource = read_url_utf8(self.locationUrl)
return self.keywordSource
def parseLocations(self):
soup = BeautifulSoup(self.getLocations())
locations = []
for key in soup('a', {'href': re.compile('^/List')}):
locations.append(htmldecode(key.string))
self.locations = locations
return self.locations
def getKeywords(self, forcereload = False):
if forcereload or not self.keywordSource:
self.keywordSource = read_url_utf8(self.keywordUrl)
return self.keywordSource
def parseKeywords(self):
soup = BeautifulSoup(self.getKeywords())
keywords = []
for key in soup('a', {'href': re.compile('^/keyword/')}):
k = htmldecode(key.string)
k = k.replace(u'\xa0', ' ')
keywords.append(k)
self.keywords = keywords
return self.keywords
def getTrivia(self, forcereload = False):
if forcereload or not self.triviaSource:
self.triviaSource = read_url_utf8(self.triviaUrl)
return self.triviaSource
def parseTrivia(self):
trivia = []
soup = BeautifulSoup(self.getTrivia())
triviaList = []
for i in soup('ul', {'class': "trivia"}):
for t in i('li'):
t = str(t).replace('<br />', '').strip()
if t.startswith('<li>') and t.endswith('</li>'):
t = t[4:-5].strip()
trivia.append(t)
self.trivia = trivia
return self.trivia
def getConnections(self, forcereload = False):
if forcereload or not self.connectionsSource:
self.connectionsSource = read_url_utf8(self.connectionsUrl)
return self.connectionsSource
def parseConnections(self):
connections = {}
soup = BeautifulSoup(self.getConnections())
content = soup('div', {'id': 'tn15content'})[0]
blocks = str(content).split('<h5>')[1:]
for c in blocks:
connection = c.split('</h5>')[0]
cs = BeautifulSoup(c)
if connection:
#relation -> list of imdb ids
connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})]
return connections
def getReleaseinfo(self, forcereload = False):
if forcereload or not self.releaseinfoSource:
self.releaseinfoSource = read_url_utf8(self.releaseinfoUrl)
return self.releaseinfoSource
def parseReleaseinfo(self):
soup = BeautifulSoup(self.getReleaseinfo())
for row in soup('table',{'border': '0', 'cellpadding':'2'})[0]('tr'):
d = row('td', {'align':'right'})
if d:
try:
possible_date = stripTags(str(d[0])).strip()
rdate = time.strptime(possible_date, "%d %B %Y")
rdate = time.strftime('%Y-%m-%d', rdate)
return rdate
except:
pass
return None
def getBusiness(self, forcereload = False):
if forcereload or not self.businessSource:
self.businessSource = read_url_utf8(self.businessUrl)
return self.businessSource
def parseBusiness(self):
soup = BeautifulSoup(self.getBusiness())
business = {'budget': 0, 'gross': 0, 'profit': 0}
content = soup('div', {'id': 'tn15content'})[0]
blocks = str(content).split('<h5>')[1:]
for c in blocks:
cs = BeautifulSoup(c)
line = c.split('</h5>')
if line:
title = line[0]
line = line[1]
if title in ['Budget', 'Gross']:
values = re.compile('\$(.*?) ').findall(line)
values = [int(value.replace(',','')) for value in values]
if values:
business[title.lower()] = max(values)
if business['budget'] and business['gross']:
business['profit'] = business['gross'] - business['budget']
return business
def guess(title, director=''):
  """Guess the 7-digit IMDb id for *title* (optionally constrained by
  *director*); returns the id string or None.

  Tries, in order: a site:imdb.com Google search, IMDb's find page
  (following the redirect or the "Popular Results" list), and finally
  the find page restricted to aka titles.
  """
  #FIXME: proper file -> title
  title = title.split('-')[0]
  title = title.split('(')[0]
  title = title.split('.')[0]
  title = title.strip()
  imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))
  return_url = ''
  # let's first try google
  # i.e. site:imdb.com Michael Stevens Sin
  if director:
    search = 'site:imdb.com %s "%s"' % (director, title)
  else:
    search = 'site:imdb.com "%s"' % title
  for (name, url, desc) in google(search, 1):
    if url.startswith('http://www.imdb.com/title/tt'):
      return url[28:35]
  try:
    req = urllib2.Request(imdb_url, None, utils.DEFAULT_HEADERS)
    u = urllib2.urlopen(req)
    data = u.read()
    return_url = u.url
    u.close()
  except Exception:
    return None
  # a direct hit redirects straight to the title page
  if return_url.startswith('http://www.imdb.com/title/tt'):
    return return_url[28:35]
  if data:
    imdb_id = _getTerm(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
    if imdb_id:
      return imdb_id
  # last resort: search including aka titles
  imdb_url = 'http://www.imdb.com/find?q=%s;s=tt;site=aka' % quote(title.encode('utf-8'))
  try:
    # BUG FIX: this second fetch was unprotected, so a network error
    # here raised instead of returning None like the first one
    req = urllib2.Request(imdb_url, None, utils.DEFAULT_HEADERS)
    u = urllib2.urlopen(req)
    data = u.read()
    return_url = u.url
    u.close()
  except Exception:
    return None
  if return_url.startswith('http://www.imdb.com/title/tt'):
    return return_url[28:35]
  return None
def getEpisodeData(title, episode, show_url = None):
  '''
  Collect information about an episode.

  title    -- show title, used to guess the imdb id when show_url is
              not given
  episode  -- episode key in "SxxEyy" form
  show_url -- optional imdb title url of the show

  Returns dict with title, show, description, episode and (when the
  show was found) imdb.
  '''
  episodeData = {
    'title': u'',
    'show': title,
    'description': u'',
    'episode': episode,
  }
  if not show_url:
    imdbid = guess(title)
  else:
    # BUG FIX: the original referenced an undefined name `link` here;
    # the parameter is called show_url
    imdbid = "%07d" % int(re.compile('title/tt(\d*)').findall(show_url)[0])
  if imdbid:
    i = IMDb(imdbid).parse()
    # guard against an episode key missing from the scraped listing
    if episode in i['episodes']:
      episodeData['title'] = i['episodes'][episode]['title']
      episodeData['description'] = i['episodes'][episode]['description']
      episodeData['imdb'] = i['episodes'][episode]['imdb']
  return episodeData
if __name__ == '__main__':
import sys
#print parse(sys.argv[1])
print "imdb:", guess(sys.argv[1])