python-oxweb/ox/imdb.py

626 lines
20 KiB
Python
Raw Normal View History

2008-04-28 09:52:21 +00:00
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
from oxutils import *
import urllib2
2008-04-29 19:09:10 +00:00
from urllib import quote, unquote
2008-04-28 09:52:21 +00:00
import re, time
import os
import time
from BeautifulSoup import BeautifulSoup
import chardet
import oxutils
2008-04-29 22:24:25 +00:00
from oxutils import stripTags, htmldecode, findRegexp, findString
2008-04-28 09:52:21 +00:00
from oxutils.cache import getUrl, getUrlUnicode
from oxutils.normalize import normalizeTitle, normalizeImdbId
2008-04-28 09:52:21 +00:00
import google
def getMovieId(title, director='', year=''):
if year:
title = "%s (%s)" % (title, year)
if director:
query = 'site:imdb.com %s "%s"' % (director, title)
else:
query = 'site:imdb.com "%s"' % title
for (name, url, desc) in google.find(query, 3):
if url.startswith('http://www.imdb.com/title/tt'):
return url[28:35]
def getMovieData(imdbId):
return IMDb(imdbId).parse()
# internal functions below
def getUrlBase(imdbId):
return "http://www.imdb.com/title/tt%s" % imdbId
def getRawMovieData(imdbId):
imdbId = normalizeImdbId(imdbId)
data = dict()
2008-04-29 19:09:10 +00:00
data['title'] = getTitle(imdbId)
data['credits'] = getCredits(imdbId)
data['poster'] = getPoster(imdbId)
data['trailers'] = getMovieTrailers(imdbId)
2008-04-29 22:15:28 +00:00
data['companyCredits'] = getMovieCompanyCredits(imdbId)
def parseBase(imdbId):
data = getUrl(getUrlBase(imdbId))
soup = BeautifulSoup(data)
info = dict()
2008-04-29 22:15:28 +00:00
info['poster'] = findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')
for i in re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL).findall(data):
title = stripTags(i[0]).strip().lower()
txt= stripTags(i[1]).strip()
def cleanUp(k):
k = htmldecode(k).replace(u'\xa0', ' ').strip()
if k.endswith('more'): k=k[:-len('more')].strip()
return k
txt = cleanUp(txt)
if title not in ('plot', 'trivia', 'filming locations', 'mpaa'):
if '|' in txt:
txt = [cleanUp(k) for k in txt.split('|')]
elif ', ' in txt:
txt = [cleanUp(k) for k in txt.split(', ')]
if not title.startswith('moviemeter'):
info[title] = txt
for key in ('user comments', 'writers (wga)'):
if key in info:
del info[key]
if 'release date' in info:
info['release date'] = info['release date'].split('\n')[0]
if 'plot' in info:
info['plot'] = info['plot'].split('| add synopsis')[0].strip()
#get Title
title = ''
html_title = soup('div', {'id': 'tn15title'})
if not html_title:
html_title = soup('title')
if html_title:
html_title = str(html_title[0])
html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
title = htmldecode(html_title)
title = stripTags(title)
title = re.sub('\(\d\d\d\d\)', '', title)
title = re.sub('\(\d\d\d\d/I*\)', '', title)
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
title = title.replace(t, '')
title = title.strip()
if title.find(u'\xa0') > -1:
2008-04-29 22:15:28 +00:00
title = title[:title.find(u'\xa0')].strip()
if title.startswith('"') and title.endswith('"'):
title = title[1:-1]
2008-04-29 22:15:28 +00:00
info['title'] = title
return info
def getPoster(imdbId):
info = parseBase(imdbId)
return info['poster']
def getTitle(imdbId):
info = parseBase(imdbId)
return info['title']
def creditList(data, section=None):
if section == 'cast':
credits_ = re.compile('''<tr .*?<td class="nm">(.*?)</td><td class="ddd">.*?</td><td class="char">(.*?)</td></tr>''').findall(data)
else:
credits_ = re.compile('''<tr>.*?<td valign="top">(.*?)</td><td.*?</td><td valign="top">(.*?)</td></tr>''').findall(data)
credits = []
for c_ in credits_:
c = [c_[0].strip(), c_[1].strip()]
if section=='writers':
c[1] = c[1].replace('<br>', '').strip().replace(')', '').replace('(','')
if c[1].endswith(' and'): c[1] = c[1][:-4]
credits.append(c)
return credits
2008-04-29 19:09:10 +00:00
def getCredits(imdbId):
credits = dict()
url = "%s/fullcredits" % getUrlBase(imdbId)
data = getUrlUnicode(url)
groups = data.split('<h5>')
for g in groups:
section = re.compile('''name="(.*?)".*? href="/Glossary''').findall(g)
if section:
credits[section[0]] = creditList(g, section[0])
return credits
2008-04-29 19:09:10 +00:00
def getMovieTrailers(imdbId):
url = "%s/trailers" % getUrlBase(imdbId)
data = getUrlUnicode(url)
soup = BeautifulSoup(data)
videos = soup('div', {'class':"video-gallery"})
trailers = []
if videos:
for a in videos[0]('a'):
title = stripTags(unicode(a)).strip()
url = 'http://www.imdb.com' + a['href']
videoId = findRegexp(url, '/(vi\d*?)/')
iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
iframe = getUrlUnicode(iframeUrl)
videoUrl = unquote(findRegexp(iframe, 'addVariable\("file", "(.*?)"'))
trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
return trailers
2008-04-28 09:52:21 +00:00
2008-04-29 22:15:28 +00:00
def getMovieQuotes(imdbId):
url = "%s/quotes" % getUrlBase(imdbId)
data = getUrlUnicode(url)
2008-04-29 22:24:25 +00:00
quotes = re.compile('<b>(.*?)</b>:(.*?)<br>', re.DOTALL).findall(findString(data, '<a name="q'))
2008-04-29 22:15:28 +00:00
quotes = [(q[0].strip(),q[1].strip()) for q in quotes]
return quotes
def getMovieTechnical(imdbId):
url = "%s/technical" % getUrlBase(imdbId)
data = getUrlUnicode(url)
results = {}
for t in re.compile('<h5>(.*?)</h5>(.*?)<br/>', re.DOTALL).findall(data):
results[t[0].strip()] = t[1].strip()
return results
def getMovieCompanyCredits(imdbId):
url = "%s/companycredits" % getUrlBase(imdbId)
data = getUrlUnicode(url)
results = {}
for field, c in re.compile('<h2>(.*?)</h2><ul>(.*?)</ul>').findall(data):
results[field.strip()] = []
for company in re.compile('<li>(.*?)</li>').findall(c):
results[field.strip()].append(company)
return results
def getMovieLocations(imdbId):
url = "%s/locations" % getUrlBase(imdbId)
data = getUrlUnicode(url)
soup = BeautifulSoup(data)
locations = []
for key in soup('a', {'href': re.compile('^/List')}):
locations.append(htmldecode(key.string))
return locations
def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
photos = {}
for key in keys:
url = "%s/mediaindex?refine=%s" % (getUrlBase(imdbId), key)
data = getUrlUnicode(url)
photos[key] = {}
for s in re.compile('''<img alt="(.*?)".*?src="(http://ia.media-imdb.com/.*?.jpg)''').findall(data):
img= "%s.jpg" % s[1].split('._V')[0]
title = s[0]
if key=='still_frame':
if not "_CR0" in s[1]:
photos[key][img] = title
else:
photos[key][img] = title
return photos
2008-04-29 19:09:10 +00:00
def getMovieStills(imdbId):
2008-04-29 22:15:28 +00:00
return getMovieImages(imdbId, ['still_frame'])['still_frame']
def getMoviePosters(imdbId):
return getMovieImages(imdbId, ['poster'])['poster']
def getMovieTrivia(imdbId):
url = "%s/trivia" % getUrlBase(imdbId)
2008-04-29 19:09:10 +00:00
data = getUrlUnicode(url)
2008-04-29 22:15:28 +00:00
soup = BeautifulSoup(data)
trivia = []
triviaList = []
for i in soup('ul', {'class': "trivia"}):
for t in i('li'):
t = str(t).replace('<br />', '').strip()
if t.startswith('<li>') and t.endswith('</li>'):
t = t[4:-5].strip()
trivia.append(t)
return trivia
2008-04-29 19:09:10 +00:00
'''the old code below'''
2008-04-28 09:52:21 +00:00
class IMDb:
def __init__(self, imdbId):
self.imdb = imdbId
2008-04-28 09:52:21 +00:00
self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
self.businessUrl = "%sbusiness" % self.pageUrl
self.connectionsUrl = "%smovieconnections" % self.pageUrl
self.creditsUrl = "%sfullcredits" % self.pageUrl
self.episodesUrl = "%sepisodes" % self.pageUrl
self.keywordUrl = "%skeywords" % self.pageUrl
self.plotUrl = "%splotsummary" % self.pageUrl
self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
self.locationUrl = "%slocations" % self.pageUrl
self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
def getPage(self):
return getUrlUnicode(self.pageUrl)
2008-04-28 09:52:21 +00:00
def parse_raw_value(self, key, value):
if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
value = unicode(value, 'utf-8')
value = stripTags(value).strip()
2008-04-28 09:52:21 +00:00
if key == 'runtime':
parsed_value = findRegexp(value, '(.*?) min')
parsed_value = findRegexp(parsed_value, '([0-9]+)')
2008-04-28 09:52:21 +00:00
if not parsed_value:
parsed_value = findRegexp(value, '(.*?) sec')
parsed_value = findRegexp(parsed_value, '([0-9]+)')
2008-04-28 09:52:21 +00:00
if not parsed_value:
parsed_value = 0
else:
parsed_value = int(parsed_value)
else:
parsed_value = int(parsed_value) * 60
elif key in ('country', 'language'):
parsed_value = value.split(' / ')
2008-04-29 19:09:10 +00:00
parsed_value = [v.strip() for v in parsed_value]
2008-04-28 09:52:21 +00:00
elif key == 'genre':
parsed_value = value.replace('more', '').strip().split(' / ')
2008-04-29 19:09:10 +00:00
parsed_value = [v.strip() for v in parsed_value]
2008-04-28 09:52:21 +00:00
elif key == 'tagline':
parsed_value = value.replace('more', '').strip()
elif key == 'plot_outline':
parsed_value = value.replace('(view trailer)', '').strip()
if parsed_value.endswith('more'):
parsed_value = parsed_value[:-4].strip()
elif key == 'tv_series':
m = re.compile('<a href="/title/tt(.*?)/">(.*?)</a>').findall(value)
if m:
parsed_value = m[0][0]
else:
parsed_value = ''
elif key == 'also_known_as':
parsed_value = ''
m = re.compile('(.*) \(International: English title').findall(value)
if m:
parsed_value = m[0]
else:
m = re.compile('(.*) \(USA').findall(value)
if m:
parsed_value = m[0]
parsed_value = parsed_value.split('<br />')[-1].split('(')[0]
2008-04-29 19:09:10 +00:00
director = self.getCredits().get('director', None)
2008-04-28 09:52:21 +00:00
if director:
director = director[0]
parsed_value = parsed_value.replace(director, '')
if parsed_value.startswith("'s"):
parsed_value = parsed_value[2:].strip()
parsed_value = parsed_value.strip()
else:
print value
parsed_value = value
return parsed_value
2008-04-28 09:52:21 +00:00
def parseTitle(self):
2008-04-29 19:09:10 +00:00
title = getTitle(self.imdb)
title = normalizeTitle(title)
if title.startswith('"') and title.find('"',1) > 0 and \
title.find('"',1) == title.rfind('"'):
se = re.compile("Season (\d*), Episode (\d*)\)").findall(data)
if se:
se = se[0]
se = ' (S%02dE%02d)' % (int(se[0]), int(se[1]))
title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:]
else:
title = normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:]
2008-04-28 09:52:21 +00:00
return normalizeTitle(title)
2008-04-28 09:52:21 +00:00
def parseYear(self):
year = ''
data = self.getPage()
soup = BeautifulSoup(data)
html_title = soup('div', {'id': 'tn15title'})
if not html_title:
html_title = soup('title')
if html_title:
html_title = str(html_title[0])
html_title = stripTags(html_title)
year = re.compile('\((\d{4})\)').findall(html_title)
2008-04-28 09:52:21 +00:00
if not year:
year = re.compile('\((\d{4})/').findall(html_title)
if year:
2008-04-28 09:52:21 +00:00
year = year[0]
else: year = ''
return year
2008-04-28 09:52:21 +00:00
def parse(self):
data = self.getPage()
IMDbDict ={}
#Poster
2008-04-29 19:09:10 +00:00
IMDbDict['poster'] = getPoster(self.imdb)
2008-04-28 09:52:21 +00:00
if not IMDbDict['poster']:
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
2008-04-28 09:52:21 +00:00
#Title, Year
IMDbDict['year'] = self.parseYear()
IMDbDict['title'] = self.parseTitle()
2008-04-28 09:52:21 +00:00
#Rating
m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
if m:
IMDbDict['rating'] = int(float(m.group(1)) * 1000)
else:
IMDbDict['rating'] = -1
#Votes
m = re.compile('<small>\(<a href="ratings">(.*?) votes</a>\)</small>', re.IGNORECASE).findall(data)
if m:
IMDbDict['votes'] = int(m[0].replace(',', ''))
else:
IMDbDict['votes'] = -1
data = data.replace('\n',' ')
#some values
keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
for key in keys:
IMDbDict[key] = ''
IMDbDict['runtime'] = 0
soup = BeautifulSoup(data)
for info in soup('div', {'class': 'info'}):
key = str(info).split('</h5>')[0].split('<h5>')
if len(key) > 1:
raw_value = str(info).split('</h5>')[1]
key = key[1][:-1].lower().replace(' ', '_')
if key in keys:
IMDbDict[key] = self.parse_raw_value(key, raw_value)
IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title'])
#is episode
IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')
IMDbDict['episodes'] = self.parseEpisodes()
if IMDbDict['episodes']:
IMDbDict['tvshow'] = True
else:
IMDbDict['tvshow'] = False
2008-04-29 19:09:10 +00:00
IMDbDict['credits'] = self.getCredits()
2008-04-28 09:52:21 +00:00
IMDbDict['plot'] = self.parsePlot()
IMDbDict['keywords'] = self.parseKeywords()
2008-04-29 22:15:28 +00:00
IMDbDict['trivia'] = getMovieTrivia(self.imdb)
2008-04-28 09:52:21 +00:00
IMDbDict['connections'] = self.parseConnections()
IMDbDict['locations'] = self.parseLocations()
IMDbDict['release_date'] = self.parseReleaseinfo()
IMDbDict['business'] = self.parseBusiness()
IMDbDict['reviews'] = self.parseExternalreviews()
IMDbDict['stills'] = getMovieStills(self.imdb)
#IMDbDict['trailer'] = self.parseTrailer()
self.IMDbDict = IMDbDict
2008-04-28 09:52:21 +00:00
if IMDbDict['episode_of']:
episode_of =IMDb(IMDbDict['episode_of']).parse()
for key in ('country', 'language'):
if not IMDbDict[key]:
IMDbDict[key] = episode_of[key]
return self.IMDbDict
2008-04-29 19:09:10 +00:00
def getCredits(self):
raw_credits = getCredits(self.imdb)
2008-04-28 09:52:21 +00:00
credits = {}
def getNames(creditList):
return [stripTags(c[0]) for c in creditList]
credits['director'] = getNames(raw_credits['directors'])
credits['writer'] = getNames(raw_credits['writers'])
credits['producer'] = getNames(raw_credits['producers'])
credits['cast'] = [(stripTags(c[0]),stripTags(c[1])) for c in raw_credits['cast']]
2008-04-28 09:52:21 +00:00
self.credits = credits
return self.credits
def parsePlot(self):
data = getUrlUnicode(self.plotUrl)
soup = BeautifulSoup(data)
2008-04-28 09:52:21 +00:00
plot = soup('p', {'class':'plotpar'})
if plot:
plot = unicode(plot[0]).split('<i>')[0]
else:
plot = u''
plot = stripTags(plot).strip()
self.plot = plot
return plot
2008-04-28 09:52:21 +00:00
def parseEpisodes(self):
episodes = {}
data = getUrlUnicode(self.episodesUrl)
cdata = data.replace('\r\n', ' ')
2008-04-28 09:52:21 +00:00
regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
reg = re.compile(regexp, re.IGNORECASE)
m = reg.findall(cdata)
for match in m:
try:
episode = "S%02dE%02d" % (int(match[0]), int(match[1]))
episodes[episode] = {}
episodes[episode]['imdb'] = match[2]
episodes[episode]['title'] = match[3].strip()
if episodes[episode]['title'].startswith('Episode #%d'%int(match[0])):
episodes[episode]['title'] = u''
description = htmldecode(match[5])
description = stripTags(description.split('Next US airings:')[0])
episodes[episode]['description'] = description
episodes[episode]['date'] = ''
try:
d = stripTags(match[4])
d = d.replace('Original Air Date: ', '')
d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
episodes[episode]['date'] = d
except:
pass
except:
import traceback
print traceback.print_exc()
pass
self.episodes = episodes
return self.episodes
def parseLocations(self):
data = getUrlUnicode(self.locationUrl)
soup = BeautifulSoup(data)
2008-04-28 09:52:21 +00:00
locations = []
for key in soup('a', {'href': re.compile('^/List')}):
locations.append(htmldecode(key.string))
self.locations = locations
return self.locations
def parseKeywords(self):
data = getUrlUnicode(self.keywordUrl)
soup = BeautifulSoup(data)
2008-04-28 09:52:21 +00:00
keywords = []
for key in soup('a', {'href': re.compile('^/keyword/')}):
k = htmldecode(key.string)
k = k.replace(u'\xa0', ' ')
keywords.append(k)
self.keywords = keywords
return self.keywords
def getConnections(self):
return getUrlUnicode(self.connectionsUrl)
2008-04-28 09:52:21 +00:00
def parseConnections(self):
connections = {}
soup = BeautifulSoup(self.getConnections())
content = soup('div', {'id': 'tn15content'})[0]
blocks = str(content).split('<h5>')[1:]
for c in blocks:
connection = c.split('</h5>')[0]
cs = BeautifulSoup(c)
if connection:
#relation -> list of imdb ids
connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})]
return connections
def getReleaseinfo(self):
return getUrlUnicode(self.releaseinfoUrl)
2008-04-28 09:52:21 +00:00
def parseReleaseinfo(self):
soup = BeautifulSoup(self.getReleaseinfo())
info = soup('table',{'border': '0', 'cellpadding':'2'})
if info:
for row in info[0]('tr'):
d = row('td', {'align':'right'})
if d:
try:
possible_date = stripTags(str(d[0])).strip()
rdate = time.strptime(possible_date, "%d %B %Y")
rdate = time.strftime('%Y-%m-%d', rdate)
return rdate
except:
pass
return None
def getBusiness(self):
return getUrlUnicode(self.businessUrl)
2008-04-28 09:52:21 +00:00
def parseBusiness(self):
soup = BeautifulSoup(self.getBusiness())
business = {'budget': 0, 'gross': 0, 'profit': 0}
content = soup('div', {'id': 'tn15content'})[0]
blocks = str(content).split('<h5>')[1:]
for c in blocks:
cs = BeautifulSoup(c)
line = c.split('</h5>')
if line:
title = line[0]
line = line[1]
if title in ['Budget', 'Gross']:
values = re.compile('\$(.*?) ').findall(line)
values = [int(value.replace(',','')) for value in values]
if values:
business[title.lower()] = max(values)
if business['budget'] and business['gross']:
business['profit'] = business['gross'] - business['budget']
return business
def getExternalreviews(self):
return getUrlUnicode(self.externalreviewsUrl)
2008-04-28 09:52:21 +00:00
def parseExternalreviews(self):
soup = BeautifulSoup(self.getExternalreviews())
ol = soup('ol')
if ol:
ol = ol[0]
ret = {}
for li in ol('li'):
try:
a = li('a')[0]
href = a.get('href')
txt = a.contents[0]
ret[href] = txt
except:
pass
return ret
return {}
2008-04-28 09:52:21 +00:00
def guess(title, director=''):
#FIXME: proper file -> title
title = title.split('-')[0]
title = title.split('(')[0]
title = title.split('.')[0]
title = title.strip()
imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))
return_url = ''
#lest first try google
#i.e. site:imdb.com Michael Stevens Sin
if director:
search = 'site:imdb.com %s "%s"' % (director, title)
else:
search = 'site:imdb.com "%s"' % title
for (name, url, desc) in google.find(search, 2):
if url.startswith('http://www.imdb.com/title/tt'):
return url[28:35]
try:
req = urllib2.Request(imdb_url, None, oxutils.net.DEFAULT_HEADERS)
u = urllib2.urlopen(req)
data = u.read()
return_url = u.url
u.close()
except:
return None
if return_url.startswith('http://www.imdb.com/title/tt'):
return return_url[28:35]
if data:
imdb_id = findRegexp(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
2008-04-28 09:52:21 +00:00
if imdb_id:
return imdb_id
imdb_url = 'http://www.imdb.com/find?q=%s;s=tt;site=aka' % quote(title.encode('utf-8'))
req = urllib2.Request(imdb_url, None, oxutils.net.DEFAULT_HEADERS)
u = urllib2.urlopen(req)
data = u.read()
return_url = u.url
u.close()
if return_url.startswith('http://www.imdb.com/title/tt'):
return return_url[28:35]
return None
def getEpisodeData(title, episode, show_url = None):
'''
Collect information about an episode.
2008-04-28 09:52:21 +00:00
Returns dict with title, show, description and episode
'''
episodeData = {
'title': u'',
'show': title,
'description': u'',
'episode': episode,
}
description = u''
if not show_url:
imdbid = guess(title)
else:
imdbid = "%07d" % int(re.compile('title/tt(\d*)').findall(link)[0])
if imdbid:
i = IMDb(imdbid).parse()
episodeData['title'] = i['episodes'][episode]['title']
episodeData['description'] = i['episodes'][episode]['description']
episodeData['imdb'] = i['episodes'][episode]['imdb']
return episodeData
if __name__ == '__main__':
import sys
#print parse(sys.argv[1])
print "imdb:", guess(sys.argv[1])