# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
from oxutils import *
import urllib2
from urllib import quote, unquote
import re, time
import os
import time
from BeautifulSoup import BeautifulSoup
import chardet
import oxutils
from oxutils import stripTags, decodeHtml, findRe, findString
from oxutils.cache import getUrl, getUrlUnicode
from oxutils.normalize import normalizeTitle, normalizeImdbId
import google
_timer = -1
_timer_last = -1

def debugTime(message=''):
  '''Print message with the seconds elapsed since the last and the first call.

  The module globals _timer (time of the first call) and _timer_last (time
  of the most recent call) carry the state between calls; -1 means "not set".
  '''
  global _timer, _timer_last
  now = time.time()
  if _timer == -1:
    _timer = now
  if _timer_last == -1:
    _timer_last = now
  # One pre-formatted argument gives the same output with the Python 2 print
  # statement and the Python 3 print function.
  print("%s  since last: %0.2f total time: %0.2f" % (message, now - _timer_last, now - _timer))
  _timer_last = now
def getMovieId(title, director='', year=''):
  '''Search via google.find and return the 7-character IMDb id of the first
  hit that is an IMDb title page, or '' when there is none.

  title    -- movie title to search for
  director -- optional director name added to the query
  year     -- optional year, appended to the title as "title (year)"

  >>> getMovieId('The Matrix')
  '0133093'
  '''
  # NOTE(review): the prefix literal was empty in the reviewed copy, which made
  # startswith() accept every result; it is rebuilt from the 28-character
  # offset used by the original url[28:35] slice -- confirm.
  prefix = 'http://www.imdb.com/title/tt'
  if year:
    title = "%s (%s)" % (title, year)
  # NOTE(review): both query literals start with a blank, so a leading search
  # term may have been lost -- confirm against version control.
  if director:
    query = ' %s "%s"' % (director, title)
  else:
    query = ' "%s"' % title
  for (name, url, desc) in google.find(query, 3):
    if url.startswith(prefix):
      return url[len(prefix):len(prefix) + 7]
  return ''
def getMovieData(imdbId):
  '''Return the dict produced by IMDb(imdbId).parse().'''
  movie = IMDb(imdbId)
  return movie.parse()
# internal functions below
def getUrlBase(imdbId):
  '''Return the title page URL for imdbId, without a trailing slash.

  Callers append sub-pages such as "/fullcredits" to the result.
  '''
  # NOTE(review): the format string was empty in the reviewed copy, so the %
  # operator raised TypeError.  The literal is rebuilt from the 28-character
  # prefix that the url[28:35] slices in this module rely on -- confirm.
  return "http://www.imdb.com/title/tt%s" % imdbId
def getRawMovieData(imdbId):
  '''Collect the output of all per-page scrapers for one title in one dict.

  Starts from the getMovieInfo() dict and stores each helper's result under
  its own key; images and trailers are grouped under 'media'.
  '''
  imdbId = normalizeImdbId(imdbId)
  data = getMovieInfo(imdbId)
  # (key, helper) pairs; the helpers are called in exactly this order
  before_media = (
    ('credits', getMovieCredits),
    ('poster', getMoviePoster),
    ('company credits', getMovieCompanyCredits),
    ('filming locations', getMovieLocations),
    ('movie connections', getMovieConnections),
    ('external reviews', getMovieExternalReviews),
    ('trivia', getMovieTrivia),
    ('keywords', getMovieKeywords),
  )
  after_media = (
    ('plotsummary', getMoviePlot),
    ('release dates', getMovieReleaseDates),
    ('release date', getMovieReleaseDate),
  )
  for key, fetch in before_media:
    data[key] = fetch(imdbId)
  data['media'] = {
    'images': getMovieImages(imdbId),
    'trailers': getMovieTrailers(imdbId),
  }
  for key, fetch in after_media:
    data[key] = fetch(imdbId)
  return data
def getMovieInfo(imdbId):
  '''Scrape the main title page of imdbId and return the findings as a dict.

  The dict holds 'poster', one entry per "<h5>name:</h5>" section of the page
  (lower-cased section name as key; values containing "|" or ", " are split
  into lists, except for plot, trivia, filming locations and mpaa), plus
  'title', 'year', 'rating' and 'votes'.  'rating' and 'votes' are -1 when
  the page shows none.
  '''
  def cleanUp(k):
    # decode entities, turn non-breaking spaces into blanks and drop a
    # trailing "more" link label
    k = decodeHtml(k).replace(u'\xa0', ' ').strip()
    if k.endswith('more'):
      k = k[:-len('more')].strip()
    return k

  data = getUrlUnicode(getUrlBase(imdbId))
  info = dict()
  info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"')
  if info['poster'] and '_V' in info['poster']:
    # cut the size/crop suffix that follows "._V" in the image name
    info['poster'] = "%s.jpg" % info['poster'].split('._V')[0]

  sections = re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL)
  for i in sections.findall(data):
    title = stripTags(i[0]).strip().lower()
    txt = cleanUp(stripTags(i[1]).strip())
    if title not in ('plot', 'trivia', 'filming locations', 'mpaa'):
      if '|' in txt:
        txt = [cleanUp(k) for k in txt.split('|')]
      elif ', ' in txt:
        txt = [cleanUp(k) for k in txt.split(', ')]
    if not title.startswith('moviemeter'):
      info[title] = txt
  for key in ('user comments', 'writers (wga)'):
    if key in info:
      del info[key]
  if 'release date' in info:
    info['release date'] = info['release date'].split('\n')[0]
  if 'plot' in info:
    info['plot'] = info['plot'].split('| add synopsis')[0].strip()
    info['plot'] = info['plot'].split('| full synopsis')[0].strip()
    if info['plot'] in ('add synopsis', 'full synopsis'):
      info['plot'] = ''

  #get Title
  title = ''
  year = ''
  html_title = findRe(data, '<div id="tn15title">(.*?)</div>')
  if not html_title:
    html_title = findRe(data, '<title>(.*?)</title>')
  if html_title:
    # NOTE(review): the reviewed copy replaced ' ' with ' ' (a no-op);
    # collapsing the double blank left by the <br /> replacement is the
    # presumed intent -- confirm.
    html_title = html_title.replace('<br />', ' ').replace('  ', ' ')
    title = decodeHtml(html_title)
    title = stripTags(title)
    year = findRe(title, r'\((\d{4})\)')
    if not year:
      year = findRe(title, r'\((\d{4})')
    # remove "(1999)", "(1999/II)", "(????)" style markers from the title
    _y = findRe(title, r'(\([0-9\?]{4}[/IVXLCDM]*?\))')
    if _y:
      title = title.replace(_y, '')
    for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
      title = title.replace(t, '')
    title = title.strip()
    if title.find(u'\xa0') > -1:
      title = title[:title.find(u'\xa0')].strip()
    if title.startswith('"') and title.endswith('"'):
      title = title[1:-1]
  info['title'] = title
  info['year'] = year

  # Without the else branches both values were always overwritten with -1.
  rating = findRe(data, r'<b>([\d\.]*?)/10</b>')
  if rating:
    info['rating'] = float(rating)
  else:
    info['rating'] = -1
  votes = findRe(data, r'<small>\(<a href="ratings">(.*?) votes</a>\)</small>')
  if votes:
    info['votes'] = int(votes.replace(',', ''))
  else:
    info['votes'] = -1
  return info
def getMoviePoster(imdbId):
  '''Return the 'poster' entry of getMovieInfo(imdbId).'''
  return getMovieInfo(imdbId)['poster']
def getMovieYear(imdbId):
  '''Return the 'year' entry of getMovieInfo(imdbId).'''
  return getMovieInfo(imdbId)['year']
def getMovieTitle(imdbId):
  '''Return the 'title' entry of getMovieInfo(imdbId).'''
  return getMovieInfo(imdbId)['title']
def creditList(data, section=None):
  '''Extract the two-column credit rows from one section of the credits page.

  data    -- HTML of a single credits section
  section -- section name; 'cast' selects the cast table layout and
             'writers' additionally strips "<br>", brackets and a trailing
             " and" from the second column

  Returns a list of [first column, second column] lists.
  '''
  # Without the else branch the cast pattern was always overwritten.
  if section == 'cast':
    pattern = '''<tr .*?<td class="nm">(.*?)</td><td class="ddd">.*?</td><td class="char">(.*?)</td></tr>'''
  else:
    pattern = '''<tr>.*?<td valign="top">(.*?)</td><td.*?</td><td valign="top">(.*?)</td></tr>'''
  credits = []
  for c_ in re.compile(pattern).findall(data):
    c = [decodeHtml(c_[0]).strip(), decodeHtml(c_[1]).strip()]
    if section == 'writers':
      c[1] = c[1].replace('<br>', '').strip().replace(')', '').replace('(', '')
      if c[1].endswith(' and'):
        c[1] = c[1][:-4]
    # rows were built but never collected, so the result was always empty
    credits.append(c)
  return credits
def getMovieCredits(imdbId):
  '''Return the full credits page as a dict: section name -> creditList().

  The page is cut at every "<h5>"; a chunk counts as a section when it
  carries a name="..." attribute followed by a /Glossary link.
  '''
  page = getUrlUnicode("%s/fullcredits" % getUrlBase(imdbId))
  section_re = re.compile('''name="(.*?)".*? href="/Glossary''')
  credits = dict()
  for chunk in page.split('<h5>'):
    names = section_re.findall(chunk)
    if not names:
      continue
    credits[names[0]] = creditList(chunk, names[0])
  return credits
# getMovieTrailers: collect the links found in the first "video-gallery" div
# of a title's "/trailers" sub-page.
# Returns a list of dicts with the keys 'title', 'url', 'iframe' and 'flv';
# the list is empty when the page has no such div.
# NOTE(review): this block reached review without indentation, and the string
# literals on the `url = '' + ...` and `iframeUrl = "" % ...` lines are empty
# (the latter would raise TypeError for a string videoId).  Presumably a site
# prefix and a player URL template were stripped -- restore them from version
# control before changing any logic here.
def getMovieTrailers(imdbId):
url = "%s/trailers" % getUrlBase(imdbId)
data = getUrlUnicode(url)
soup = BeautifulSoup(data)
videos = soup('div', {'class':"video-gallery"})
trailers = []
if videos:
# every anchor inside the first gallery div is treated as one trailer
for a in videos[0]('a'):
# link text with markup removed
title = stripTags(unicode(a)).strip()
url = '' + a['href']
# the path component that starts with "vi" followed by digits
videoId = findRe(url, '/(vi\d*?)/')
iframeUrl = "" % videoId
iframe = getUrlUnicode(iframeUrl)
# the "file" variable of the fetched page, URL-unquoted; stored under 'flv'
videoUrl = unquote(findRe(iframe, 'addVariable\("file", "(.*?)"'))
trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
return trailers
def getMovieQuotes(imdbId):
  '''Return the quotes page as a list of (bold text, following text) pairs.

  Each pair is the text inside "<b>...</b>" and the text between the colon
  after it and the next "<br>", both stripped of surrounding whitespace.
  '''
  page = getUrlUnicode("%s/quotes" % getUrlBase(imdbId))
  quote_re = re.compile('<b>(.*?)</b>:(.*?)<br>', re.DOTALL)
  found = quote_re.findall(findString(page, '<a name="q'))
  return [(who.strip(), what.strip()) for who, what in found]
def getMoviePlot(imdbId):
  '''Return the text of the first "plotpar" paragraph of the plot summary
  page, up to the first "<i>" tag.'''
  page = getUrlUnicode("%s/plotsummary" % getUrlBase(imdbId))
  return findRe(page, '<p class="plotpar">(.*?)<i>')
def getMovieTechnical(imdbId):
  '''Return the technical page as a dict: "<h5>" heading -> text up to the
  next "<br/>", both stripped of surrounding whitespace.'''
  page = getUrlUnicode("%s/technical" % getUrlBase(imdbId))
  entry_re = re.compile('<h5>(.*?)</h5>(.*?)<br/>', re.DOTALL)
  return dict((heading.strip(), text.strip()) for heading, text in entry_re.findall(page))
def getMovieCompanyCredits(imdbId):
  '''Return the company credits page as a dict: "<h2>" heading -> list of
  the raw "<li>" item contents found in the list below that heading.'''
  url = "%s/companycredits" % getUrlBase(imdbId)
  data = getUrlUnicode(url)
  results = {}
  item_re = re.compile('<li>(.*?)</li>')
  for field, c in re.compile('<h2>(.*?)</h2><ul>(.*?)</ul>').findall(data):
    # The inner loop had no body in the reviewed copy, so every list stayed
    # empty; the items are stored unmodified (presumed original behaviour --
    # confirm whether markup should be stripped).
    results[field.strip()] = item_re.findall(c)
  return results
def getMovieLocations(imdbId):
  '''Return the text of every link on the locations page whose href starts
  with "/List".'''
  url = "%s/locations" % getUrlBase(imdbId)
  data = getUrlUnicode(url)
  soup = BeautifulSoup(data)
  locations = []
  for key in soup('a', {'href': re.compile('^/List')}):
    # NOTE(review): the loop body was missing in the reviewed copy.  The link
    # text is taken with the stripTags(unicode(tag)) idiom used for anchors
    # elsewhere in this module -- confirm against version control.
    locations.append(decodeHtml(stripTags(unicode(key))).strip())
  return locations
def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
  '''Return the media index images per category.

  keys -- media index categories to fetch (passed as the "refine" parameter)

  Returns a dict: category -> {image url: alt text}.  Image URLs are cut
  before "._V" and given a plain ".jpg" ending.  For 'still_frame', images
  whose source URL contains "_CR0" are left out.
  '''
  photos = {}
  # NOTE(review): the src group read "(*?.jpg)" in the reviewed copy, which is
  # not a valid pattern; a host prefix may have been lost in front of it.
  image_re = re.compile(r'''<img alt="(.*?)".*?src="(.*?\.jpg)''')
  for key in keys:
    url = "%s/mediaindex?refine=%s" % (getUrlBase(imdbId), key)
    data = getUrlUnicode(url)
    photos[key] = {}
    for s in image_re.findall(data):
      # Without the else branch the still_frame filter had no effect.
      if key == 'still_frame' and "_CR0" in s[1]:
        continue
      img = "%s.jpg" % s[1].split('._V')[0]
      photos[key][img] = s[0]
  return photos
def getMovieStills(imdbId):
  '''Return only the 'still_frame' images of getMovieImages().'''
  category = 'still_frame'
  images = getMovieImages(imdbId, [category])
  return images[category]
def getMoviePosters(imdbId):
  '''Return the 'poster' images of getMovieImages(); the poster from the
  title page, if any, is added with the label 'main poster'.'''
  posters = getMovieImages(imdbId, ['poster'])['poster']
  main = getMoviePoster(imdbId)
  if not main:
    return posters
  posters[main] = 'main poster'
  return posters
def getMovieTrivia(imdbId):
  '''Return the items of every <ul class="trivia"> list on the trivia page
  as a list of HTML strings with "<br />" removed.'''
  url = "%s/trivia" % getUrlBase(imdbId)
  data = getUrlUnicode(url)
  soup = BeautifulSoup(data)
  trivia = []
  for i in soup('ul', {'class': "trivia"}):
    for t in i('li'):
      t = unicode(t).replace('<br />', '').strip()
      # drop the enclosing plain <li>...</li> wrapper
      if t.startswith('<li>') and t.endswith('</li>'):
        t = t[4:-5].strip()
      # NOTE(review): the append was missing in the reviewed copy, so the
      # result was always empty.  It may originally have sat inside the if
      # above -- confirm.
      trivia.append(t)
  return trivia
def getMovieConnections(imdbId):
  '''Return the movie connections page as a dict: "<h5>" heading -> list of
  the 7-digit ids of the titles linked in that section.'''
  url = "%s/movieconnections" % getUrlBase(imdbId)
  data = getUrl(url)
  # the dict was never created in the reviewed copy (NameError on first use)
  connections = {}
  link_re = re.compile(r'''<a href="/title/tt(\d{7})/">''')
  for c in re.compile('''<h5>(.*?)</h5>(.*?)\n\n''', re.DOTALL).findall(data):
    connections[unicode(c[0])] = link_re.findall(c[1])
  return connections
def getMovieKeywords(imdbId):
  '''Return the text of all "/keyword" links on the keywords page, with
  entities decoded and non-breaking spaces turned into blanks.'''
  url = "%s/keywords" % getUrlBase(imdbId)
  data = getUrlUnicode(url)
  keywords = []
  for keyword in re.compile('''<a.*?href="/keyword.*?>(.*?)</a>''').findall(data):
    keyword = decodeHtml(keyword)
    keyword = keyword.replace(u'\xa0', ' ')
    # cleaned keywords were never collected, so the result was always empty
    keywords.append(keyword)
  return keywords
def getMovieExternalReviews(imdbId):
  '''Return the external reviews page as a dict: link href -> first content
  node of the link, taken from the first <ol> on the page.

  Returns {} when the page has no <ol>.
  '''
  page = getUrlUnicode("%s/externalreviews" % getUrlBase(imdbId))
  ordered_lists = BeautifulSoup(page)('ol')
  if not ordered_lists:
    return {}
  reviews = {}
  for item in ordered_lists[0]('li'):
    link = item('a')[0]
    target = link.get('href')
    label = link.contents[0]
    reviews[target] = label
  return reviews
def getMovieReleaseDate(imdbId):
  '''Return the smallest second field among the getMovieReleaseDates()
  entries, or '' when there are none.

  An empty current value is always replaced by the next entry's field.
  '''
  earliest = ''
  for entry in getMovieReleaseDates(imdbId):
    candidate = entry[1]
    if earliest and not candidate < earliest:
      continue
    earliest = candidate
  return earliest
def getMovieReleaseDates(imdbId):
  '''Return the rows of the release info page as a list of 3-tuples.

  Each tuple is built from the three table cells matched per row; the second
  one is converted from "%d %B %Y" to "YYYY-MM-DD" when it parses and is
  kept unchanged otherwise.
  '''
  url = "%s/releaseinfo" % getUrlBase(imdbId)
  data = getUrlUnicode(url)
  releasedates = []
  regexp = '''<tr><td>(.*?)</td>.*?<td align="right">(.*?)</td>.*?<td>(.*?)</td></tr>'''

  def _parse_date(d):
    # fall back to the raw text for partial dates such as "March 1999"
    try:
      return time.strftime('%Y-%m-%d', time.strptime(d, "%d %B %Y"))
    except ValueError:
      return d

  for r in re.compile(regexp, re.DOTALL).findall(data):
    # NOTE(review): the tuple was cut off after its first element in the
    # reviewed copy.  The second element must be the date, because
    # getMovieReleaseDate compares r[1]; the handling of the third cell is
    # reconstructed -- confirm against version control.
    r_ = (stripTags(r[0]).strip(),
          _parse_date(stripTags(r[1]).strip()),
          decodeHtml(stripTags(r[2]).strip()))
    releasedates.append(r_)
  # The BeautifulSoup based code that used to follow this return statement
  # was unreachable and has been dropped.
  return releasedates
def getMovieBusinessSum(imdbId):
  '''Return {'budget', 'gross', 'profit'} totals from getMovieBusiness().

  'gross' is the sum of the 'gross' and 'weekend gross' entries; 'profit'
  is gross minus budget and is only filled in when both are non-zero.
  '''
  business = getMovieBusiness(imdbId)

  def total(key):
    # sum of the integer values found in every entry of one section
    return sum([int(intValue(i.replace(',', ''))) for i in business[key]])

  b_ = {'budget': 0, 'gross': 0, 'profit': 0}
  if 'budget' in business:
    b_['budget'] = total('budget')
  if 'gross' in business:
    b_['gross'] = total('gross')
  if 'weekend gross' in business:
    b_['gross'] += total('weekend gross')
  if b_['budget'] and b_['gross']:
    b_['profit'] = b_['gross'] - b_['budget']
  return b_
def getMovieFlimingDates(imdbId):
  '''Return the first 'filming dates' entry of getMovieBusiness(), or ''
  when that section is missing or empty.'''
  dates = getMovieBusiness(imdbId).get('filming dates')
  if dates:
    return dates[0]
  return ''
def getMovieBusiness(imdbId):
  '''Return the business page as a dict: lower-cased "<h5>" heading -> list
  of the section's "<br/>"-separated lines with tags removed and entities
  decoded.'''
  url = "%s/business" % getUrlBase(imdbId)
  data = getUrlUnicode(url)
  business = {}
  for r in re.compile('''<h5>(.*?)</h5>(.*?)<br/>.<br/>''', re.DOTALL).findall(data):
    key = stripTags(r[0]).strip().lower()
    value = [decodeHtml(stripTags(b).strip()) for b in r[1].split('<br/>')]
    business[key] = value
  # An older BeautifulSoup based implementation used to follow this return
  # statement; it could never run and has been removed.
  return business
def getMovieEpisodes(imdbId):
  '''Return the episodes page as a dict keyed by "SxxEyy".

  Every value is a dict with 'imdb' (id of the episode), 'title' ('' for
  generic "Episode #<season>..." titles), 'description' and 'date'
  ("YYYY-MM-DD", or '' when the air date cannot be parsed).
  '''
  import traceback
  url = "%s/episodes" % getUrlBase(imdbId)
  data = getUrlUnicode(url)
  episodes = {}
  regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
  for r in re.compile(regexp, re.DOTALL).findall(data):
    season = int(r[0])
    episode = "S%02dE%02d" % (season, int(r[1]))
    entry = episodes[episode] = {}
    entry['imdb'] = r[2]
    entry['title'] = r[3].strip()
    if entry['title'].startswith('Episode #%d' % season):
      entry['title'] = u''
    description = decodeHtml(r[5])
    description = stripTags(description.split('Next US airings:')[0])
    entry['description'] = description.strip()
    entry['date'] = ''
    # NOTE(review): the try/except lines were missing in the reviewed copy;
    # they are placed around the date parsing because 'date' is preset to ''
    # right before it -- confirm.  The traceback is still printed, without
    # the stray "None" that `print traceback.print_exc()` added.
    try:
      d = stripTags(r[4])
      d = d.replace('Original Air Date: ', '')
      entry['date'] = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
    except ValueError:
      traceback.print_exc()
  return episodes
'''the old code below'''
class IMDb:
  '''Older parser object for a single title.

  parse() returns a dict with poster, year, title, rating, votes, runtime,
  language, genre, country, tagline, plot_outline, title_english,
  episode_of, episodes, tvshow, credits, plot, keywords, trivia,
  connections, locations, release_date, business, reviews and stills.

  NOTE(review): in the reviewed copy the constructor body, the id argument
  of every helper call and several else branches were missing.  They are
  restored from the surrounding code; the attribute name `imdb` and the page
  URL literal are reconstructions -- confirm against version control.
  '''
  def __init__(self, imdbId):
    self.imdb = imdbId
    self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb

  def getPage(self):
    '''Return the title page fetched through getUrlUnicode.'''
    return getUrlUnicode(self.pageUrl)

  def parse_raw_value(self, key, value):
    '''Convert the raw HTML of one info block into the value stored by parse().

    key   -- normalised block name, e.g. 'runtime' or 'also_known_as'
    value -- HTML that follows the block's </h5>

    Returns seconds (int) for 'runtime', lists for 'country', 'language' and
    'genre', strings for the other known keys; an unknown key is printed and
    returned unchanged.
    '''
    if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
      value = stripTags(value).strip()
    if key == 'runtime':
      # minutes are converted to seconds; a value given in seconds is kept
      parsed_value = findRe(value, '(.*?) min')
      parsed_value = findRe(parsed_value, '([0-9]+)')
      if not parsed_value:
        parsed_value = findRe(value, '(.*?) sec')
        parsed_value = findRe(parsed_value, '([0-9]+)')
        if not parsed_value:
          parsed_value = 0
        else:
          parsed_value = int(parsed_value)
      else:
        parsed_value = int(parsed_value) * 60
    elif key in ('country', 'language'):
      parsed_value = value.split(' / ')
      if len(parsed_value) == 1:
        parsed_value = parsed_value[0].split(' | ')
      parsed_value = [v.strip() for v in parsed_value]
    elif key == 'genre':
      parsed_value = value.replace('more', '').strip().split(' / ')
      if len(parsed_value) == 1:
        parsed_value = parsed_value[0].split(' | ')
      parsed_value = [v.strip() for v in parsed_value]
    elif key == 'tagline':
      parsed_value = value.replace('more', '').strip()
    elif key == 'plot_outline':
      parsed_value = value.replace('(view trailer)', '').strip()
      if parsed_value.endswith('more'):
        parsed_value = parsed_value[:-4].strip()
    elif key == 'tv_series':
      # id of the series this episode belongs to
      m = re.compile('<a href="/title/tt(.*?)/">(.*?)</a>').findall(value)
      if m:
        parsed_value = m[0][0]
      else:
        parsed_value = ''
    elif key == 'also_known_as':
      # prefer the international English title, then the USA title
      parsed_value = ''
      m = re.compile(r'(.*) \(International: English title').findall(value)
      if m:
        parsed_value = m[0]
      else:
        m = re.compile(r'(.*) \(USA').findall(value)
        if m:
          parsed_value = m[0]
      parsed_value = parsed_value.split('<br />')[-1].split('(')[0]
      # drop a leading "<director>'s" from the title
      director = self.getCredits().get('director', None)
      if director:
        director = director[0]
        parsed_value = parsed_value.replace(director, '')
        if parsed_value.startswith("'s"):
          parsed_value = parsed_value[2:].strip()
      parsed_value = decodeHtml(parsed_value.strip())
    else:
      print(value)
      parsed_value = value
    return parsed_value

  def parseTitle(self):
    '''Return the normalised title.

    A title of the form '"Show" rest' is treated as an episode: it becomes
    'Show (SxxEyy) rest' when the page names season and episode, and
    'Show:rest' (digits, "?" and "-" removed from rest) otherwise.
    '''
    title = getMovieTitle(self.imdb)
    title = normalizeTitle(title)
    if title.startswith('"') and title.find('"',1) > 0 and \
        title.find('"',1) == title.rfind('"'):
      data = self.getPage()
      se = re.compile(r"Season (\d*), Episode (\d*)\)").findall(data)
      if se:
        se = se[0]
        se = ' (S%02dE%02d) ' % (int(se[0]), int(se[1]))
        title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:].strip()
      else:
        part2 = title[title.rfind('"')+1:]
        part2 = re.sub(r"[\d\?-]", "", part2).strip()
        title = normalizeTitle(title[1:title.rfind('"')])
        if part2:
          title += ':' + part2
    return normalizeTitle(title)

  def parseYear(self):
    '''Return the four-digit year found in brackets in the page title, or ''.'''
    year = ''
    data = self.getPage()
    soup = BeautifulSoup(data)
    html_title = soup('div', {'id': 'tn15title'})
    if not html_title:
      html_title = soup('title')
    if html_title:
      html_title = unicode(html_title[0])
      html_title = stripTags(html_title)
      year = re.compile(r'\((\d{4})\)').findall(html_title)
      if not year:
        # years with a suffix such as "(1999/II)"
        year = re.compile(r'\((\d{4})/').findall(html_title)
      if year:
        year = year[0]
      else:
        year = ''
    return year

  def parse(self):
    '''Fetch and parse everything for this title; return the result dict.

    The dict is also kept as self.IMDbDict.  For an episode, empty 'country'
    and 'language' values are filled in from the parent series.
    '''
    data = self.getPage()
    IMDbDict ={}
    IMDbDict['poster'] = getMoviePoster(self.imdb)
    if not IMDbDict['poster']:
      IMDbDict['poster'] = ''
    #Title, Year
    IMDbDict['year'] = self.parseYear()
    IMDbDict['title'] = self.parseTitle()

    # rating is stored as an integer: the "x.y/10" value times 1000
    m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
    if m:
      IMDbDict['rating'] = int(float(m.group(1)) * 1000)
    else:
      IMDbDict['rating'] = -1
    m = re.compile(r'<small>\(<a href="ratings">(.*?) votes</a>\)</small>', re.IGNORECASE).findall(data)
    if m:
      IMDbDict['votes'] = int(m[0].replace(',', ''))
    else:
      IMDbDict['votes'] = -1

    data = data.replace('\n',' ')
    #some values
    keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
    for key in keys:
      IMDbDict[key] = ''
    IMDbDict['runtime'] = 0
    soup = BeautifulSoup(data)
    for info in soup('div', {'class': 'info'}):
      key = unicode(info).split('</h5>')[0].split('<h5>')
      if len(key) > 1:
        raw_value = unicode(info).split('</h5>')[1]
        # "Plot Outline:" -> "plot_outline"
        key = key[1][:-1].lower().replace(' ', '_')
        if key in keys:
          IMDbDict[key] = self.parse_raw_value(key, raw_value)
    IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title'])
    #is episode
    IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')

    IMDbDict['episodes'] = getMovieEpisodes(self.imdb)
    if IMDbDict['episodes']:
      IMDbDict['tvshow'] = True
    else:
      IMDbDict['tvshow'] = False
    IMDbDict['credits'] = self.getCredits()
    IMDbDict['plot'] = getMoviePlot(self.imdb)
    IMDbDict['keywords'] = getMovieKeywords(self.imdb)
    IMDbDict['trivia'] = getMovieTrivia(self.imdb)
    IMDbDict['connections'] = getMovieConnections(self.imdb)
    IMDbDict['locations'] = getMovieLocations(self.imdb)
    IMDbDict['release_date'] = getMovieReleaseDate(self.imdb)
    IMDbDict['business'] = getMovieBusinessSum(self.imdb)
    IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
    IMDbDict['stills'] = getMovieStills(self.imdb)
    #IMDbDict['trailer'] = getMovieTrailer(self.imdb)
    self.IMDbDict = IMDbDict

    if IMDbDict['episode_of']:
      episode_of =IMDb(IMDbDict['episode_of']).parse()
      for key in ('country', 'language'):
        if not IMDbDict[key]:
          IMDbDict[key] = episode_of[key]
    return self.IMDbDict

  def getCredits(self):
    '''Return director, writer and producer name lists and the cast as
    (name, character) tuples, with tags removed; also kept as self.credits.'''
    raw_credits = getMovieCredits(self.imdb)
    credits = {}

    def getNames(creditList):
      return [stripTags(decodeHtml(c[0])) for c in creditList]

    credits['director'] = getNames(raw_credits.get('directors', ''))
    credits['writer'] = getNames(raw_credits.get('writers', ''))
    credits['producer'] = getNames(raw_credits.get('producers', ''))
    credits['cast'] = [(stripTags(decodeHtml(c[0])),stripTags(decodeHtml(c[1]))) for c in raw_credits.get('cast', [])]
    self.credits = credits
    return self.credits
# guess: try to find the 7-character id for a free-form title string,
# optionally helped by a director name.  Returns the id, or None when no
# lookup produced one.
# NOTE(review): this block reached review without indentation and with
# several pieces missing: the URL literals are empty or cut (`'' % ...`,
# `';s=tt;site=aka' % ...`, `startswith('')`), both urllib2.Request(...)
# calls stop after their second argument, both `data =` assignments have no
# right-hand side, and the lone `return None` after the first request looks
# like the body of a lost exception handler.  Restore the block from version
# control before changing any logic here.
def guess(title, director=''):
#FIXME: proper file -> title
# keep only the text before the first '-', '(' and '.', then trim whitespace
title = title.split('-')[0]
title = title.split('(')[0]
title = title.split('.')[0]
title = title.strip()
imdb_url = '' % quote(title.encode('utf-8'))
return_url = ''
#let's first try google
#i.e. Michael Stevens Sin
if director:
search = ' %s "%s"' % (director, title)
search = ' "%s"' % title
# each hit is a (name, url, desc) tuple; the second argument of google.find
# is presumably the number of results to ask for -- verify
for (name, url, desc) in google.find(search, 2):
if url.startswith(''):
# 7 characters starting at offset 28 -- presumably the id digits that follow
# a fixed-length title URL prefix; confirm once the literal is restored
return url[28:35]
# first direct lookup: the URL the request ends up at is inspected before the
# page body is searched
req = urllib2.Request(imdb_url, None,
u = urllib2.urlopen(req)
data =
return_url = u.url
return None
if return_url.startswith(''):
return return_url[28:35]
if data:
# otherwise take the first title id listed after "Popular Results"
imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
if imdb_id:
return imdb_id
# second lookup, with extra ";s=tt;site=aka" parameters; only the final URL
# of the request is inspected
imdb_url = ';s=tt;site=aka' % quote(title.encode('utf-8'))
req = urllib2.Request(imdb_url, None,
u = urllib2.urlopen(req)
data =
return_url = u.url
if return_url.startswith(''):
return return_url[28:35]
return None
def getEpisodeData(title, episode, show_url = None):
  '''
    Collect information about an episode.

    title    -- name of the show; used to guess the id when show_url is not given
    episode  -- episode key as used by IMDb.parse()['episodes'], e.g. "S01E02"
    show_url -- optional URL containing "title/tt<digits>" for the show

    Returns dict with title, show, description and episode, plus imdb when
    the episode was found.
  '''
  episodeData = {
    'title': u'',
    'show': title,
    'description': u'',
    'episode': episode,
  }
  if not show_url:
    imdbid = guess(title)
  else:
    # The id was previously read from an undefined name (`link`), which raised
    # NameError whenever show_url was given.
    match = re.search(r'title/tt(\d+)', show_url)
    if match:
      imdbid = "%07d" % int(match.group(1))
    else:
      imdbid = None
  if imdbid:
    episodes = IMDb(imdbid).parse()['episodes']
    # an unknown episode leaves the defaults in place instead of raising KeyError
    found = episodes.get(episode)
    if found:
      episodeData['title'] = found['title']
      episodeData['description'] = found['description']
      episodeData['imdb'] = found['imdb']
  return episodeData
if __name__ == '__main__':
  # command line use: print the id guessed for the title given as first argument
  import sys
  if len(sys.argv) < 2:
    # previously an IndexError traceback
    sys.stderr.write("usage: %s <title>\n" % sys.argv[0])
    sys.exit(1)
  # a single pre-formatted argument prints identically on Python 2 and 3
  print("imdb: %s" % guess(sys.argv[1]))