# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
from oxutils import *
import urllib2
from urllib import quote
import re, time
import os
import time
from BeautifulSoup import BeautifulSoup
import chardet
import oxutils
from oxutils import stripTags, htmldecode, findRegexp
from oxutils.cache import getUrl, getUrlUnicode
from oxutils.normalize import normalizeTitle, normalizeImdbId
import google
def getMovieId(title, director='', year=''):
  """
  Find the IMDb id of a movie with a site-restricted Google search.

  title is the movie title; director and year are optional and, when
  given, are added to the query. Returns the seven characters that
  follow "tt" in the first result URL pointing at an IMDb title page,
  or None when none of the results does.
  """
  titlePrefix = 'http://www.imdb.com/title/tt'
  quoted = title
  if year:
    quoted = "%s (%s)" % (quoted, year)
  if director:
    query = 'site:imdb.com %s "%s"' % (director, quoted)
  else:
    query = 'site:imdb.com "%s"' % quoted
  for result in google.find(query, 3):
    (name, url, desc) = result
    if url.startswith(titlePrefix):
      # the id is the 7 characters right after the 28 character prefix
      return url[len(titlePrefix):len(titlePrefix) + 7]
  return None
def getMovieData(imdbId):
  """Return the result of IMDb(imdbId).parse() for the given id."""
  movie = IMDb(imdbId)
  return movie.parse()
# internal functions below
def getUrlBase(imdbId):
  """Return the title page URL for imdbId, without a trailing slash."""
  return "http://www.imdb.com/title/tt" + "%s" % imdbId
# Start the raw data dictionary for one title: the id is normalized and the
# 'credits' entry is filled from the module-level parseCredits().
# NOTE(review): from the 'poster' assignment onward this block looks damaged:
# findRegexp() is handed the dict `data` itself, `html_title` is never
# assigned in this block, and the function returns `title` rather than
# `data`. Presumably the body of a separate title helper got fused in --
# confirm against version control before relying on this function.
def getRawMovieData(imdbId):
imdbId = normalizeImdbId(imdbId)
data = dict()
data['credits'] = parseCredits(imdbId)
data['poster'] = findRegexp(data, 'name="poster".*?', ' ').replace(' ', ' ')
title = htmldecode(html_title)
title = stripTags(title)
# remove a "(dddd)" or "(dddd/I...)" four-digit marker from the title
title = re.sub('\(\d\d\d\d\)', '', title)
title = re.sub('\(\d\d\d\d/I*\)', '', title)
# remove the listed type markers
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
title = title.replace(t, '')
title = title.strip()
# cut the title at the first non-breaking space (u'\xa0')
if title.find(u'\xa0') > -1:
title = title[:title.find(u'\xa0')]
# strip one pair of enclosing double quotes
if title.startswith('"') and title.endswith('"'):
title = title[1:-1]
return title
# Extract the credit entries from the text of one credits section.
# data is the section text, section its name; the 'cast' section uses a
# different pattern than all other sections.
# Returns a list of two-element lists holding the first two groups of every
# match, each stripped of surrounding whitespace.
# NOTE(review): both pattern literals are bare groups separated by ' | ',
# which looks like the markup they matched was stripped from the source, and
# the first replace() argument in the 'writers' branch is a quoted literal
# split across two lines (not valid Python). Verify against the original.
def creditList(data, section=None):
if section == 'cast':
credits_ = re.compile('''
(.*?).*? | (.*?) |
''').findall(data)
else:
credits_ = re.compile('''.*?(.*?) | (.*?) |
''').findall(data)
credits = []
for c_ in credits_:
c = [c_[0].strip(), c_[1].strip()]
# for writers the second field loses its parentheses and a trailing ' and'
if section=='writers':
c[1] = c[1].replace('
', '').strip().replace(')', '').replace('(','')
if c[1].endswith(' and'): c[1] = c[1][:-4]
credits.append(c)
return credits
# Fetch the "fullcredits" page of a title and parse it section by section.
# Returns a dict that maps a section name -- the name="..." value that
# precedes an href="/Glossary link in a chunk -- to the list returned by
# creditList() for that chunk. Only sections found on the page get a key.
# NOTE(review): data.split('') uses an empty separator, which makes
# str.split raise ValueError; the delimiter literal has presumably been
# lost -- confirm against the original source.
def parseCredits(imdbId):
credits = dict()
url = "%s/fullcredits" % getUrlBase(imdbId)
data = getUrlUnicode(url)
groups = data.split('')
for g in groups:
section = re.compile('''name="(.*?)".*? href="/Glossary''').findall(g)
if section:
credits[section[0]] = creditList(g, section[0])
return credits
'''the old code below'''
def get_image(url):
  """Fetch url with getUrl and return whatever getUrl returns."""
  image = getUrl(url)
  return image
def _castList(data, regexp):
soup = re.compile(regexp).findall(data)
if soup:
soup = BeautifulSoup(soup[0])
names = []
for i in soup('a', {'href': re.compile('/name/nm')}):
if i.string:
cast = stripTags(i.string)
if cast not in names:
names.append(cast)
return names
return []
class IMDb:
def __init__(self, imdbId):
self.imdb = imdbId
self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
self.businessUrl = "%sbusiness" % self.pageUrl
self.connectionsUrl = "%smovieconnections" % self.pageUrl
self.creditsUrl = "%sfullcredits" % self.pageUrl
self.episodesUrl = "%sepisodes" % self.pageUrl
self.keywordUrl = "%skeywords" % self.pageUrl
self.plotUrl = "%splotsummary" % self.pageUrl
self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
self.triviaUrl = "%strivia" % self.pageUrl
self.locationUrl = "%slocations" % self.pageUrl
self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
self.trailerUrl = "%strailers" % self.pageUrl
def getPage(self):
  """Fetch self.pageUrl with getUrlUnicode and return the result."""
  url = self.pageUrl
  return getUrlUnicode(url)
# Convert the raw content of one info field into its parsed value.
# key selects the conversion, value is the raw field content.
# Returns an int for 'runtime' (seconds), a list for 'country', 'language'
# and 'genre', and a string otherwise; an unknown key is printed and its
# value returned unchanged.
def parse_raw_value(self, key, value):
# these fields are decoded from UTF-8 and stripped of tags before parsing
if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
value = unicode(value, 'utf-8')
value = stripTags(value).strip()
if key == 'runtime':
# first number in front of " min", converted to seconds; otherwise the
# first number in front of " sec"; 0 when neither is present
parsed_value = findRegexp(value, '(.*?) min')
parsed_value = findRegexp(parsed_value, '([0-9]+)')
if not parsed_value:
parsed_value = findRegexp(value, '(.*?) sec')
parsed_value = findRegexp(parsed_value, '([0-9]+)')
if not parsed_value:
parsed_value = 0
else:
parsed_value = int(parsed_value)
else:
parsed_value = int(parsed_value) * 60
elif key in ('country', 'language'):
parsed_value = value.split(' / ')
elif key == 'genre':
parsed_value = value.replace('more', '').strip().split(' / ')
elif key == 'tagline':
parsed_value = value.replace('more', '').strip()
elif key == 'plot_outline':
parsed_value = value.replace('(view trailer)', '').strip()
if parsed_value.endswith('more'):
parsed_value = parsed_value[:-4].strip()
elif key == 'tv_series':
# NOTE(review): the pattern '(.*?)' has a single group and matches the
# empty string, yet the result is indexed as m[0][0]; the literal looks
# truncated -- confirm against the original source.
m = re.compile('(.*?)').findall(value)
if m:
parsed_value = m[0][0]
else:
parsed_value = ''
elif key == 'also_known_as':
# prefer the "International: English title" entry, else the "USA" entry
parsed_value = ''
m = re.compile('(.*) \(International: English title').findall(value)
if m:
parsed_value = m[0]
else:
m = re.compile('(.*) \(USA').findall(value)
if m:
parsed_value = m[0]
# NOTE(review): the split() separator below is a quoted literal broken
# across two lines (not valid Python); presumably stripped markup.
parsed_value = parsed_value.split('
')[-1].split('(')[0]
# remove the first director's name and a leading "'s" from the title
director = self.parseCredits().get('director', None)
if director:
director = director[0]
parsed_value = parsed_value.replace(director, '')
if parsed_value.startswith("'s"):
parsed_value = parsed_value[2:].strip()
parsed_value = parsed_value.strip()
else:
# unknown key: Python 2 print statement, value is passed through as is
print value
parsed_value = value
return parsed_value
# Extract the normalized title from the main page.
# The text comes from the <div id="tn15title"> element, falling back to
# <title>; four-digit year markers and the listed type markers are removed
# and the text is cut at the first non-breaking space.
# Returns '' (run through normalizeTitle) when neither element is found.
def parseTitle(self):
title = ''
data = self.getPage()
soup = BeautifulSoup(data)
html_title = soup('div', {'id': 'tn15title'})
if not html_title:
html_title = soup('title')
if html_title:
html_title = str(html_title[0])
# NOTE(review): the first replace() argument below is a quoted literal
# broken across two lines (not valid Python); presumably stripped markup.
html_title = html_title.replace('
', ' ').replace(' ', ' ')
title = stripTags(html_title)
title = re.sub('\(\d{4}\)', '', title)
title = re.sub('\(\d{4}/I*\)', '', title)
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
title = title.replace(t, '')
if title.find(u'\xa0') > -1:
title = title[:title.find(u'\xa0')]
title = normalizeTitle(title.strip())
# fully quoted title: drop the quotes
if title.startswith('"') and title.endswith('"'):
title = normalizeTitle(title[1:-1])
# exactly one quoted part at the start followed by more text: the quoted
# part is joined with the rest by " (SxxEyy)" when the page contains
# "Season N, Episode M)", otherwise by ":"
elif title.startswith('"') and title.find('"',1) > 0 and \
title.find('"',1) == title.rfind('"'):
se = re.compile("Season (\d*), Episode (\d*)\)").findall(data)
if se:
se = se[0]
se = ' (S%02dE%02d)' % (int(se[0]), int(se[1]))
title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:]
else:
title = normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:]
return normalizeTitle(title)
def parseYear(self):
  """
  Return the four-digit year shown in the page title, as a string.

  The title text is taken from the <div id="tn15title"> element or, if
  that is missing, from <title>. "(dddd)" is looked for first, then
  "(dddd/". Returns '' when no title element or no year is found.
  """
  soup = BeautifulSoup(self.getPage())
  html_title = soup('div', {'id': 'tn15title'}) or soup('title')
  if not html_title:
    return ''
  text = stripTags(str(html_title[0]))
  for pattern in ('\((\d{4})\)', '\((\d{4})/'):
    found = re.compile(pattern).findall(text)
    if found:
      return found[0]
  return ''
# Scrape the main page and all sub-pages of this title into one dict.
# The dict is stored as self.IMDbDict and returned. For an episode
# ('episode_of' set) the parent title is parsed as well and supplies
# 'country' and 'language' where the episode has none.
def parse(self):
data = self.getPage()
IMDbDict ={}
#Poster
# NOTE(review): the next statement looks like two statements fused together:
# .search(data) is called on the result of findRegexp(), and `m` is tested
# below before any visible assignment. Presumably the poster pattern, the
# rating pattern and the lines in between were lost -- confirm against the
# original source.
IMDbDict['poster'] = findRegexp(data, 'name="poster".*?(.*?)/10', re.IGNORECASE).search(data)
if m:
# rating is stored as an int, scaled by 1000; -1 means not found
IMDbDict['rating'] = int(float(m.group(1)) * 1000)
else:
IMDbDict['rating'] = -1
#Votes
m = re.compile('\((.*?) votes\)', re.IGNORECASE).findall(data)
if m:
IMDbDict['votes'] = int(m[0].replace(',', ''))
else:
IMDbDict['votes'] = -1
data = data.replace('\n',' ')
#some values
# defaults: '' for every key, 0 for the runtime
keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
for key in keys:
IMDbDict[key] = ''
IMDbDict['runtime'] = 0
soup = BeautifulSoup(data)
# every <div class="info"> holds one field; its label becomes the key
# (lowercased, spaces turned into underscores)
# NOTE(review): the split() separators in this loop are quoted literals
# broken across lines or empty (not valid Python / ValueError); presumably
# stripped markup.
for info in soup('div', {'class': 'info'}):
key = str(info).split('
')[0].split('')
if len(key) > 1:
raw_value = str(info).split('
')[1]
key = key[1][:-1].lower().replace(' ', '_')
if key in keys:
IMDbDict[key] = self.parse_raw_value(key, raw_value)
# NOTE(review): IMDbDict['title'] is read here but never set in the visible
# code; it was presumably assigned in the damaged part above.
IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title'])
#is episode
IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')
IMDbDict['episodes'] = self.parseEpisodes()
# a title with an episode list is flagged as a tv show
if IMDbDict['episodes']:
IMDbDict['tvshow'] = True
else:
IMDbDict['tvshow'] = False
IMDbDict['credits'] = self.parseCredits()
IMDbDict['plot'] = self.parsePlot()
IMDbDict['keywords'] = self.parseKeywords()
IMDbDict['trivia'] = self.parseTrivia()
IMDbDict['connections'] = self.parseConnections()
IMDbDict['locations'] = self.parseLocations()
IMDbDict['release_date'] = self.parseReleaseinfo()
IMDbDict['business'] = self.parseBusiness()
IMDbDict['reviews'] = self.parseExternalreviews()
IMDbDict['stills'] = getMovieStills(self.imdb)
#IMDbDict['trailer'] = self.parseTrailer()
self.IMDbDict = IMDbDict
# episodes inherit missing 'country' and 'language' from their series
if IMDbDict['episode_of']:
episode_of =IMDb(IMDbDict['episode_of']).parse()
for key in ('country', 'language'):
if not IMDbDict[key]:
IMDbDict[key] = episode_of[key]
return self.IMDbDict
def parseCredits(self):
  """
  Return the main credits of this title as plain text.

  The result maps 'director', 'writer' and 'producer' to lists of
  names and 'cast' to a list of (name, character) tuples, all with
  tags stripped. It is also stored as self.credits.

  The module-level parseCredits() only returns the sections it finds
  on the page, so a missing section yields an empty list here instead
  of raising KeyError.
  """
  raw_credits = parseCredits(self.imdb)

  def getNames(entries):
    # first field of every entry is the name
    return [stripTags(entry[0]) for entry in entries]

  credits = {}
  credits['director'] = getNames(raw_credits.get('directors', []))
  credits['writer'] = getNames(raw_credits.get('writers', []))
  credits['producer'] = getNames(raw_credits.get('producers', []))
  credits['cast'] = [(stripTags(c[0]), stripTags(c[1]))
                     for c in raw_credits.get('cast', [])]
  self.credits = credits
  return self.credits
# Return the text of the first <p class="plotpar"> element on the plot
# summary page, tag-stripped and trimmed, or u'' when there is none.
# The result is also stored as self.plot.
# NOTE(review): split('') uses an empty separator, which makes split raise
# ValueError; the delimiter literal has presumably been lost -- confirm.
def parsePlot(self):
data = getUrlUnicode(self.plotUrl)
soup = BeautifulSoup(data)
plot = soup('p', {'class':'plotpar'})
if plot:
plot = unicode(plot[0]).split('')[0]
else:
plot = u''
plot = stripTags(plot).strip()
self.plot = plot
return plot
# Parse the episodes page into a dict keyed by "SxxEyy".
# Each value is a dict with the keys 'imdb', 'title', 'description' and
# 'date' ('YYYY-MM-DD', or '' when the air date cannot be parsed).
# The result is also stored as self.episodes.
# NOTE(review): the pattern below shows five groups while the loop reads
# match[5]; the markup between the groups looks stripped from the literal --
# confirm against the original source.
def parseEpisodes(self):
episodes = {}
data = getUrlUnicode(self.episodesUrl)
cdata = data.replace('\r\n', ' ')
regexp = r'''Season (.*?), Episode (.*?): (.*?)
(.*?)
(.*?)
'''
reg = re.compile(regexp, re.IGNORECASE)
m = reg.findall(cdata)
for match in m:
try:
episode = "S%02dE%02d" % (int(match[0]), int(match[1]))
episodes[episode] = {}
episodes[episode]['imdb'] = match[2]
episodes[episode]['title'] = match[3].strip()
# placeholder titles of the form "Episode #<season>..." are blanked
if episodes[episode]['title'].startswith('Episode #%d'%int(match[0])):
episodes[episode]['title'] = u''
description = htmldecode(match[5])
# keep only the text in front of 'Next US airings:'
description = stripTags(description.split('Next US airings:')[0])
episodes[episode]['description'] = description
episodes[episode]['date'] = ''
try:
d = stripTags(match[4])
d = d.replace('Original Air Date: ', '')
d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
episodes[episode]['date'] = d
except:
# unparseable air date: the date stays ''
pass
except:
# any failure on one match is reported and the loop carries on;
# traceback.print_exc() returns None, so the print statement also
# prints "None" after the traceback
import traceback
print traceback.print_exc()
pass
self.episodes = episodes
return self.episodes
def parseLocations(self):
  """
  Return the locations listed on the locations page.

  Every anchor whose href starts with '/List' contributes its
  html-decoded string. The list is also stored as self.locations.
  """
  soup = BeautifulSoup(getUrlUnicode(self.locationUrl))
  links = soup('a', {'href': re.compile('^/List')})
  self.locations = [htmldecode(link.string) for link in links]
  return self.locations
def parseKeywords(self):
  """
  Return the keywords listed on the keywords page.

  Every anchor whose href starts with '/keyword/' contributes its
  html-decoded string with non-breaking spaces turned into plain
  spaces. The list is also stored as self.keywords.
  """
  soup = BeautifulSoup(getUrlUnicode(self.keywordUrl))
  links = soup('a', {'href': re.compile('^/keyword/')})
  self.keywords = [htmldecode(link.string).replace(u'\xa0', ' ')
                   for link in links]
  return self.keywords
# Return the <li> items of every <ul class="trivia"> on the trivia page as
# a list of strings; the list is also stored as self.trivia.
# NOTE(review): the literals in the loop look damaged -- the replace()
# argument is a quoted string broken across two lines (not valid Python),
# and startswith('')/endswith('') are always true, so t[4:-5] is applied to
# every item. Presumably markup was stripped from them; confirm against the
# original source. `triviaList` is assigned but not used in this method.
def parseTrivia(self):
data = getUrlUnicode(self.triviaUrl)
soup = BeautifulSoup(data)
trivia = []
triviaList = []
for i in soup('ul', {'class': "trivia"}):
for t in i('li'):
t = str(t).replace('
', '').strip()
if t.startswith('') and t.endswith(''):
t = t[4:-5].strip()
trivia.append(t)
self.trivia = trivia
return self.trivia
def getConnections(self):
  """Fetch self.connectionsUrl with getUrlUnicode and return the result."""
  url = self.connectionsUrl
  return getUrlUnicode(url)
# Parse the movie connections page into a dict that maps a relation name to
# a list of ids; each id is the 7 characters in front of the last character
# of a link href containing '/title/tt'.
# Raises IndexError when the page has no <div id="tn15content">.
# NOTE(review): split('') uses an empty separator (ValueError) and the
# second split() argument is a quoted literal broken across two lines (not
# valid Python); presumably stripped markup -- confirm against the original.
def parseConnections(self):
connections = {}
soup = BeautifulSoup(self.getConnections())
content = soup('div', {'id': 'tn15content'})[0]
blocks = str(content).split('')[1:]
for c in blocks:
connection = c.split('
')[0]
cs = BeautifulSoup(c)
if connection:
#relation -> list of imdb ids
connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})]
return connections
def getReleaseinfo(self):
  """Fetch self.releaseinfoUrl with getUrlUnicode and return the result."""
  url = self.releaseinfoUrl
  return getUrlUnicode(url)
def parseReleaseinfo(self):
  """
  Return the first complete release date on the release info page.

  The rows of the first table with border="0" and cellpadding="2" are
  scanned in order; the first right-aligned cell that parses as
  "day month-name year" is returned as 'YYYY-MM-DD'. Returns None
  when there is no such table or no cell holds a complete date.
  """
  soup = BeautifulSoup(self.getReleaseinfo())
  info = soup('table', {'border': '0', 'cellpadding': '2'})
  if not info:
    return None
  for row in info[0]('tr'):
    d = row('td', {'align': 'right'})
    if not d:
      continue
    try:
      possible_date = stripTags(str(d[0])).strip()
      rdate = time.strptime(possible_date, "%d %B %Y")
      return time.strftime('%Y-%m-%d', rdate)
    except ValueError:
      # Incomplete or unformattable date (strptime/strftime) or text
      # that str() cannot encode (UnicodeError is a ValueError): try
      # the next row. Anything else, e.g. KeyboardInterrupt, is no
      # longer swallowed.
      continue
  return None
def getBusiness(self):
  """Fetch self.businessUrl with getUrlUnicode and return the result."""
  url = self.businessUrl
  return getUrlUnicode(url)
# Parse the business page into {'budget': int, 'gross': int, 'profit': int}.
# 'budget' and 'gross' are the largest dollar amount found in the block of
# that name (0 when none is found); 'profit' is gross - budget and is only
# computed when both are non-zero.
# Raises IndexError when the page has no <div id="tn15content">.
# NOTE(review): split('') uses an empty separator (ValueError) and the
# second split() argument is a quoted literal broken across two lines (not
# valid Python); presumably stripped markup -- confirm against the original.
# `cs` is assigned but not used in this method.
def parseBusiness(self):
soup = BeautifulSoup(self.getBusiness())
business = {'budget': 0, 'gross': 0, 'profit': 0}
content = soup('div', {'id': 'tn15content'})[0]
blocks = str(content).split('')[1:]
for c in blocks:
cs = BeautifulSoup(c)
line = c.split('
')
if line:
title = line[0]
line = line[1]
if title in ['Budget', 'Gross']:
# every "$<amount> " in the line, thousands separators removed
values = re.compile('\$(.*?) ').findall(line)
values = [int(value.replace(',','')) for value in values]
if values:
business[title.lower()] = max(values)
if business['budget'] and business['gross']:
business['profit'] = business['gross'] - business['budget']
return business
def getExternalreviews(self):
  """Fetch self.externalreviewsUrl with getUrlUnicode and return the result."""
  url = self.externalreviewsUrl
  return getUrlUnicode(url)
def parseExternalreviews(self):
  """
  Return the external reviews as a dict mapping link href to link text.

  Only the items of the first <ol> on the external reviews page are
  read; for each item the first anchor supplies the href and its
  first child the text. Items without an anchor, or whose anchor is
  empty, are skipped. Returns {} when the page has no <ol>.
  """
  soup = BeautifulSoup(self.getExternalreviews())
  ol = soup('ol')
  if not ol:
    return {}
  ret = {}
  for li in ol[0]('li'):
    try:
      a = li('a')[0]
      href = a.get('href')
      txt = a.contents[0]
    except IndexError:
      # no anchor in this item, or the anchor has no content
      continue
    ret[href] = txt
  return ret
def getTrailer(self):
  """Fetch self.trailerUrl with getUrlUnicode and return the result."""
  url = self.trailerUrl
  return getUrlUnicode(url)
def parseTrailer(self):
  """
  Return the trailer links as a dict mapping href to link title.

  For every <p> that contains an anchor and some text, the first
  anchor is used when its href starts with 'http'; its string, with
  'www.' removed, becomes the title.
  """
  ret = {}
  soup = BeautifulSoup(self.getTrailer())
  for p in soup('p'):
    if not (p('a') and p.firstText()):
      continue
    a = p('a')[0]
    href = a['href']
    if not href or not href.startswith('http'):
      continue
    ret[href] = a.string.replace('www.', '')
  return ret
# Guess the IMDb id for a (file name like) title.
# The title is cut at the first '-', '(' and '.', then looked up with a
# site-restricted Google search (optionally with the director); the id is
# the 7 characters after "tt" in the first result pointing at an IMDb title
# page. Without such a result the IMDb find page is requested directly and
# the id is taken from the URL the request ended up at.
# Returns None when the direct request fails.
# NOTE(review): the block is damaged from the `imdb_id = findRegexp(...)`
# statement onward -- its pattern literal is unterminated and runs into what
# looks like the remains of a separate stills function (`stills`, `imdbId`
# and `s` are not defined here). Confirm against the original source.
def guess(title, director=''):
#FIXME: proper file -> title
title = title.split('-')[0]
title = title.split('(')[0]
title = title.split('.')[0]
title = title.strip()
imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))
return_url = ''
#let's first try google
#i.e. site:imdb.com Michael Stevens Sin
if director:
search = 'site:imdb.com %s "%s"' % (director, title)
else:
search = 'site:imdb.com "%s"' % title
for (name, url, desc) in google.find(search, 2):
if url.startswith('http://www.imdb.com/title/tt'):
return url[28:35]
# fall back to IMDb's own search; u.url is the address of the page that was
# finally served
try:
req = urllib2.Request(imdb_url, None, oxutils.net.DEFAULT_HEADERS)
u = urllib2.urlopen(req)
data = u.read()
return_url = u.url
u.close()
except:
return None
if return_url.startswith('http://www.imdb.com/title/tt'):
return return_url[28:35]
if data:
imdb_id = findRegexp(data.replace('\n', ' '), 'Popular Results.*?- .*? int(s[1]):
stills.append("http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (imdbId, s[2]))
if not stills:
s_ = re.compile(''' int(s[1]):
stills.append("http://%sf.jpg" % s[2])
return stills
if __name__ == '__main__':
import sys
#print parse(sys.argv[1])
print "imdb:", guess(sys.argv[1])