# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import urllib2
from urllib import quote, unquote
import re
import os
import time
from BeautifulSoup import BeautifulSoup
import chardet
import oxlib
from oxlib import stripTags, decodeHtml, findRe, findString
import oxlib.cache
from oxlib.normalize import normalizeTitle, normalizeImdbId
from oxlib import *
import google
'''
IMDb data is cached without a timeout; to refresh an entry, remove its data from the cache folder.
'''
def readUrlUnicode(url, data=None, headers=oxlib.cache.DEFAULT_HEADERS, timeout=-1):
    '''
    Read url through oxlib.cache.readUrlUnicode and return its result.

    Same arguments as the oxlib function, passed through positionally;
    the only thing this wrapper adds is the module-wide default
    timeout of -1.
    '''
    fetch = oxlib.cache.readUrlUnicode
    return fetch(url, data, headers, timeout)
'''
check if result is valid while updating
def validate(result, header):
return header['status'] == u'200'
try:
d = oxlib.cache.readUrlUnicode(url, data, headers, timeout=0, valid=validate)
except oxlib.cache.InvalidResult, e:
print e.headers
'''
def getMovieId(title, director='', year=''):
    '''
    Find the IMDb id of a movie with a google search restricted to imdb.com.

    The title is searched as a quoted phrase; a year, if given, is added
    to the phrase as "title (year)" and a director, if given, is put in
    front of it unquoted. The first three results are requested.

    Returns the seven characters following 'tt' in the first result url
    that starts with http://www.imdb.com/title/tt, or '' if no result
    does.

    >>> getMovieId('The Matrix')
    '0133093'
    '''
    prefix = 'http://www.imdb.com/title/tt'
    phrase = "%s (%s)" % (title, year) if year else title
    if director:
        query = 'site:imdb.com %s "%s"' % (director, phrase)
    else:
        query = 'site:imdb.com "%s"' % phrase
    for (_name, link, _desc) in google.find(query, 3, timeout=-1):
        if not link.startswith(prefix):
            continue
        # the id is the 7 characters right after the prefix
        return link[len(prefix):len(prefix) + 7]
    return ''
def getMovieData(imdbId):
    '''
    Return the dictionary built by IMDb(imdbId).parse().
    '''
    parser = IMDb(imdbId)
    return parser.parse()
# internal functions below
def getUrlBase(imdbId):
    '''
    Return the title url for imdbId, including the trailing slash.

    Sub-page urls are built by appending the page name to this value.
    '''
    template = "http://www.imdb.com/title/tt%s/"
    return template % imdbId
def getRawMovieData(imdbId):
    '''
    Collect everything this module can read for imdbId into one dict.

    Starts from the getMovieInfo() dict of the normalized id and adds the
    results of the other getMovie* readers under their own keys; images
    and trailers are grouped under data['media'].
    '''
    imdbId = normalizeImdbId(imdbId)
    data = getMovieInfo(imdbId)

    # readers are called in the same order as the keys are listed
    for key, reader in (
        ('credits', getMovieCredits),
        ('poster', getMoviePoster),
        ('company credits', getMovieCompanyCredits),
        ('filming locations', getMovieLocations),
        ('movie connections', getMovieConnections),
        ('external reviews', getMovieExternalReviews),
        ('trivia', getMovieTrivia),
        ('keywords', getMovieKeywords),
    ):
        data[key] = reader(imdbId)

    media = data['media'] = {}
    media['images'] = getMovieImages(imdbId)
    media['trailers'] = getMovieTrailers(imdbId)

    for key, reader in (
        ('plotsummary', getMoviePlot),
        ('release dates', getMovieReleaseDates),
        ('release date', getMovieReleaseDate),
    ):
        data[key] = reader(imdbId)
    return data
# Read the main title page of imdbId and return a dict with its labelled
# info fields plus 'poster', 'title', 'year', 'rating' and 'votes' (and
# the series/episode keys when the title is an episode).
# NOTE(review): this block is damaged. Markup inside several string
# literals is missing (they now break across lines), and the statements
# between the poster lookup and the first `else:` are gone -- the names
# `i` and `title` used by the field handling are never bound in what is
# left. Restore from upstream; the code below is left exactly as found.
def getMovieInfo(imdbId, timeout=-1):
data = readUrlUnicode(getUrlBase(imdbId), timeout=timeout)
info = dict()
info['poster'] = findRe(data, 'name="poster".*?(.*?):(.*?)
')[0]
else:
txt= i[1]
txt = stripTags(txt).strip()
# cleanUp: run decodeHtml, turn non-breaking spaces into blanks and cut a
# trailing 'more' off the value
def cleanUp(k):
k = decodeHtml(k).replace(u'\xa0', ' ').strip()
if k.endswith('more'): k=k[:-len('more')].strip()
return k
txt = cleanUp(txt)
# free-text fields stay strings; every other field becomes a list when it
# contains '|' or ', ', and country/language/genre are always lists
if title not in ('plot', 'trivia', 'filming locations', 'mpaa', 'tagline', 'original air date'):
if '|' in txt:
txt = [cleanUp(k) for k in txt.split('|')]
elif ', ' in txt:
txt = [cleanUp(k) for k in txt.split(', ')]
elif title in ('country', 'language', 'genre'):
txt = [cleanUp(txt), ]
# for episodes: remember the 7 digit id found in the 'tv series' field
if title == 'tv series':
info['series_imdb'] = findRe(i[1], 'tt(\d{7})')
# the last line of the air date field is kept as series_episode_info,
# the first line remains the field value
if title == 'original air date':
info['series_episode_info'] = txt.split('\n')[-1].strip()
txt = txt.split('\n')[0].strip()
if not title.startswith('moviemeter'):
info[title] = txt
# fields that are not wanted in the result
for key in ('user comments', 'writers (wga)', 'plot keywords'):
if key in info:
del info[key]
# reduce 'release date' to the first entry, first line
if 'release date' in info:
if isinstance(info['release date'], list):
info['release date'] = info['release date'][0]
info['release date'] = info['release date'].split('\n')[0]
# cut the synopsis link text off the plot; a plot that is only that text
# becomes ''
if 'plot' in info:
info['plot'] = info['plot'].split('| add synopsis')[0].strip()
info['plot'] = info['plot'].split('| full synopsis')[0].strip()
if info['plot'] in ('add synopsis', 'full synopsis'):
info['plot'] = ''
# Title and year, taken from the page heading
# NOTE(review): the patterns and split/replace arguments below lost their
# markup as well.
title = ''
year = ''
html_title = findRe(data, '
(.*?)
')
if not html_title:
html_title = findRe(data, '
(.*?)')
else:
html_title = html_title.split('
')[0]
if html_title:
html_title = html_title.replace('
', ' ').replace(' ', ' ')
title = stripTags(html_title)
title = decodeHtml(title)
# year: four digits right after '(', with or without the closing ')'
year = findRe(title, '\((\d{4})\)')
if not year:
year = findRe(title, '\((\d{4})')
# remove the "(year)" marker -- digits or '?', optionally followed by
# '/' and roman numeral letters -- and the title type markers
_y = findRe(title, r'(\([0-9\?]{4}[/IVXLCDM]*?\))')
if _y:
title = title.replace(_y, '')
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
title = title.replace(t, '')
title = title.strip()
# drop everything from the first non-breaking space on
if title.find(u'\xa0') > -1:
title = title[:title.find(u'\xa0')].strip()
if title.startswith('"') and title.endswith('"'):
title = title[1:-1]
info['title'] = normalizeTitle(title)
info['year'] = year
# Series episodes: the title starts with the quoted series title (exactly
# two quotes in total) followed by the episode title
if title.startswith('"') and title.find('"',1) > 0 and \
title.find('"',1) == title.rfind('"'):
episode_title = title[title.rfind('"')+1:]
episode_title = re.sub("\?{4}", "", episode_title).strip()
episode_title = re.sub("\d{4}", "", episode_title).strip()
if episode_title == '-': episode_title=''
title = normalizeTitle(title[1:title.rfind('"')])
if episode_title:
info['episode title'] = episode_title
info['series title'] = title
info['title'] = "%s: %s" % (title, episode_title)
else:
info['title'] = title
# with season/episode numbers the title becomes
# "Series (SxxEyy) Episode title"
se = re.compile("Season (\d*), Episode (\d*)\)").findall(info.get('series_episode_info', ''))
if se:
info['season'] = int(se[0][0])
info['episode'] = int(se[0][1])
info['title'] = "%s (S%02dE%02d) %s" % (
info['series title'], info['season'], info['episode'], info['episode title'])
info['title'] = info['title'].strip()
del info['series_episode_info']
# Rating: the number in front of '/10' as float, -1 if there is none
rating = findRe(data, '([\d\.]*?)/10')
if rating:
info['rating'] = float(rating)
else:
info['rating'] = -1
# Votes: parsed out of the 'user rating' field, -1 if not found
info['votes'] = -1
if "user rating" in info:
if isinstance(info['user rating'], list):
info['user rating'] = ' '.join(info['user rating'])
votes = findRe(info['user rating'], '([\d,]*?) votes')
if votes:
info['votes'] = int(votes.replace(',', ''))
return info
def getMovieRuntimeSeconds(imdbId):
    '''
    Return the runtime of imdbId in seconds.

    Reads the 'runtime' entry of getMovieInfo(imdbId). A value given in
    minutes ("... min") is multiplied by 60, a value given in seconds
    ("... sec") is returned as it is.

    Returns 0 if there is a runtime entry but no number could be read
    from it, and -1 if there is no runtime entry at all.
    '''
    info = getMovieInfo(imdbId)
    if 'runtime' not in info:
        return -1

    value = info['runtime']
    # getMovieInfo only turns a field into a list when it contains '|' or
    # ', '; a single runtime stays a plain string. Indexing that string
    # with [0] would leave just its first character, so only lists are
    # unpacked here (an empty list is treated as "no number found").
    if isinstance(value, (list, tuple)):
        value = value[0] if value else ''

    minutes = findRe(findRe(value, '(.*?) min'), '([0-9]+)')
    if minutes:
        return int(minutes) * 60
    seconds = findRe(findRe(value, '(.*?) sec'), '([0-9]+)')
    if seconds:
        return int(seconds)
    return 0
def getMoviePoster(imdbId):
    '''
    Return the 'poster' entry of getMovieInfo(imdbId).
    '''
    return getMovieInfo(imdbId)['poster']
def getMovieYear(imdbId):
    '''
    Return the 'year' entry of getMovieInfo(imdbId).

    >>> getMovieYear('0315404')
    u'1964'

    >>> getMovieYear('0734840')
    u'1990'

    >>> getMovieYear('0815352')
    u'1964'
    '''
    return getMovieInfo(imdbId)['year']
def getMovieTitle(imdbId):
    '''
    Return the 'title' entry of getMovieInfo(imdbId).

    For series episodes this is the combined title built by
    getMovieInfo, as the examples show.

    >>> getMovieTitle('0306414')
    u'The Wire'

    >>> getMovieTitle('0734840')
    u'Twin Peaks (S01E02) Episode #1.2'

    >>> getMovieTitle('0749451')
    u'The Wire (S01E01) The Target'
    '''
    return getMovieInfo(imdbId)['title']
def getMovieAKATitles(imdbId):
    '''
    Return the "also known as" titles found on the releaseinfo page of
    imdbId, as returned by findall on the akas section.

    >>> getMovieAKATitles('0040980')
    [(u'Frauen der Nacht', u'Germany'),
     (u'Les femmes de la nuit', u'France'),
     (u'Women of the Night', u'(undefined)')]
    '''
    page = readUrlUnicode("%sreleaseinfo" % getUrlBase(imdbId))
    # NOTE(review): both patterns look truncated (HTML markup seems to be
    # missing from them) -- verify against upstream before relying on the
    # result.
    akas = findRe(page, 'name="akas".*?(.*?)')
    return re.compile("td>(.*?)\n\n(.*) | ").findall(akas)
# Turn the markup of one credits section into a list of two-item lists,
# both items stripped of tags. The cast section is matched with its own
# pattern; for writers the second item loses its parentheses and a
# trailing ' and'.
# NOTE(review): the patterns and the first replace() argument lost their
# HTML markup (the replace literal now breaks across two lines); restore
# from upstream. Code left exactly as found.
def creditList(data, section=None):
if section == 'cast':
credits_ = re.compile('''(.*?).*? | (.*?) |
''').findall(data)
else:
credits_ = re.compile('''.*?(.*?) | (.*?) |
''').findall(data)
credits = []
for c_ in credits_:
c = [stripTags(decodeHtml(c_[0]).strip()), stripTags(decodeHtml(c_[1]).strip())]
if section=='writers':
c[1] = c[1].replace('
', '').strip().replace(')', '').replace('(','')
# ' and' is 4 characters long
if c[1].endswith(' and'): c[1] = c[1][:-4]
credits.append(c)
return credits
# Reads the fullcredits page of imdbId.
# NOTE(review): this block is damaged -- everything after the start of the
# `groups = data.split(` statement is missing and the remainder belongs to
# a different function that builds a dict of reviews (`_reviews` is never
# bound here, and `credits`/`groups` are never used). Callers in this file
# expect a dict of credit lists keyed by section name (see
# IMDb.getCredits). Restore from upstream; code left exactly as found.
def getMovieCredits(imdbId):
credits = dict()
url = "%sfullcredits" % getUrlBase(imdbId)
data = readUrlUnicode(url)
groups = data.split('(.*?)').findall(data)
reviews = {}
for r in _reviews:
reviews[r[0]] = r[1]
return reviews
def getMovieReleaseDate(imdbId):
    '''
    Return the earliest date in getMovieReleaseDates(imdbId).

    Dates are compared with '<' as they come out of the list (second item
    of each entry); an empty value picked up so far is always replaced.
    Returns None when there are no release dates.
    '''
    earliest = None
    for release in getMovieReleaseDates(imdbId):
        candidate = release[1]
        if earliest and not candidate < earliest:
            continue
        earliest = candidate
    return earliest
def _parseDate(d):
    '''
    Convert a textual date into 'YYYY-MM-DD' form.

    Accepts "day month year", "month year" and "year" (month as full
    name, e.g. "March"); parts that are not given default to 01.
    Anything that cannot be parsed -- including values that are not
    strings -- is returned unchanged.

    >>> _parseDate('3 March 1972')
    '1972-03-03'
    >>> _parseDate('March 1972')
    '1972-03-01'
    >>> _parseDate('1972')
    '1972-01-01'
    '''
    for date_format in ("%d %B %Y", "%B %Y", "%Y"):
        try:
            parsed = time.strptime(d, date_format)
        except (ValueError, TypeError):
            # ValueError: d does not match this format, try the next one;
            # TypeError: d is not a string at all
            continue
        # strptime fills month and day with 1 when the format lacks them
        return '%s-%02d-%02d' % (parsed.tm_year, parsed.tm_mon, parsed.tm_mday)
    return d
def getMovieReleaseDates(imdbId):
    '''
    Return the entries of the releaseinfo page of imdbId as a list of
    3-tuples; the second item of each tuple is the date, run through
    _parseDate, the other two are the tag-stripped texts around it.
    '''
    page = readUrlUnicode("%sreleaseinfo" % getUrlBase(imdbId))
    # NOTE(review): the pattern looks like it lost its HTML markup --
    # verify against upstream.
    pattern = re.compile('(.*?) | .*?(.*?) | .*?(.*?) |\n', re.DOTALL)
    return [
        (
            stripTags(head).strip(),
            _parseDate(stripTags(date).strip()),
            decodeHtml(stripTags(tail).strip()),
        )
        for (head, date, tail) in pattern.findall(page)
    ]
def getMovieBusinessSum(imdbId):
    '''
    Return {'budget': ..., 'gross': ..., 'profit': ...} for imdbId.

    Budget and gross are read from the first entry of the matching
    getMovieBusiness() list that starts with '$'; the budget falls back
    to the first entry of any currency when there is no '$' entry.
    Profit is gross minus budget and only set when both are non-zero.
    Values that are not available stay 0.
    '''
    def in_dollars(entries):
        return [entry for entry in entries if entry.startswith('$')]

    def amount(entry):
        return int(intValue(entry.replace(',', '')))

    business = getMovieBusiness(imdbId)
    totals = {'budget': 0, 'gross': 0, 'profit': 0}

    if 'budget' in business:
        budget = in_dollars(business['budget']) or business['budget']
        totals['budget'] = amount(budget[0])

    if 'gross' in business:
        gross = in_dollars(business['gross'])
        if gross:
            totals['gross'] = amount(gross[0])

    if totals['budget'] and totals['gross']:
        totals['profit'] = totals['gross'] - totals['budget']
    return totals
def getMovieFlimingDates(imdbId):
    '''
    Return the first 'filming dates' entry of getMovieBusiness(imdbId),
    or '' if there is none.
    '''
    dates = getMovieBusiness(imdbId).get('filming dates')
    if not dates:
        return ''
    return dates[0]
# Read the business page of imdbId and return a dict that maps each
# lower-cased, tag-stripped section heading to the list of its entries.
# NOTE(review): the section pattern and the split() argument lost their
# HTML markup (the split literal now breaks across two lines); restore
# from upstream. Code left exactly as found.
def getMovieBusiness(imdbId):
url = "%sbusiness" % getUrlBase(imdbId)
data = readUrlUnicode(url)
business = {}
for r in re.compile('''(.*?)
(.*?)
.
''', re.DOTALL).findall(data):
# r[0] is the heading, r[1] the body holding the individual entries
key = stripTags(r[0]).strip().lower()
value = [decodeHtml(stripTags(b).strip()) for b in r[1].split('
')]
business[key] = value
return business
def getMovieEpisodes(imdbId):
    '''
    Return the episodes listed on the episodes page of imdbId.

    The result maps 'SxxEyy' keys to dicts with the keys 'imdb', 'title',
    'description' and 'date' ('YYYY-MM-DD', or '' if the air date could
    not be parsed). Titles that are just the generic "Episode #<season>"
    placeholder are replaced by u''.

    Parsing is best effort: an entry that fails is reported with a
    traceback and left as far as it got, the remaining entries are still
    processed.
    '''
    import traceback

    url = "%sepisodes" % getUrlBase(imdbId)
    data = readUrlUnicode(url)
    episodes = {}
    # NOTE(review): this pattern appears to have lost its HTML markup; it
    # has five groups while the code below reads r[0]..r[5], so as it
    # stands every match ends in the handler at the bottom. Restore the
    # pattern from upstream.
    regexp = 'Season (.*?), Episode (.*?): (.*?)\n(.*?)\n(.*?)\n'
    for r in re.compile(regexp, re.DOTALL).findall(data):
        try:
            season = int(r[0])
            key = "S%02dE%02d" % (season, int(r[1]))
            entry = episodes[key] = {}
            entry['imdb'] = r[2]
            entry['title'] = r[3].strip()
            if entry['title'].startswith('Episode #%d' % season):
                entry['title'] = u''
            description = decodeHtml(r[5])
            description = stripTags(description.split('Next US airings:')[0])
            entry['description'] = description.strip()
            entry['date'] = ''
            try:
                d = stripTags(r[4])
                d = d.replace('Original Air Date: ', '')
                entry['date'] = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
            except (ValueError, TypeError):
                # air date missing or in another format: keep ''
                pass
        except Exception:
            # print_exc() writes the traceback itself and returns None, so
            # it must not be wrapped in a print
            traceback.print_exc()
    return episodes
'''the old code below'''
# Older parser, still used by getMovieData(): parse() builds one flat
# dictionary for a title out of the title page and the getMovie* readers
# of this module.
# NOTE(review): several string literals in parse_raw_value() and parse()
# lost their HTML markup (some now break across lines or are empty);
# restore from upstream. Code left exactly as found.
class IMDb:
# Remember the id and the url of its title page.
def __init__(self, imdbId):
self.imdb = imdbId
self.pageUrl = getUrlBase(imdbId)
# Return the title page as read by readUrlUnicode.
def getPage(self):
return readUrlUnicode(self.pageUrl)
# Convert the raw markup of one info field (value) into the value that
# parse() stores under key.
def parse_raw_value(self, key, value):
# these fields are plain text once the tags are stripped
if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
value = stripTags(value).strip()
# runtime is read again through getMovieRuntimeSeconds, value is not used
if key == 'runtime':
parsed_value = getMovieRuntimeSeconds(self.imdb)
# lists are separated by ' / ', or by ' | ' if that gave only one item
elif key in ('country', 'language'):
parsed_value = value.split(' / ')
if len(parsed_value) == 1:
parsed_value = parsed_value[0].split(' | ')
parsed_value = [v.strip() for v in parsed_value]
elif key == 'genre':
parsed_value = value.replace('more', '').strip().split(' / ')
if len(parsed_value) == 1:
parsed_value = parsed_value[0].split(' | ')
parsed_value = [v.strip() for v in parsed_value]
elif key == 'tagline':
parsed_value = value.replace('more', '').strip()
elif key == 'plot_outline':
parsed_value = value.replace('(view trailer)', '').strip()
if parsed_value.endswith('more'):
parsed_value = parsed_value[:-4].strip()
elif key == 'tv_series':
m = re.compile('(.*?)').findall(value)
if m:
parsed_value = m[0][0]
else:
parsed_value = ''
# prefer the international English title, then the USA title
elif key == 'also_known_as':
parsed_value = ''
m = re.compile('(.*) \(International: English title').findall(value)
if m:
parsed_value = m[0]
else:
m = re.compile('(.*) \(USA').findall(value)
if m:
parsed_value = m[0]
parsed_value = parsed_value.split('
')[-1].split('(')[0]
# remove the first director's name, and a "'s" left over from it, from
# the title
director = self.getCredits().get('director', None)
if director:
director = director[0]
parsed_value = parsed_value.replace(director, '')
if parsed_value.startswith("'s"):
parsed_value = parsed_value[2:].strip()
parsed_value = decodeHtml(parsed_value.strip())
# unknown key: print the raw value and pass it through unchanged
else:
print value
parsed_value = value
return parsed_value
# Return the year as read by getMovieYear.
def parseYear(self):
return getMovieYear(self.imdb)
# Build, store (self.IMDbDict) and return the dictionary for this title.
def parse(self):
data = self.getPage()
IMDbDict ={}
info = getMovieInfo(self.imdb)
#Poster, with a fixed placeholder image if there is none
IMDbDict['poster'] = getMoviePoster(self.imdb)
if not IMDbDict['poster']:
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
#Title, Year
IMDbDict['year'] = self.parseYear()
IMDbDict['title'] = getMovieTitle(self.imdb)
#Rating, stored as integer (rating * 1000); -1 if not found
m = re.compile('(.*?)/10', re.IGNORECASE).search(data)
if m:
IMDbDict['rating'] = int(float(m.group(1)) * 1000)
else:
IMDbDict['rating'] = -1
#Votes
IMDbDict['votes'] = info['votes']
data = data.replace('\n',' ')
#some values, '' unless found on the page (runtime: 0)
keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
for key in keys:
IMDbDict[key] = ''
IMDbDict['runtime'] = 0
soup = BeautifulSoup(data)
# NOTE(review): the loop variable rebinds `info`, shadowing the
# getMovieInfo() result used for the votes above.
for info in soup('div', {'class': 'info'}):
key = unicode(info).split('')[0].split('')
if len(key) > 1:
raw_value = unicode(info).split('
')[1]
# label text without its last character, lower-cased, blanks as '_'
key = key[1][:-1].lower().replace(' ', '_')
if key in keys:
IMDbDict[key] = self.parse_raw_value(key, raw_value)
# NOTE(review): 'also_known_as' is always present (set to '' above), so
# the pop() default is never used and title_english can end up ''.
IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title'])
#is episode
IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')
# a title that has episodes is flagged as tv show
IMDbDict['episodes'] = getMovieEpisodes(self.imdb)
if IMDbDict['episodes']:
IMDbDict['tvshow'] = True
else:
IMDbDict['tvshow'] = False
IMDbDict['credits'] = self.getCredits()
IMDbDict['plot'] = getMoviePlot(self.imdb)
IMDbDict['keywords'] = getMovieKeywords(self.imdb)
IMDbDict['trivia'] = getMovieTrivia(self.imdb)
IMDbDict['connections'] = getMovieConnections(self.imdb)
IMDbDict['locations'] = getMovieLocations(self.imdb)
IMDbDict['release_date'] = getMovieReleaseDate(self.imdb)
IMDbDict['business'] = getMovieBusinessSum(self.imdb)
IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
IMDbDict['stills'] = getMovieStills(self.imdb)
#IMDbDict['trailer'] = getMovieTrailer(self.imdb)
self.IMDbDict = IMDbDict
# episodes take country and language from their series when they have
# none of their own
if IMDbDict['episode_of']:
episode_of = getMovieInfo(IMDbDict['episode_of'])
for key in ('country', 'language'):
if not IMDbDict[key]:
IMDbDict[key] = episode_of[key]
return self.IMDbDict
# Regroup the getMovieCredits() result into director/writer/producer/
# cinematographer/editor name lists and a cast list of two-item tuples
# (first and second item of each cast credit); the result is also kept
# in self.credits.
def getCredits(self):
raw_credits = getMovieCredits(self.imdb)
credits = {}
# only the first item of each credit is kept
def getNames(creditList):
return [stripTags(decodeHtml(c[0])) for c in creditList]
credits['director'] = getNames(raw_credits.get('directors', ''))
credits['writer'] = getNames(raw_credits.get('writers', ''))
credits['producer'] = getNames(raw_credits.get('producers', ''))
credits['cinematographer'] = getNames(raw_credits.get('cinematographers', ''))
credits['editor'] = getNames(raw_credits.get('editors', ''))
credits['cast'] = [(stripTags(decodeHtml(c[0])),stripTags(decodeHtml(c[1]))) for c in raw_credits.get('cast', [])]
self.credits = credits
return self.credits
# Guess the IMDb id for a title (e.g. one derived from a file name),
# optionally helped by the director's name: google is asked first, then
# IMDb's own search. Returns None if the IMDb search request fails.
# NOTE(review): the end of this block is damaged -- after `imdb_id = ...`
# the rest of guess() is missing and the lines that follow belong to a
# different function (they use `info` and `section`, which are never bound
# here; one string literal breaks across lines and the `for` statement has
# lost its colon). Restore from upstream; code left exactly as found.
def guess(title, director=''):
#FIXME: proper file -> title
# for now: keep only what precedes the first '-', '(' and '.'
title = title.split('-')[0]
title = title.split('(')[0]
title = title.split('.')[0]
title = title.strip()
imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))
return_url = ''
#let's first try google
#i.e. site:imdb.com Michael Stevens Sin
if director:
search = 'site:imdb.com %s "%s"' % (director, title)
else:
search = 'site:imdb.com "%s"' % title
for (name, url, desc) in google.find(search, 2):
if url.startswith('http://www.imdb.com/title/tt'):
return normalizeImdbId(int(oxlib.intValue(url)))
# no google hit: ask the IMDb search; u.url is the address of the page
# that was finally delivered
try:
req = urllib2.Request(imdb_url, None, oxlib.net.DEFAULT_HEADERS)
u = urllib2.urlopen(req)
data = u.read()
return_url = u.url
u.close()
except:
return None
# the search ended directly on a title page: the id is the 7 characters
# after 'tt'
if return_url.startswith('http://www.imdb.com/title/tt'):
return return_url[28:35]
if data:
imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?- .*?(.*?)')
filmo = data.split(u'
Additional Details
')[0]
movies = {}
for part in filmo.split(u'(.*?):')
section = decodeHtml(section)
movies[section] = re.compile(u'href="/title/tt(\d{7})/"').findall(part)
info['movies'] = movies
return info
# Command line use: print the id guessed for the title given as first
# argument.
if __name__ == '__main__':
import sys
#print parse(sys.argv[1])
print "imdb:", guess(sys.argv[1])