2008-04-28 09:52:21 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
2008-06-19 09:47:02 +00:00
|
|
|
# vi:si:et:sw=4:sts=4:ts=4
|
2008-04-28 09:52:21 +00:00
|
|
|
import urllib2
|
2008-04-29 19:09:10 +00:00
|
|
|
from urllib import quote, unquote
|
2008-06-19 09:47:02 +00:00
|
|
|
import re
|
2008-04-28 09:52:21 +00:00
|
|
|
import os
|
|
|
|
import time
|
|
|
|
|
|
|
|
from BeautifulSoup import BeautifulSoup
|
|
|
|
import chardet
|
2008-07-03 09:24:49 +00:00
|
|
|
import oxlib
|
|
|
|
from oxlib import stripTags, decodeHtml, findRe, findString
|
|
|
|
from oxlib.cache import getUrl, getUrlUnicode
|
|
|
|
from oxlib.normalize import normalizeTitle, normalizeImdbId
|
|
|
|
from oxlib import *
|
2008-04-28 09:52:21 +00:00
|
|
|
|
|
|
|
import google
|
|
|
|
|
2008-06-17 11:07:53 +00:00
|
|
|
|
2008-04-29 16:12:27 +00:00
|
|
|
def getMovieId(title, director='', year=''):
    '''
    Find the IMDb id of a movie through a site-restricted Google query.

    When year is given the searched phrase becomes "title (year)"; when
    director is given it is added to the query as an unquoted term.
    google.find is called with 3 as its second argument (presumably the
    number of results to fetch).

    Returns the 7 characters that follow "tt" in the first result whose URL
    is an IMDb title URL, or '' when no result qualifies.

    >>> getMovieId('The Matrix')
    '0133093'
    '''
    prefix = 'http://www.imdb.com/title/tt'
    id_start = len(prefix)

    if year:
        title = "%s (%s)" % (title, year)

    if not director:
        query = 'site:imdb.com "%s"' % title
    else:
        query = 'site:imdb.com %s "%s"' % (director, title)

    for (name, url, desc) in google.find(query, 3):
        if not url.startswith(prefix):
            continue
        # the id is the 7 characters right after the prefix
        return url[id_start:id_start + 7]
    return ''
|
2008-04-29 16:12:27 +00:00
|
|
|
|
|
|
|
def getMovieData(imdbId):
    '''
    Return the dictionary produced by IMDb(imdbId).parse().
    '''
    parser = IMDb(imdbId)
    return parser.parse()
|
2008-04-29 16:12:27 +00:00
|
|
|
|
|
|
|
# internal functions below
|
|
|
|
def getUrlBase(imdbId):
    '''
    Build the base URL of an IMDb title page (no trailing slash).

    >>> getUrlBase('0133093')
    'http://www.imdb.com/title/tt0133093'
    '''
    base = "http://www.imdb.com/title/tt%s" % imdbId
    return base
|
2008-04-29 16:12:27 +00:00
|
|
|
|
|
|
|
def getRawMovieData(imdbId):
    '''
    Collect everything the individual scrapers return into one dictionary.

    Starts from getMovieInfo() for the normalized id and adds one key per
    sub-page scraper. Images and trailers are grouped under 'media'.
    '''
    imdbId = normalizeImdbId(imdbId)
    data = getMovieInfo(imdbId)

    # (result key, scraper), applied in this order
    before_media = (
        ('credits', getMovieCredits),
        ('poster', getMoviePoster),
        ('company credits', getMovieCompanyCredits),
        ('filming locations', getMovieLocations),
        ('movie connections', getMovieConnections),
        ('external reviews', getMovieExternalReviews),
        ('trivia', getMovieTrivia),
        ('keywords', getMovieKeywords),
    )
    after_media = (
        ('plotsummary', getMoviePlot),
        ('release dates', getMovieReleaseDates),
        ('release date', getMovieReleaseDate),
    )

    for key, scraper in before_media:
        data[key] = scraper(imdbId)

    media = data['media'] = {}
    media['images'] = getMovieImages(imdbId)
    media['trailers'] = getMovieTrailers(imdbId)

    for key, scraper in after_media:
        data[key] = scraper(imdbId)
    return data
|
2008-04-30 13:31:50 +00:00
|
|
|
|
|
|
|
def getMovieInfo(imdbId):
    '''
    Scrape the main IMDb title page into a dictionary.

    The page is fetched with getUrlUnicode(getUrlBase(imdbId)). The result
    always contains 'poster', 'title', 'year', 'rating' and 'votes'
    ('rating' and 'votes' are -1 when not found on the page).

    Every "<h5>label:</h5>" section is stored under its lower-cased label,
    except labels starting with 'moviemeter' and the keys 'user comments',
    'writers (wga)' and 'plot keywords'. Values containing '|' or ', ' are
    split into lists unless the label is one of the free-text fields;
    'country' and 'language' are always lists.

    Episode pages may additionally get 'series_imdb', 'series title',
    'episode title', 'season' and 'episode'.
    '''
    def cleanUp(k):
        # decode entities, replace non-breaking spaces and drop a trailing
        # "more" (presumably the label of a "more" link on the page)
        k = decodeHtml(k).replace(u'\xa0', ' ').strip()
        if k.endswith('more'):
            k = k[:-len('more')].strip()
        return k

    # labels whose value must stay one string even if it contains '|' or ', '
    free_text = ('plot', 'trivia', 'filming locations', 'mpaa', 'tagline',
                 'original air date')

    data = getUrlUnicode(getUrlBase(imdbId))
    info = dict()
    info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"')
    if info['poster'] and '_V' in info['poster']:
        # keep only the part before "._V" and re-append the extension
        info['poster'] = "%s.jpg" % info['poster'].split('._V')[0]

    section_re = re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL)
    for i in section_re.findall(data):
        title = stripTags(i[0]).strip().lower()
        txt = cleanUp(stripTags(i[1]).strip())
        if title not in free_text:
            if '|' in txt:
                txt = [cleanUp(k) for k in txt.split('|')]
            elif ', ' in txt:
                txt = [cleanUp(k) for k in txt.split(', ')]
            elif title in ('country', 'language'):
                txt = [cleanUp(txt), ]
        if title == 'tv series':
            info['series_imdb'] = findRe(i[1], r'tt(\d{7})')
        if title == 'original air date':
            # first line is the date, last line carries the
            # "(Season N, Episode M)" text parsed further below
            lines = txt.split('\n')
            info['series_episode_info'] = lines[-1].strip()
            txt = lines[0].strip()
        if not title.startswith('moviemeter'):
            info[title] = txt

    for key in ('user comments', 'writers (wga)', 'plot keywords'):
        if key in info:
            del info[key]

    # 'release date' is not a free-text label, so it may have been split
    # into a list above; only trim it when it is still a string
    if 'release date' in info and hasattr(info['release date'], 'split'):
        info['release date'] = info['release date'].split('\n')[0]

    if 'plot' in info:
        info['plot'] = info['plot'].split('| add synopsis')[0].strip()
        info['plot'] = info['plot'].split('| full synopsis')[0].strip()
        if info['plot'] in ('add synopsis', 'full synopsis'):
            info['plot'] = ''

    #get Title
    title = ''
    year = ''
    html_title = findRe(data, '<div id="tn15title">(.*?)</div>')
    if not html_title:
        html_title = findRe(data, '<title>(.*?)</title>')
    else:
        html_title = html_title.split('<span class="pro-link">')[0]
    if html_title:
        # collapse the double space left behind by replacing <br />
        html_title = html_title.replace('<br />', ' ').replace('  ', ' ')
        title = decodeHtml(html_title)
        title = stripTags(title)
        year = findRe(title, r'\((\d{4})\)')
        if not year:
            year = findRe(title, r'\((\d{4})')
        # remove the "(1999)", "(????)" or "(1999/II)" marker from the title
        _y = findRe(title, r'(\([0-9\?]{4}[/IVXLCDM]*?\))')
        if _y:
            title = title.replace(_y, '')
        for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)',
                  '(VG)', '(V)', '(TV)'):
            title = title.replace(t, '')
        title = title.strip()
        if title.find(u'\xa0') > -1:
            title = title[:title.find(u'\xa0')].strip()
        if title.startswith('"') and title.endswith('"'):
            title = title[1:-1]
    info['title'] = normalizeTitle(title)
    info['year'] = year

    #Series
    # episode pages look like: "Series Title" Episode Title
    closing_quote = title.find('"', 1)
    if title.startswith('"') and closing_quote > 0 and \
            closing_quote == title.rfind('"'):
        episode_title = title[closing_quote + 1:]
        episode_title = re.sub(r"\?{4}", "", episode_title).strip()
        episode_title = re.sub(r"\d{4}", "", episode_title).strip()
        if episode_title == '-':
            episode_title = ''
        title = normalizeTitle(title[1:closing_quote])
        if episode_title:
            info['episode title'] = episode_title
            info['series title'] = title
            info['title'] = "%s: %s" % (title, episode_title)
        else:
            info['title'] = title

    # \d+ instead of \d* so int() below never sees an empty string
    se = re.compile(r"Season (\d+), Episode (\d+)\)").findall(
        info.get('series_episode_info', ''))
    if se:
        info['season'] = int(se[0][0])
        info['episode'] = int(se[0][1])
        # 'series title' / 'episode title' are only set above when an
        # episode title was parsed; fall back instead of raising KeyError
        series_name = info.get('series title', info['title'])
        episode_name = info.get('episode title', '')
        info['title'] = "%s (S%02dE%02d) %s" % (
            series_name, info['season'], info['episode'], episode_name)
        info['title'] = info['title'].strip()
        del info['series_episode_info']

    #Rating
    rating = findRe(data, r'<b>([\d\.]*?)/10</b>')
    if rating:
        info['rating'] = float(rating)
    else:
        info['rating'] = -1

    #Votes
    votes = findRe(data, r'<small>\(<a href="ratings">(.*?) votes</a>\)</small>')
    if votes:
        info['votes'] = int(votes.replace(',', ''))
    else:
        info['votes'] = -1

    return info
|
2008-04-29 22:15:28 +00:00
|
|
|
|
2008-07-05 13:35:46 +00:00
|
|
|
|
2008-04-30 13:31:50 +00:00
|
|
|
def getMoviePoster(imdbId):
    '''
    Return the 'poster' entry of getMovieInfo(imdbId).
    '''
    return getMovieInfo(imdbId)['poster']
|
2008-04-29 22:15:28 +00:00
|
|
|
|
2008-04-30 13:31:50 +00:00
|
|
|
def getMovieYear(imdbId):
    '''
    Return the 'year' entry of getMovieInfo(imdbId).

    >>> getMovieYear('0315404')
    u'1964'

    >>> getMovieYear('0734840')
    u'1990'

    >>> getMovieYear('0815352')
    u'1964'
    '''
    return getMovieInfo(imdbId)['year']
|
2008-04-30 13:31:50 +00:00
|
|
|
|
|
|
|
def getMovieTitle(imdbId):
    '''
    Return the 'title' entry of getMovieInfo(imdbId).

    >>> getMovieTitle('0306414')
    u'The Wire'

    >>> getMovieTitle('0734840')
    u'Twin Peaks (S01E02) Episode #1.2'

    >>> getMovieTitle('0749451')
    u'The Wire (S01E01) The Target'
    '''
    return getMovieInfo(imdbId)['title']
|
2008-04-29 16:12:27 +00:00
|
|
|
|
|
|
|
def creditList(data, section=None):
    '''
    Extract [first cell, last cell] pairs from the table rows of one credits
    section.

    data is the HTML of the section. The 'cast' section uses its own row
    layout; for the 'writers' section the second cell is stripped of
    "<br>", parentheses and a trailing " and".

    Returns a list of two-element lists with HTML entities decoded.
    '''
    if section == 'cast':
        row_pattern = '''<tr .*?<td class="nm">(.*?)</td><td class="ddd">.*?</td><td class="char">(.*?)</td></tr>'''
    else:
        row_pattern = '''<tr>.*?<td valign="top">(.*?)</td><td.*?</td><td valign="top">(.*?)</td></tr>'''

    entries = []
    for left, right in re.compile(row_pattern).findall(data):
        entry = [decodeHtml(left).strip(), decodeHtml(right).strip()]
        if section == 'writers':
            note = entry[1].replace('<br>', '').strip()
            note = note.replace(')', '').replace('(', '')
            if note.endswith(' and'):
                note = note[:-4]
            entry[1] = note
        entries.append(entry)
    return entries
|
2008-04-29 16:12:27 +00:00
|
|
|
|
2008-04-30 13:31:50 +00:00
|
|
|
def getMovieCredits(imdbId):
    '''
    Scrape the "fullcredits" page into a dict of section name -> credit list.

    The page is split at every "<h5>"; a chunk counts as a section when it
    has a name="..." attribute followed by a link to /Glossary. Each
    section is parsed with creditList().
    '''
    section_re = re.compile('''name="(.*?)".*? href="/Glossary''')
    page = getUrlUnicode("%s/fullcredits" % getUrlBase(imdbId))

    sections = dict()
    for chunk in page.split('<h5>'):
        names = section_re.findall(chunk)
        if not names:
            continue
        sections[names[0]] = creditList(chunk, names[0])
    return sections
|
2008-04-29 16:12:27 +00:00
|
|
|
|
2008-04-29 19:09:10 +00:00
|
|
|
def getMovieTrailers(imdbId):
    '''
    List the trailers linked from the first "video-gallery" div of the
    "trailers" page.

    Each entry is a dict with 'title' (link text), 'url' (absolute link),
    'iframe' (player URL built from the "vi..." id in the link) and 'flv'
    (unquoted "file" variable found in the player page). One extra page is
    fetched per trailer.

    Returns an empty list when the page has no video gallery.
    '''
    page = getUrlUnicode("%s/trailers" % getUrlBase(imdbId))
    galleries = BeautifulSoup(page)('div', {'class':"video-gallery"})
    if not galleries:
        return []

    found = []
    for anchor in galleries[0]('a'):
        label = stripTags(unicode(anchor)).strip()
        link = 'http://www.imdb.com' + anchor['href']
        videoId = findRe(link, r'/(vi\d*?)/')
        playerUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
        player = getUrlUnicode(playerUrl)
        flv = unquote(findRe(player, r'addVariable\("file", "(.*?)"'))
        found.append({'title': label, 'url': link, 'iframe': playerUrl, 'flv': flv})
    return found
|
2008-04-28 09:52:21 +00:00
|
|
|
|
2008-04-29 22:15:28 +00:00
|
|
|
def getMovieQuotes(imdbId):
    '''
    Return (speaker, text) tuples scraped from the "quotes" page.

    Only the part of the page selected by findString(data, '<a name="q')
    is searched; pairs are matched as "<b>speaker</b>:text<br>".
    '''
    page = getUrlUnicode("%s/quotes" % getUrlBase(imdbId))
    quote_re = re.compile('<b>(.*?)</b>:(.*?)<br>', re.DOTALL)
    matches = quote_re.findall(findString(page, '<a name="q'))
    return [(speaker.strip(), text.strip()) for speaker, text in matches]
|
2008-04-29 22:15:28 +00:00
|
|
|
|
2008-05-23 11:08:40 +00:00
|
|
|
def getMoviePlot(imdbId):
    '''
    Return the text between the first <p class="plotpar"> and the next <i>
    tag on the "plotsummary" page, as found by findRe.
    '''
    page = getUrlUnicode("%s/plotsummary" % getUrlBase(imdbId))
    return findRe(page, '<p class="plotpar">(.*?)<i>')
|
2008-05-23 11:08:40 +00:00
|
|
|
|
2008-04-29 22:15:28 +00:00
|
|
|
def getMovieTechnical(imdbId):
    '''
    Scrape the "technical" page into a dict mapping each <h5> heading to the
    raw markup that follows it up to the next "<br/>" (both stripped).
    '''
    page = getUrlUnicode("%s/technical" % getUrlBase(imdbId))
    entry_re = re.compile('<h5>(.*?)</h5>(.*?)<br/>', re.DOTALL)
    specs = {}
    for heading, value in entry_re.findall(page):
        specs[heading.strip()] = value.strip()
    return specs
|
2008-04-29 22:15:28 +00:00
|
|
|
|
|
|
|
def getMovieCompanyCredits(imdbId):
    '''
    Scrape the "companycredits" page into a dict mapping each <h2> heading
    to the list of raw <li> contents of the <ul> that follows it.
    '''
    group_re = re.compile('<h2>(.*?)</h2><ul>(.*?)</ul>')
    item_re = re.compile('<li>(.*?)</li>')

    page = getUrlUnicode("%s/companycredits" % getUrlBase(imdbId))
    companies = {}
    for heading, items in group_re.findall(page):
        companies[heading.strip()] = item_re.findall(items)
    return companies
|
2008-04-29 22:15:28 +00:00
|
|
|
|
|
|
|
def getMovieLocations(imdbId):
    '''
    Return the decoded text of every link on the "locations" page whose href
    starts with "/List".
    '''
    page = getUrlUnicode("%s/locations" % getUrlBase(imdbId))
    anchors = BeautifulSoup(page)('a', {'href': re.compile('^/List')})
    return [decodeHtml(anchor.string) for anchor in anchors]
|
2008-04-29 22:15:28 +00:00
|
|
|
|
|
|
|
def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
    '''
    Scrape the "mediaindex" page once per key (passed as the "refine" query
    parameter) and collect the images found.

    Returns {key: {image_url: alt_text}}. Image URLs are cut at "._V" and
    given a ".jpg" extension. For 'still_frame', images whose source URL
    contains "_CR0" are skipped.
    '''
    image_re = re.compile('''<img alt="(.*?)".*?src="(http://ia.media-imdb.com/.*?.jpg)''')

    photos = {}
    for key in keys:
        page = getUrlUnicode("%s/mediaindex?refine=%s" % (getUrlBase(imdbId), key))
        found = photos[key] = {}
        for alt, src in image_re.findall(page):
            if key == 'still_frame' and "_CR0" in src:
                continue
            found["%s.jpg" % src.split('._V')[0]] = alt
    return photos
|
2008-04-29 22:15:28 +00:00
|
|
|
|
2008-04-29 19:09:10 +00:00
|
|
|
def getMovieStills(imdbId):
    '''
    Return the {image_url: alt_text} dict of 'still_frame' images.
    '''
    images = getMovieImages(imdbId, ['still_frame'])
    return images['still_frame']
|
2008-04-29 22:15:28 +00:00
|
|
|
|
|
|
|
def getMoviePosters(imdbId):
    '''
    Return the {image_url: alt_text} dict of 'poster' images, with the
    poster from the main title page (if any) added as 'main poster'.
    '''
    posters = getMovieImages(imdbId, ['poster'])['poster']
    main = getMoviePoster(imdbId)
    if not main:
        return posters
    posters[main] = 'main poster'
    return posters
|
2008-05-10 08:29:15 +00:00
|
|
|
|
2008-04-29 22:15:28 +00:00
|
|
|
def getMovieTrivia(imdbId):
    '''
    Return the trivia items from the "trivia" page as a list of strings.

    Items are the <li> elements of every <ul class="trivia">. "<br />" tags
    are removed, the enclosing <li>...</li> is stripped when present and
    HTML entities are decoded; other inline markup is kept.
    '''
    url = "%s/trivia" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
    soup = BeautifulSoup(data)
    trivia = []
    for i in soup('ul', {'class': "trivia"}):
        for t in i('li'):
            t = unicode(t).replace('<br />', '').strip()
            if t.startswith('<li>') and t.endswith('</li>'):
                t = t[4:-5].strip()
            # NOTE(review): the original nesting was lost in the capture this
            # was rebuilt from; every item is assumed to be appended, with
            # only the wrapper stripping being conditional - confirm
            trivia.append(decodeHtml(t))
    return trivia
|
2008-04-29 19:09:10 +00:00
|
|
|
|
2008-04-30 13:31:50 +00:00
|
|
|
def getMovieConnections(imdbId):
    '''
    Scrape the "movieconnections" page into a dict mapping each <h5>
    heading to the list of 7-digit title ids linked before the next blank
    line.

    The page is fetched with getUrl (not getUrlUnicode); headings are
    converted with unicode() before being used as keys.
    '''
    page = getUrl("%s/movieconnections" % getUrlBase(imdbId))
    group_re = re.compile('''<h5>(.*?)</h5>(.*?)\n\n''', re.DOTALL)
    link_re = re.compile(r'''<a href="/title/tt(\d{7})/">''')

    related = {}
    for heading, links in group_re.findall(page):
        related[unicode(heading)] = link_re.findall(links)
    return related
|
2008-04-30 13:31:50 +00:00
|
|
|
|
|
|
|
def getMovieKeywords(imdbId):
    '''
    Return the text of every link to "/keyword..." on the "keywords" page,
    with HTML entities decoded and non-breaking spaces replaced by spaces.
    '''
    page = getUrlUnicode("%s/keywords" % getUrlBase(imdbId))
    link_re = re.compile('''<a.*?href="/keyword.*?>(.*?)</a>''')
    return [decodeHtml(word).replace(u'\xa0', ' ')
            for word in link_re.findall(page)]
|
2008-04-30 13:31:50 +00:00
|
|
|
|
|
|
|
def getMovieExternalReviews(imdbId):
    '''
    Scrape the "externalreviews" page into a dict of link href -> link text.

    Only the <li> items of the first <ol> on the page are considered; items
    without a link, or whose link has no content, are skipped.

    Returns {} when the page has no <ol>.
    '''
    url = "%s/externalreviews" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
    soup = BeautifulSoup(data)
    ol = soup('ol')
    if not ol:
        return {}

    ret = {}
    for li in ol[0]('li'):
        try:
            a = li('a')[0]
            txt = a.contents[0]
        except IndexError:
            # no <a> in this item, or an empty <a>: nothing to record
            continue
        ret[a.get('href')] = txt
    return ret
|
2008-04-30 13:31:50 +00:00
|
|
|
|
2008-05-25 17:29:14 +00:00
|
|
|
def getMovieReleaseDate(imdbId):
    '''
    Return the smallest date (second field) among getMovieReleaseDates(),
    or None when there are no release dates.

    Dates are compared as returned by getMovieReleaseDates; a falsy value
    picked up along the way is replaced by the next date seen.
    '''
    earliest = None
    for release in getMovieReleaseDates(imdbId):
        date = release[1]
        if earliest and not date < earliest:
            continue
        earliest = date
    return earliest
|
2008-05-25 17:29:14 +00:00
|
|
|
|
2008-07-01 12:40:36 +00:00
|
|
|
def _parseDate(d):
|
2008-10-04 15:17:13 +00:00
|
|
|
'''
|
|
|
|
>>>_parseDate('3 March 1972')
|
|
|
|
'1972-03-03'
|
|
|
|
'''
|
2008-07-01 12:40:36 +00:00
|
|
|
try:
|
|
|
|
parsed_date = time.strptime(d, "%d %B %Y")
|
2008-10-04 13:57:23 +00:00
|
|
|
parsed_date = '%s-%02d-%02d' % (parsed_date.tm_year, parsed_date.tm_mon, parsed_date.tm_mday)
|
2008-07-01 12:40:36 +00:00
|
|
|
return parsed_date
|
|
|
|
except:
|
|
|
|
try:
|
|
|
|
parsed_date = time.strptime(d, "%B %Y")
|
2008-10-04 13:57:23 +00:00
|
|
|
parsed_date = '%s-%02d-01' % (parsed_date.tm_year, parsed_date.tm_mon)
|
2008-07-01 12:40:36 +00:00
|
|
|
return parsed_date
|
|
|
|
except:
|
|
|
|
pass
|
|
|
|
try:
|
|
|
|
parsed_date = time.strptime(d, "%Y")
|
2008-10-04 13:57:23 +00:00
|
|
|
parsed_date = '%s-01-01' % (parsed_date.tm_year)
|
2008-07-01 12:40:36 +00:00
|
|
|
return parsed_date
|
|
|
|
except:
|
|
|
|
pass
|
|
|
|
return d
|
|
|
|
|
2008-05-25 17:29:14 +00:00
|
|
|
def getMovieReleaseDates(imdbId):
    '''
    Scrape the "releaseinfo" page into a list of 3-tuples, one per table row.

    Each tuple holds the three cells of the row with tags stripped: the
    first as is, the second passed through _parseDate, the third with HTML
    entities decoded.
    '''
    row_re = re.compile(
        '''<tr><td>(.*?)</td>.*?<td align="right">(.*?)</td>.*?<td>(.*?)</td></tr>''',
        re.DOTALL)
    page = getUrlUnicode("%s/releaseinfo" % getUrlBase(imdbId))
    return [(stripTags(first).strip(),
             _parseDate(stripTags(second).strip()),
             decodeHtml(stripTags(third).strip()))
            for first, second, third in row_re.findall(page)]
|
2008-05-25 17:29:14 +00:00
|
|
|
|
|
|
|
def getMovieBusinessSum(imdbId):
    '''
    Summarize getMovieBusiness() as {'budget', 'gross', 'profit'} integers.

    'budget' is taken from the first budget entry starting with '$' (or the
    first entry at all when none does), 'gross' from the first gross entry;
    entries are not summed. 'profit' is gross - budget and is only computed
    when both are non-zero. A missing or non-numeric value stays 0.
    '''
    def firstAmount(entries):
        # integer found by intValue in the first entry, or 0 when there is
        # no entry or no number in it
        if not entries:
            return 0
        amount = intValue(entries[0].replace(',', ''))
        if not amount:
            return 0
        return int(amount)

    business = getMovieBusiness(imdbId)
    b_ = {'budget': 0, 'gross': 0, 'profit': 0}

    if 'budget' in business:
        # a list comprehension, so the result can be truth-tested and indexed
        budget = [x for x in business['budget'] if x.startswith('$')]
        if not budget:
            budget = business['budget']
        b_['budget'] = firstAmount(budget)

    if 'gross' in business:
        b_['gross'] = firstAmount(business['gross'])

    if b_['budget'] and b_['gross']:
        b_['profit'] = b_['gross'] - b_['budget']
    return b_
|
2008-05-25 17:29:14 +00:00
|
|
|
|
|
|
|
def getMovieFlimingDates(imdbId):
    '''
    Return the first 'filming dates' entry of getMovieBusiness(), or '' when
    there is none.
    '''
    # NOTE: the misspelled function name is kept for existing callers
    business = getMovieBusiness(imdbId)
    if 'filming dates' not in business:
        return ''
    dates = business['filming dates']
    if not dates:
        return ''
    return dates[0]
|
2008-05-25 17:29:14 +00:00
|
|
|
|
|
|
|
def getMovieBusiness(imdbId):
    '''
    Scrape the "business" page into a dict mapping each lower-cased <h5>
    heading to a list of strings.

    A section runs from its heading to the next "<br/>", one character,
    "<br/>" sequence; its body is split at "<br/>" and each piece has tags
    stripped and HTML entities decoded.
    '''
    section_re = re.compile('''<h5>(.*?)</h5>(.*?)<br/>.<br/>''', re.DOTALL)
    page = getUrlUnicode("%s/business" % getUrlBase(imdbId))

    sections = {}
    for heading, body in section_re.findall(page):
        label = stripTags(heading).strip().lower()
        sections[label] = [decodeHtml(stripTags(piece).strip())
                           for piece in body.split('<br/>')]
    return sections
|
2008-05-25 17:29:14 +00:00
|
|
|
|
|
|
|
def getMovieEpisodes(imdbId):
    '''
    Scrape the "episodes" page into a dict keyed by "SxxEyy".

    Each value is a dict with 'imdb' (title id of the episode), 'title'
    (u'' when it is just the generic "Episode #<season>..." label),
    'description' and 'date' ("YYYY-MM-DD", or '' when the air date cannot
    be parsed).

    Parsing is best effort: if an episode entry fails, the traceback is
    printed and the remaining episodes are still processed.
    '''
    import traceback

    url = "%s/episodes" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
    episodes = {}
    regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
    for r in re.compile(regexp, re.DOTALL).findall(data):
        try:
            season = int(r[0])
            episode = "S%02dE%02d" % (season, int(r[1]))
            entry = episodes[episode] = {}
            entry['imdb'] = r[2]
            entry['title'] = r[3].strip()
            if entry['title'].startswith('Episode #%d' % season):
                entry['title'] = u''
            description = decodeHtml(r[5])
            description = stripTags(description.split('Next US airings:')[0])
            entry['description'] = description.strip()
            entry['date'] = ''
            try:
                d = stripTags(r[4])
                d = d.replace('Original Air Date: ', '')
                d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
                entry['date'] = d
            except ValueError:
                # air date missing or in another format: keep ''
                pass
        except Exception:
            # print_exc() writes the traceback itself and returns None
            traceback.print_exc()
    return episodes
|
2008-05-25 17:29:14 +00:00
|
|
|
|
2008-04-29 19:09:10 +00:00
|
|
|
'''the old code below'''
|
2008-04-28 09:52:21 +00:00
|
|
|
|
|
|
|
class IMDb:
    '''
    Older, class-based scraper for one IMDb title.

    parse() builds a dictionary from the main title page plus the
    module-level scraper functions; getCredits() returns a simplified
    credits dictionary.
    '''

    def __init__(self, imdbId):
        '''Remember the id and build the title page URL; fetches nothing.'''
        self.imdb = imdbId
        self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb

    def getPage(self):
        '''Return the title page as fetched by getUrlUnicode.'''
        return getUrlUnicode(self.pageUrl)

    def parse_raw_value(self, key, value):
        '''
        Convert the raw HTML of one info section into its parsed value.

        key is the normalized section name (lower case, underscores).
        Returns, by key:
          runtime            -- int: minutes * 60, or the seconds value, or 0
          country, language  -- list of names split at ' / ' or ' | '
          genre              -- like country, with 'more' removed first
          tagline            -- text with 'more' removed
          plot_outline       -- text without '(view trailer)' / trailing 'more'
          tv_series          -- title id of the first linked title, or ''
          also_known_as      -- the "International: English title" or "USA"
                                title, with the first director's name removed
          anything else      -- value unchanged (and printed)
        '''
        if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
            value = stripTags(value).strip()

        if key == 'runtime':
            minutes = findRe(findRe(value, '(.*?) min'), '([0-9]+)')
            if minutes:
                parsed_value = int(minutes) * 60
            else:
                seconds = findRe(findRe(value, '(.*?) sec'), '([0-9]+)')
                if seconds:
                    parsed_value = int(seconds)
                else:
                    parsed_value = 0
        elif key in ('country', 'language'):
            parsed_value = value.split(' / ')
            if len(parsed_value) == 1:
                parsed_value = parsed_value[0].split(' | ')
            parsed_value = [v.strip() for v in parsed_value]
        elif key == 'genre':
            parsed_value = value.replace('more', '').strip().split(' / ')
            if len(parsed_value) == 1:
                parsed_value = parsed_value[0].split(' | ')
            parsed_value = [v.strip() for v in parsed_value]
        elif key == 'tagline':
            parsed_value = value.replace('more', '').strip()
        elif key == 'plot_outline':
            parsed_value = value.replace('(view trailer)', '').strip()
            if parsed_value.endswith('more'):
                parsed_value = parsed_value[:-4].strip()
        elif key == 'tv_series':
            m = re.compile('<a href="/title/tt(.*?)/">(.*?)</a>').findall(value)
            if m:
                parsed_value = m[0][0]
            else:
                parsed_value = ''
        elif key == 'also_known_as':
            parsed_value = ''
            m = re.compile(r'(.*) \(International: English title').findall(value)
            if not m:
                m = re.compile(r'(.*) \(USA').findall(value)
            if m:
                parsed_value = m[0]
            # keep the last <br />-separated entry, up to its first "("
            parsed_value = parsed_value.split('<br />')[-1].split('(')[0]
            director = self.getCredits().get('director', None)
            if director:
                # turn e.g. "<Director>'s <Title>" into "<Title>"
                director = director[0]
                parsed_value = parsed_value.replace(director, '')
                if parsed_value.startswith("'s"):
                    parsed_value = parsed_value[2:].strip()
            parsed_value = decodeHtml(parsed_value.strip())
        else:
            # unknown key: echo the raw value and pass it through unchanged
            print(value)
            parsed_value = value
        return parsed_value

    def parseYear(self):
        '''
        Return the 4-digit year found in the page title as "(YYYY)" or
        "(YYYY/", or '' when there is none.
        '''
        year = ''
        data = self.getPage()
        soup = BeautifulSoup(data)
        html_title = soup('div', {'id': 'tn15title'})
        if not html_title:
            html_title = soup('title')
        if html_title:
            html_title = unicode(html_title[0])
            html_title = stripTags(html_title)
            year = re.compile(r'\((\d{4})\)').findall(html_title)
            if not year:
                year = re.compile(r'\((\d{4})/').findall(html_title)
            if year:
                year = year[0]
            else:
                year = ''
        return year

    def parse(self):
        '''
        Build, store in self.IMDbDict and return the full dictionary for
        this title.

        Fetches the main page and calls the module-level scrapers for
        episodes, plot, keywords, trivia, connections, locations, release
        date, business, reviews and stills. 'rating' is the page rating
        multiplied by 1000 (-1 when missing) and 'votes' is -1 when missing.
        '''
        data = self.getPage()
        IMDbDict = {}
        #Poster
        IMDbDict['poster'] = getMoviePoster(self.imdb)
        if not IMDbDict['poster']:
            IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
        #Title, Year
        IMDbDict['year'] = self.parseYear()
        IMDbDict['title'] = getMovieTitle(self.imdb)

        #Rating
        m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
        if m:
            IMDbDict['rating'] = int(float(m.group(1)) * 1000)
        else:
            IMDbDict['rating'] = -1
        #Votes
        m = re.compile(r'<small>\(<a href="ratings">(.*?) votes</a>\)</small>', re.IGNORECASE).findall(data)
        if m:
            IMDbDict['votes'] = int(m[0].replace(',', ''))
        else:
            IMDbDict['votes'] = -1

        data = data.replace('\n', ' ')
        #some values
        keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
        for key in keys:
            IMDbDict[key] = ''
        IMDbDict['runtime'] = 0
        soup = BeautifulSoup(data)
        for info in soup('div', {'class': 'info'}):
            parts = unicode(info).split('</h5>')
            key = parts[0].split('<h5>')
            if len(key) > 1:
                raw_value = parts[1]
                # "<h5>Also Known As:" -> "also_known_as"
                key = key[1][:-1].lower().replace(' ', '_')
                if key in keys:
                    IMDbDict[key] = self.parse_raw_value(key, raw_value)
        # 'also_known_as' always exists (seeded with '' above), so a pop()
        # default would never be used; fall back on the title explicitly
        IMDbDict['title_english'] = IMDbDict.pop('also_known_as', '') or IMDbDict['title']
        #is episode
        IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')

        IMDbDict['episodes'] = getMovieEpisodes(self.imdb)
        if IMDbDict['episodes']:
            IMDbDict['tvshow'] = True
        else:
            IMDbDict['tvshow'] = False
        IMDbDict['credits'] = self.getCredits()
        IMDbDict['plot'] = getMoviePlot(self.imdb)
        IMDbDict['keywords'] = getMovieKeywords(self.imdb)
        IMDbDict['trivia'] = getMovieTrivia(self.imdb)
        IMDbDict['connections'] = getMovieConnections(self.imdb)
        IMDbDict['locations'] = getMovieLocations(self.imdb)
        IMDbDict['release_date'] = getMovieReleaseDate(self.imdb)
        IMDbDict['business'] = getMovieBusinessSum(self.imdb)
        IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
        IMDbDict['stills'] = getMovieStills(self.imdb)
        self.IMDbDict = IMDbDict

        if IMDbDict['episode_of']:
            # episodes inherit country/language from their series when the
            # episode page does not list them
            episode_of = getMovieInfo(IMDbDict['episode_of'])
            for key in ('country', 'language'):
                if not IMDbDict[key]:
                    IMDbDict[key] = episode_of.get(key, IMDbDict[key])
        return self.IMDbDict

    def getCredits(self):
        '''
        Return (and store in self.credits) a dict with 'director', 'writer'
        and 'producer' as lists of names and 'cast' as a list of
        (name, character) tuples, built from getMovieCredits().
        '''
        raw_credits = getMovieCredits(self.imdb)
        credits = {}

        def getNames(entries):
            return [stripTags(decodeHtml(c[0])) for c in entries]

        credits['director'] = getNames(raw_credits.get('directors', ''))
        credits['writer'] = getNames(raw_credits.get('writers', ''))
        credits['producer'] = getNames(raw_credits.get('producers', ''))
        credits['cast'] = [(stripTags(decodeHtml(c[0])), stripTags(decodeHtml(c[1])))
                           for c in raw_credits.get('cast', [])]

        self.credits = credits
        return self.credits
|
2008-04-29 16:12:27 +00:00
|
|
|
|
2008-04-28 09:52:21 +00:00
|
|
|
|
2008-06-19 09:47:02 +00:00
|
|
|
def _fetchImdbSearch(url):
    '''
    Request url with oxlib's default headers.

    Returns a (body, final_url) tuple, or None when the request fails. The
    response is always closed.
    '''
    try:
        req = urllib2.Request(url, None, oxlib.net.DEFAULT_HEADERS)
        u = urllib2.urlopen(req)
        try:
            return u.read(), u.url
        finally:
            u.close()
    except Exception:
        # best-effort lookup: any fetch problem means "no answer"
        return None


def guess(title, director=''):
    '''
    Guess the IMDb id for a (file name like) title.

    The title is cut at the first '-', '(' and '.', then looked up in
    order through:
      1. a site:imdb.com Google search (with the director, if given),
      2. the IMDb search page: a redirect to a title page, or the first
         entry under "Popular Results",
      3. the IMDb "aka" title search, when it redirects to a title page.

    Returns the id as a 7-character string, or None when nothing is found
    or an IMDb request fails.
    '''
    title_prefix = 'http://www.imdb.com/title/tt'

    #FIXME: proper file -> title
    title = title.split('-')[0]
    title = title.split('(')[0]
    title = title.split('.')[0]
    title = title.strip()
    quoted_title = quote(title.encode('utf-8'))

    #lest first try google
    #i.e. site:imdb.com Michael Stevens Sin
    if director:
        search = 'site:imdb.com %s "%s"' % (director, title)
    else:
        search = 'site:imdb.com "%s"' % title
    for (name, url, desc) in google.find(search, 2):
        if url.startswith(title_prefix):
            return normalizeImdbId(int(oxlib.intValue(url)))

    response = _fetchImdbSearch('http://www.imdb.com/find?q=%s' % quoted_title)
    if response is None:
        return None
    data, return_url = response
    if return_url.startswith(title_prefix):
        # the search redirected straight to a title page
        return return_url[28:35]
    if data:
        imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
        if imdb_id:
            return imdb_id

    # a failure here returns None, like a failure of the first request
    response = _fetchImdbSearch('http://www.imdb.com/find?q=%s;s=tt;site=aka' % quoted_title)
    if response is None:
        return None
    data, return_url = response
    if return_url.startswith(title_prefix):
        return return_url[28:35]

    return None
|
|
|
|
|
|
|
|
def getEpisodeData(title, episode, show_url = None):
    '''
    Collect information about an episode.

    title is the show title, episode the key used by getMovieEpisodes()
    (e.g. "S01E02"). When show_url is given the show id is read from its
    "title/tt<digits>" part, otherwise it is guessed from the title.

    Returns dict with title, show, description and episode; 'imdb' (the
    episode's title id) is added when the episode is found. When the show
    or the episode cannot be found, the title and description stay u''.
    '''
    episodeData = {
        'title': u'',
        'show': title,
        'description': u'',
        'episode': episode,
    }

    imdbid = None
    if not show_url:
        imdbid = guess(title)
    else:
        # \d+ so that int() never sees an empty match
        match = re.search(r'title/tt(\d+)', show_url)
        if match:
            imdbid = "%07d" % int(match.group(1))

    if imdbid:
        found = IMDb(imdbid).parse()['episodes'].get(episode)
        if found:
            episodeData['title'] = found.get('title', u'')
            episodeData['description'] = found.get('description', u'')
            if 'imdb' in found:
                episodeData['imdb'] = found['imdb']
    return episodeData
|
2008-04-28 09:52:21 +00:00
|
|
|
|
2008-09-30 14:00:21 +00:00
|
|
|
def getPersonData(imdbId):
    '''
    Scrape a person page ("/name/nm<id>/") into a dictionary.

    Returns {'name': <content of the page <title>>,
             'movies': {section: [7-digit title ids]}}
    where the sections are the '<div class="filmo"' blocks that appear
    before the "Additional Details" heading.
    '''
    imdbId = normalizeImdbId(imdbId)
    page = getUrlUnicode(u'http://www.imdb.com/name/nm%s/' % imdbId)
    title_re = re.compile(u'href="/title/tt(\\d{7})/"')

    filmography = page.split(u'<h3>Additional Details</h3>')[0]
    movies = {}
    # the text before the first filmo block is not a section
    for block in filmography.split(u'<div class="filmo"')[1:]:
        heading = decodeHtml(findRe(block, u'a name=".*?">(.*?):</a></h5>'))
        movies[heading] = title_re.findall(block)

    info = dict()
    info['name'] = findRe(page, u'<title>(.*?)</title>')
    info['movies'] = movies
    return info
|
|
|
|
|
2008-04-28 09:52:21 +00:00
|
|
|
if __name__ == '__main__':
    # command-line use: print the guessed IMDb id for the given title
    import sys
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s <title>\n" % sys.argv[0])
        sys.exit(1)
    print("imdb: %s" % guess(sys.argv[1]))
|
|
|
|
|