- changes to imdb.py

* user more oxutils functions
  * start migrating to a raw dict, first part, parse full cast with names from imdb
  * add getMovieId
This commit is contained in:
j 2008-04-29 18:12:27 +02:00
parent 7a53ee62b9
commit 69adaeee00
3 changed files with 165 additions and 149 deletions

View File

@ -4,5 +4,7 @@
__version__ = '0.1.0'
from net import *
import imdb
import wikipedia
import google

View File

@ -17,7 +17,6 @@ from oxutils import stripTags
usage:
import google
google.find(query)
<generator object at 0x833aeac>
for result in google.find(query): result

View File

@ -12,19 +12,102 @@ import time
from BeautifulSoup import BeautifulSoup
import chardet
import oxutils
from oxutils import stripTags, htmldecode
from oxutils import stripTags, htmldecode, findRegexp
from oxutils.cache import getUrl, getUrlUnicode
from oxutils.normalize import normalizeTitle
from oxutils.normalize import normalizeTitle, normalizeImdbId
import google
def _get_data(url):
data = None
try:
data = getUrl(url)
except:
print "error reading data from", url
return data
def getMovieId(title, director='', year=''):
if year:
title = "%s (%s)" % (title, year)
if director:
query = 'site:imdb.com %s "%s"' % (director, title)
else:
query = 'site:imdb.com "%s"' % title
for (name, url, desc) in google.find(query, 3):
if url.startswith('http://www.imdb.com/title/tt'):
return url[28:35]
def getMovieData(imdbId):
return IMDb(imdbId).parse()
# internal functions below
def getUrlBase(imdbId):
return "http://www.imdb.com/title/tt%s" % imdbId
def getRawMovieData(imdbId):
imdbId = normalizeImdbId(imdbId)
data = dict()
data['credits'] = parseCredits(imdbId)
data['poster'] = findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')
def parseBase(imdbId):
data = getUrl(getUrlBase(imdbId))
soup = BeautifulSoup(data)
info = dict()
for i in soup('div', {'class':'info'}):
title = i('h5')
if title:
title=title[0]
txt = title.findNext()
title = stripTags(unicode(title))
if title.endswith(':'):
title = title[:-1]
info[title] = htmldecode(stripTags(unicode(txt)))
return info
return soup
def getTitle(imdbId):
title = ''
data = getUrl(getUrlBase(imdbId))
soup = BeautifulSoup(data)
html_title = soup('div', {'id': 'tn15title'})
if not html_title:
html_title = soup('title')
if html_title:
html_title = str(html_title[0])
html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
title = htmldecode(html_title)
title = stripTags(title)
title = re.sub('\(\d\d\d\d\)', '', title)
title = re.sub('\(\d\d\d\d/I*\)', '', title)
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
title = title.replace(t, '')
title = title.strip()
if title.find(u'\xa0') > -1:
title = title[:title.find(u'\xa0')]
if title.startswith('"') and title.endswith('"'):
title = title[1:-1]
return title
def creditList(data, section=None):
if section == 'cast':
credits_ = re.compile('''<tr .*?<td class="nm">(.*?)</td><td class="ddd">.*?</td><td class="char">(.*?)</td></tr>''').findall(data)
else:
credits_ = re.compile('''<tr>.*?<td valign="top">(.*?)</td><td.*?</td><td valign="top">(.*?)</td></tr>''').findall(data)
credits = []
for c_ in credits_:
c = [c_[0].strip(), c_[1].strip()]
if section=='writers':
c[1] = c[1].replace('<br>', '').strip().replace(')', '').replace('(','')
if c[1].endswith(' and'): c[1] = c[1][:-4]
credits.append(c)
return credits
def parseCredits(imdbId):
credits = dict()
url = "%s/fullcredits" % getUrlBase(imdbId)
data = getUrlUnicode(url)
groups = data.split('<h5>')
for g in groups:
section = re.compile('''name="(.*?)".*? href="/Glossary''').findall(g)
if section:
credits[section[0]] = creditList(g, section[0])
return credits
'''the old code below'''
def get_image(url):
return getUrl(url)
@ -42,62 +125,36 @@ def _castList(data, regexp):
return names
return []
def _getTerm(data, regexp):
term = ''
try:
reg = re.compile(regexp, re.IGNORECASE)
m = reg.search(data)
if m:
term = stripTags(m.group(1)).strip()
except:
print "waring, parsing failed for", regexp
return term.encode('utf8')
class IMDb:
def __init__(self, imdb):
self.imdb = imdb
self.pageSource = None
def __init__(self, imdbId):
self.imdb = imdbId
self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
self.businessSource = None
self.businessUrl = "%sbusiness" % self.pageUrl
self.connectionsSource = None
self.connectionsUrl = "%smovieconnections" % self.pageUrl
self.creditsSource = None
self.creditsUrl = "%sfullcredits" % self.pageUrl
self.episodesSource = None
self.episodesUrl = "%sepisodes" % self.pageUrl
self.keywordSource = None
self.keywordUrl = "%skeywords" % self.pageUrl
self.plotSource = None
self.plotUrl = "%splotsummary" % self.pageUrl
self.releaseinfoSource = None
self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
self.triviaSource = None
self.triviaUrl = "%strivia" % self.pageUrl
self.locationSource = None
self.locationUrl = "%slocations" % self.pageUrl
self.externalreviewsSource = None
self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
self.trailerSource = None
self.trailerUrl = "%strailers" % self.pageUrl
def getPage(self, forcereload = False):
if forcereload or not self.pageSource:
self.pageSource = getUrlUnicode(self.pageUrl)
return self.pageSource
def getPage(self):
return getUrlUnicode(self.pageUrl)
def parse_raw_value(self, key, value):
if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
value = unicode(value, 'utf-8')
value = stripTags(value).strip()
value = stripTags(value).strip()
if key == 'runtime':
parsed_value = _getTerm(value, '(.*?) min')
parsed_value = _getTerm(parsed_value, '([0-9]+)')
parsed_value = findRegexp(value, '(.*?) min')
parsed_value = findRegexp(parsed_value, '([0-9]+)')
if not parsed_value:
parsed_value = _getTerm(value, '(.*?) sec')
parsed_value = _getTerm(parsed_value, '([0-9]+)')
parsed_value = findRegexp(value, '(.*?) sec')
parsed_value = findRegexp(parsed_value, '([0-9]+)')
if not parsed_value:
parsed_value = 0
else:
@ -141,7 +198,7 @@ class IMDb:
print value
parsed_value = value
return parsed_value
def parseTitle(self):
title = ''
data = self.getPage()
@ -153,8 +210,8 @@ class IMDb:
html_title = str(html_title[0])
html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
title = stripTags(html_title)
title = re.sub('\(\d\d\d\d\)', '', title)
title = re.sub('\(\d\d\d\d/I*\)', '', title)
title = re.sub('\(\d{4}\)', '', title)
title = re.sub('\(\d{4}/I*\)', '', title)
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
title = title.replace(t, '')
if title.find(u'\xa0') > -1:
@ -172,7 +229,7 @@ class IMDb:
else:
title = normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:]
return normalizeTitle(title)
def parseYear(self):
year = ''
data = self.getPage()
@ -183,25 +240,25 @@ class IMDb:
if html_title:
html_title = str(html_title[0])
html_title = stripTags(html_title)
year = re.compile('\((\d\d\d\d)\)').findall(html_title)
year = re.compile('\((\d{4})\)').findall(html_title)
if not year:
year = re.compile('\((\d\d\d\d)/').findall(html_title)
if year:
year = re.compile('\((\d{4})/').findall(html_title)
if year:
year = year[0]
else: year = ''
return year
def parse(self):
data = self.getPage()
IMDbDict ={}
#Poster
IMDbDict['poster'] = _getTerm(data, 'name="poster".*?<img .*?src="(.*?)"')
IMDbDict['poster'] = findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')
if not IMDbDict['poster']:
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
#Title, Year
IMDbDict['year'] = self.parseYear()
IMDbDict['title'] = self.parseTitle()
#Rating
m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
if m:
@ -251,49 +308,32 @@ class IMDb:
IMDbDict['stills'] = getMovieStills(self.imdb)
#IMDbDict['trailer'] = self.parseTrailer()
self.IMDbDict = IMDbDict
if IMDbDict['episode_of']:
episode_of =IMDb(IMDbDict['episode_of']).parse()
for key in ('country', 'language'):
if not IMDbDict[key]:
IMDbDict[key] = episode_of[key]
return self.IMDbDict
def getCredits(self, forcereload = False):
if forcereload or not self.creditsSource:
self.creditsSource = getUrlUnicode(self.creditsUrl)
return self.creditsSource
def parseCredits(self):
data = self.getCredits()
raw_credits = parseCredits(self.imdb)
credits = {}
credits['director'] = _castList(data, 'Directed by.*?(<tr>.*?)</table>')
credits['writer'] = _castList(data, 'Writing credits.*?(<tr>.*?)</table>')
credits['producer'] = _castList(data, 'Produced by.*?(<tr>.*?)</table>')
#credits['cast'] = _castList(data, 'Cast</b>.*?(<tr.*?)</table>')
credits['cast'] = []
soup = re.compile('Cast</b>.*?(<tr.*?)</table>').findall(data)
soup = BeautifulSoup(data)
cast = soup('table', {'class': 'cast'})
if cast:
cast = str(cast[0]).replace(u'\xa0', ' ')
names = re.compile('<a href="/name/nm.*?/">(.*?)</a>.*?</td><td class="char">(.*?)</td></tr>').findall(cast)
for name in names:
real_name = name[0]
role_name = name[1]
if role_name:
role_name = role_name.split('(')[0].replace('/ ...','')
credits['cast'].append((stripTags(real_name), stripTags(role_name)))
def getNames(creditList):
return [stripTags(c[0]) for c in creditList]
credits['director'] = getNames(raw_credits['directors'])
credits['writer'] = getNames(raw_credits['writers'])
credits['producer'] = getNames(raw_credits['producers'])
credits['cast'] = [(stripTags(c[0]),stripTags(c[1])) for c in raw_credits['cast']]
self.credits = credits
return self.credits
def getPlot(self, forcereload = False):
if forcereload or not self.plotSource:
self.plotSource = getUrlUnicode(self.plotUrl)
return self.plotSource
def parsePlot(self):
soup = BeautifulSoup(self.getPlot())
data = getUrlUnicode(self.plotUrl)
soup = BeautifulSoup(data)
plot = soup('p', {'class':'plotpar'})
if plot:
plot = unicode(plot[0]).split('<i>')[0]
@ -302,15 +342,11 @@ class IMDb:
plot = stripTags(plot).strip()
self.plot = plot
return plot
def getEpisodes(self, forcereload = False):
if forcereload or not self.episodesSource:
self.episodesSource = getUrlUnicode(self.episodesUrl)
return self.episodesSource
def parseEpisodes(self):
episodes = {}
cdata = self.getEpisodes().replace('\r\n', ' ')
data = getUrlUnicode(self.episodesUrl)
cdata = data.replace('\r\n', ' ')
regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
reg = re.compile(regexp, re.IGNORECASE)
m = reg.findall(cdata)
@ -340,26 +376,18 @@ class IMDb:
self.episodes = episodes
return self.episodes
def getLocations(self, forcereload = False):
if forcereload or not self.locationSource:
self.keywordSource = getUrlUnicode(self.locationUrl)
return self.keywordSource
def parseLocations(self):
soup = BeautifulSoup(self.getLocations())
data = getUrlUnicode(self.locationUrl)
soup = BeautifulSoup(data)
locations = []
for key in soup('a', {'href': re.compile('^/List')}):
locations.append(htmldecode(key.string))
self.locations = locations
return self.locations
def getKeywords(self, forcereload = False):
if forcereload or not self.keywordSource:
self.keywordSource = getUrlUnicode(self.keywordUrl)
return self.keywordSource
def parseKeywords(self):
soup = BeautifulSoup(self.getKeywords())
data = getUrlUnicode(self.keywordUrl)
soup = BeautifulSoup(data)
keywords = []
for key in soup('a', {'href': re.compile('^/keyword/')}):
k = htmldecode(key.string)
@ -368,28 +396,23 @@ class IMDb:
self.keywords = keywords
return self.keywords
def getTrivia(self, forcereload = False):
if forcereload or not self.triviaSource:
self.triviaSource = getUrlUnicode(self.triviaUrl)
return self.triviaSource
def parseTrivia(self):
data = getUrlUnicode(self.triviaUrl)
soup = BeautifulSoup(data)
trivia = []
soup = BeautifulSoup(self.getTrivia())
triviaList = []
for i in soup('ul', {'class': "trivia"}):
for t in i('li'):
t = str(t).replace('<br />', '').strip()
if t.startswith('<li>') and t.endswith('</li>'):
t = t[4:-5].strip()
t = t[4:-5].strip()
trivia.append(t)
self.trivia = trivia
return self.trivia
def getConnections(self, forcereload = False):
if forcereload or not self.connectionsSource:
self.connectionsSource = getUrlUnicode(self.connectionsUrl)
return self.connectionsSource
def getConnections(self):
return getUrlUnicode(self.connectionsUrl)
def parseConnections(self):
connections = {}
@ -404,10 +427,8 @@ class IMDb:
connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})]
return connections
def getReleaseinfo(self, forcereload = False):
if forcereload or not self.releaseinfoSource:
self.releaseinfoSource = getUrlUnicode(self.releaseinfoUrl)
return self.releaseinfoSource
def getReleaseinfo(self):
return getUrlUnicode(self.releaseinfoUrl)
def parseReleaseinfo(self):
soup = BeautifulSoup(self.getReleaseinfo())
@ -424,12 +445,10 @@ class IMDb:
except:
pass
return None
def getBusiness(self, forcereload = False):
if forcereload or not self.businessSource:
self.businessSource = getUrlUnicode(self.businessUrl)
return self.businessSource
def getBusiness(self):
return getUrlUnicode(self.businessUrl)
def parseBusiness(self):
soup = BeautifulSoup(self.getBusiness())
business = {'budget': 0, 'gross': 0, 'profit': 0}
@ -449,12 +468,10 @@ class IMDb:
if business['budget'] and business['gross']:
business['profit'] = business['gross'] - business['budget']
return business
def getExternalreviews(self, forcereload = False):
if forcereload or not self.externalreviewsSource:
self.externalreviewsSource = getUrlUnicode(self.externalreviewsUrl)
return self.externalreviewsSource
def getExternalreviews(self):
return getUrlUnicode(self.externalreviewsUrl)
def parseExternalreviews(self):
soup = BeautifulSoup(self.getExternalreviews())
ol = soup('ol')
@ -471,12 +488,10 @@ class IMDb:
pass
return ret
return {}
def getTrailer(self, forcereload = False):
if forcereload or not self.trailerSource:
self.trailerSource = getUrlUnicode(self.trailerUrl)
return self.trailerSource
def getTrailer(self):
return getUrlUnicode(self.trailerUrl)
def parseTrailer(self):
ret = {}
soup = BeautifulSoup(self.getTrailer())
@ -519,8 +534,8 @@ def guess(title, director=''):
return None
if return_url.startswith('http://www.imdb.com/title/tt'):
return return_url[28:35]
if data:
imdb_id = _getTerm(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
if data:
imdb_id = findRegexp(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
if imdb_id:
return imdb_id
@ -538,7 +553,7 @@ def guess(title, director=''):
def getEpisodeData(title, episode, show_url = None):
'''
Collect information about an episode.
Returns dict with title, show, description and episode
'''
episodeData = {
@ -559,13 +574,13 @@ def getEpisodeData(title, episode, show_url = None):
episodeData['imdb'] = i['episodes'][episode]['imdb']
return episodeData
def getMovieStills(id):
data = getUrl("http://imdb.com/gallery/ss/%s" % id)
s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://i.imdb.com/Photos/Ss/%s/th-(.*?).jpg"''' % id).findall(data)
def getMovieStills(imdbId):
data = getUrl("http://imdb.com/gallery/ss/%s" % imdbId)
s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://i.imdb.com/Photos/Ss/%s/th-(.*?).jpg"''' % imdbId).findall(data)
stills = []
for s in s_:
if int(s[0]) > int(s[1]):
stills.append("http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (id, s[2]))
stills.append("http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (imdbId, s[2]))
if not stills:
s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://(.*?)p.jpg"''').findall(data)
stills = []