- changes to imdb.py

* use more oxutils functions
  * start migrating to a raw dict, first part, parse full cast with names from imdb
  * add getMovieId
This commit is contained in:
j 2008-04-29 18:12:27 +02:00
parent 7a53ee62b9
commit 69adaeee00
3 changed files with 165 additions and 149 deletions

View file

@ -4,5 +4,7 @@
__version__ = '0.1.0' __version__ = '0.1.0'
from net import * import imdb
import wikipedia
import google

View file

@ -17,7 +17,6 @@ from oxutils import stripTags
usage: usage:
import google import google
google.find(query) google.find(query)
<generator object at 0x833aeac>
for result in google.find(query): result for result in google.find(query): result

View file

@ -12,19 +12,102 @@ import time
from BeautifulSoup import BeautifulSoup from BeautifulSoup import BeautifulSoup
import chardet import chardet
import oxutils import oxutils
from oxutils import stripTags, htmldecode from oxutils import stripTags, htmldecode, findRegexp
from oxutils.cache import getUrl, getUrlUnicode from oxutils.cache import getUrl, getUrlUnicode
from oxutils.normalize import normalizeTitle from oxutils.normalize import normalizeTitle, normalizeImdbId
import google import google
def _get_data(url): def getMovieId(title, director='', year=''):
data = None if year:
try: title = "%s (%s)" % (title, year)
data = getUrl(url) if director:
except: query = 'site:imdb.com %s "%s"' % (director, title)
print "error reading data from", url else:
return data query = 'site:imdb.com "%s"' % title
for (name, url, desc) in google.find(query, 3):
if url.startswith('http://www.imdb.com/title/tt'):
return url[28:35]
def getMovieData(imdbId):
    """Return the parsed data for the movie with the given IMDb id.

    Convenience wrapper: builds an ``IMDb`` object for ``imdbId`` and
    returns whatever its ``parse()`` method returns.
    """
    return IMDb(imdbId).parse()
# internal functions below
def getUrlBase(imdbId):
    """Return the base title URL for ``imdbId``.

    ``imdbId`` is interpolated with ``%s`` after the ``tt`` prefix, so it
    is expected without that prefix. No trailing slash is added.
    """
    return "http://www.imdb.com/title/tt%s" % imdbId
def getRawMovieData(imdbId):
    """Collect raw movie data for ``imdbId`` into a plain dict.

    Keys filled in so far:
      - ``credits``: result of ``parseCredits(imdbId)``
      - ``poster``: first match of the poster ``<img>`` pattern on the
        title page, as returned by ``findRegexp``

    Returns the dict.
    """
    imdbId = normalizeImdbId(imdbId)
    # The poster pattern has to be matched against the title page HTML;
    # previously it was applied to the (still empty) result dict and the
    # dict itself was never returned.
    page = getUrlUnicode(getUrlBase(imdbId))
    data = dict()
    data['credits'] = parseCredits(imdbId)
    data['poster'] = findRegexp(page, 'name="poster".*?<img .*?src="(.*?)"')
    return data
def parseBase(imdbId):
    """Parse the ``<div class="info">`` blocks of a title page into a dict.

    For every info div that contains an ``<h5>`` heading, the heading text
    (tags stripped, one trailing ':' removed) becomes the key and the
    tag-stripped, HTML-decoded text of the element following the heading
    becomes the value. Divs without an ``<h5>`` are skipped.
    """
    html = getUrl(getUrlBase(imdbId))
    soup = BeautifulSoup(html)
    info = dict()
    for block in soup('div', {'class':'info'}):
        headings = block('h5')
        if not headings:
            continue
        heading = headings[0]
        # The value is whatever element comes right after the heading.
        value = heading.findNext()
        key = stripTags(unicode(heading))
        if key.endswith(':'):
            key = key[:-1]
        info[key] = htmldecode(stripTags(unicode(value)))
    # An unreachable second ``return soup`` used to follow this line.
    return info
def getTitle(imdbId):
    """Return the cleaned-up title shown on the title page of ``imdbId``.

    The title is read from ``<div id="tn15title">`` or, if that is
    missing, from the ``<title>`` element. It is HTML-decoded and stripped
    of tags, then the year in parentheses, type markers such as '(TV)' or
    '(V)', anything from the first non-breaking space on, and one pair of
    surrounding double quotes are removed. Returns '' if neither element
    is found.
    """
    title = ''
    html = getUrl(getUrlBase(imdbId))
    soup = BeautifulSoup(html)
    html_title = soup('div', {'id': 'tn15title'})
    if not html_title:
        html_title = soup('title')
    if not html_title:
        return title

    html_title = str(html_title[0])
    # NOTE(review): as rendered here the second replace swaps a space for
    # a space; it presumably collapsed double spaces originally - confirm
    # against the repository.
    html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
    title = htmldecode(html_title)
    title = stripTags(title)
    # Drop the year, e.g. "(1999)" or "(1999/I)".
    title = re.sub(r'\(\d{4}\)', '', title)
    title = re.sub(r'\(\d{4}/I*\)', '', title)
    for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
        title = title.replace(t, '')
    title = title.strip()
    # Keep only the part before the first non-breaking space.
    nbsp = title.find(u'\xa0')
    if nbsp > -1:
        title = title[:nbsp]
    if title.startswith('"') and title.endswith('"'):
        title = title[1:-1]
    return title
def creditList(data, section=None):
    """Extract ``[name, role]`` pairs from one section of a credits page.

    data    -- HTML of a single credits section
    section -- section name; 'cast' selects the cast-table row pattern,
               anything else the generic two-column row pattern, and
               'writers' additionally cleans up the role text

    Both cells are returned stripped of surrounding whitespace but still
    containing any inner HTML. Returns an empty list when no row matches.
    """
    if section == 'cast':
        pattern = '''<tr .*?<td class="nm">(.*?)</td><td class="ddd">.*?</td><td class="char">(.*?)</td></tr>'''
    else:
        pattern = '''<tr>.*?<td valign="top">(.*?)</td><td.*?</td><td valign="top">(.*?)</td></tr>'''

    # Named ``result`` so the ``credits`` builtin is not shadowed.
    result = []
    for name, role in re.compile(pattern).findall(data):
        entry = [name.strip(), role.strip()]
        if section == 'writers':
            # Writer roles look like "(written by) and<br>": drop the line
            # break, the parentheses and a trailing " and" connector.
            role = entry[1].replace('<br>', '').strip().replace(')', '').replace('(','')
            if role.endswith(' and'):
                role = role[:-4]
            entry[1] = role
        result.append(entry)
    return result
def parseCredits(imdbId):
    """Return the full credits of ``imdbId`` as a dict keyed by section.

    The ``fullcredits`` page is split on ``<h5>``. Every chunk that has a
    ``name="..."`` attribute followed by a ``/Glossary`` link is treated
    as a section; its name is the key and ``creditList(chunk, name)`` the
    value. Chunks without such a marker are ignored.
    """
    url = "%s/fullcredits" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
    # Compiled once; the pattern does not change between chunks.
    section_re = re.compile('''name="(.*?)".*? href="/Glossary''')
    # Named ``result`` so the ``credits`` builtin is not shadowed.
    result = dict()
    for chunk in data.split('<h5>'):
        names = section_re.findall(chunk)
        if names:
            result[names[0]] = creditList(chunk, names[0])
    return result
'''the old code below'''
def get_image(url): def get_image(url):
return getUrl(url) return getUrl(url)
@ -42,62 +125,36 @@ def _castList(data, regexp):
return names return names
return [] return []
def _getTerm(data, regexp):
term = ''
try:
reg = re.compile(regexp, re.IGNORECASE)
m = reg.search(data)
if m:
term = stripTags(m.group(1)).strip()
except:
print "waring, parsing failed for", regexp
return term.encode('utf8')
class IMDb: class IMDb:
def __init__(self, imdb): def __init__(self, imdbId):
self.imdb = imdb self.imdb = imdbId
self.pageSource = None
self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
self.businessSource = None
self.businessUrl = "%sbusiness" % self.pageUrl self.businessUrl = "%sbusiness" % self.pageUrl
self.connectionsSource = None
self.connectionsUrl = "%smovieconnections" % self.pageUrl self.connectionsUrl = "%smovieconnections" % self.pageUrl
self.creditsSource = None
self.creditsUrl = "%sfullcredits" % self.pageUrl self.creditsUrl = "%sfullcredits" % self.pageUrl
self.episodesSource = None
self.episodesUrl = "%sepisodes" % self.pageUrl self.episodesUrl = "%sepisodes" % self.pageUrl
self.keywordSource = None
self.keywordUrl = "%skeywords" % self.pageUrl self.keywordUrl = "%skeywords" % self.pageUrl
self.plotSource = None
self.plotUrl = "%splotsummary" % self.pageUrl self.plotUrl = "%splotsummary" % self.pageUrl
self.releaseinfoSource = None
self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
self.triviaSource = None
self.triviaUrl = "%strivia" % self.pageUrl self.triviaUrl = "%strivia" % self.pageUrl
self.locationSource = None
self.locationUrl = "%slocations" % self.pageUrl self.locationUrl = "%slocations" % self.pageUrl
self.externalreviewsSource = None
self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
self.trailerSource = None
self.trailerUrl = "%strailers" % self.pageUrl self.trailerUrl = "%strailers" % self.pageUrl
def getPage(self, forcereload = False): def getPage(self):
if forcereload or not self.pageSource: return getUrlUnicode(self.pageUrl)
self.pageSource = getUrlUnicode(self.pageUrl)
return self.pageSource
def parse_raw_value(self, key, value): def parse_raw_value(self, key, value):
if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'): if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
value = unicode(value, 'utf-8') value = unicode(value, 'utf-8')
value = stripTags(value).strip() value = stripTags(value).strip()
if key == 'runtime': if key == 'runtime':
parsed_value = _getTerm(value, '(.*?) min') parsed_value = findRegexp(value, '(.*?) min')
parsed_value = _getTerm(parsed_value, '([0-9]+)') parsed_value = findRegexp(parsed_value, '([0-9]+)')
if not parsed_value: if not parsed_value:
parsed_value = _getTerm(value, '(.*?) sec') parsed_value = findRegexp(value, '(.*?) sec')
parsed_value = _getTerm(parsed_value, '([0-9]+)') parsed_value = findRegexp(parsed_value, '([0-9]+)')
if not parsed_value: if not parsed_value:
parsed_value = 0 parsed_value = 0
else: else:
@ -141,7 +198,7 @@ class IMDb:
print value print value
parsed_value = value parsed_value = value
return parsed_value return parsed_value
def parseTitle(self): def parseTitle(self):
title = '' title = ''
data = self.getPage() data = self.getPage()
@ -153,8 +210,8 @@ class IMDb:
html_title = str(html_title[0]) html_title = str(html_title[0])
html_title = html_title.replace('<br />', ' ').replace(' ', ' ') html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
title = stripTags(html_title) title = stripTags(html_title)
title = re.sub('\(\d\d\d\d\)', '', title) title = re.sub('\(\d{4}\)', '', title)
title = re.sub('\(\d\d\d\d/I*\)', '', title) title = re.sub('\(\d{4}/I*\)', '', title)
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'): for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
title = title.replace(t, '') title = title.replace(t, '')
if title.find(u'\xa0') > -1: if title.find(u'\xa0') > -1:
@ -172,7 +229,7 @@ class IMDb:
else: else:
title = normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:] title = normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:]
return normalizeTitle(title) return normalizeTitle(title)
def parseYear(self): def parseYear(self):
year = '' year = ''
data = self.getPage() data = self.getPage()
@ -183,25 +240,25 @@ class IMDb:
if html_title: if html_title:
html_title = str(html_title[0]) html_title = str(html_title[0])
html_title = stripTags(html_title) html_title = stripTags(html_title)
year = re.compile('\((\d\d\d\d)\)').findall(html_title) year = re.compile('\((\d{4})\)').findall(html_title)
if not year: if not year:
year = re.compile('\((\d\d\d\d)/').findall(html_title) year = re.compile('\((\d{4})/').findall(html_title)
if year: if year:
year = year[0] year = year[0]
else: year = '' else: year = ''
return year return year
def parse(self): def parse(self):
data = self.getPage() data = self.getPage()
IMDbDict ={} IMDbDict ={}
#Poster #Poster
IMDbDict['poster'] = _getTerm(data, 'name="poster".*?<img .*?src="(.*?)"') IMDbDict['poster'] = findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')
if not IMDbDict['poster']: if not IMDbDict['poster']:
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif' IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
#Title, Year #Title, Year
IMDbDict['year'] = self.parseYear() IMDbDict['year'] = self.parseYear()
IMDbDict['title'] = self.parseTitle() IMDbDict['title'] = self.parseTitle()
#Rating #Rating
m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data) m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
if m: if m:
@ -251,49 +308,32 @@ class IMDb:
IMDbDict['stills'] = getMovieStills(self.imdb) IMDbDict['stills'] = getMovieStills(self.imdb)
#IMDbDict['trailer'] = self.parseTrailer() #IMDbDict['trailer'] = self.parseTrailer()
self.IMDbDict = IMDbDict self.IMDbDict = IMDbDict
if IMDbDict['episode_of']: if IMDbDict['episode_of']:
episode_of =IMDb(IMDbDict['episode_of']).parse() episode_of =IMDb(IMDbDict['episode_of']).parse()
for key in ('country', 'language'): for key in ('country', 'language'):
if not IMDbDict[key]: if not IMDbDict[key]:
IMDbDict[key] = episode_of[key] IMDbDict[key] = episode_of[key]
return self.IMDbDict return self.IMDbDict
def getCredits(self, forcereload = False):
if forcereload or not self.creditsSource:
self.creditsSource = getUrlUnicode(self.creditsUrl)
return self.creditsSource
def parseCredits(self): def parseCredits(self):
data = self.getCredits() raw_credits = parseCredits(self.imdb)
credits = {} credits = {}
credits['director'] = _castList(data, 'Directed by.*?(<tr>.*?)</table>')
credits['writer'] = _castList(data, 'Writing credits.*?(<tr>.*?)</table>') def getNames(creditList):
credits['producer'] = _castList(data, 'Produced by.*?(<tr>.*?)</table>') return [stripTags(c[0]) for c in creditList]
#credits['cast'] = _castList(data, 'Cast</b>.*?(<tr.*?)</table>')
credits['cast'] = [] credits['director'] = getNames(raw_credits['directors'])
soup = re.compile('Cast</b>.*?(<tr.*?)</table>').findall(data) credits['writer'] = getNames(raw_credits['writers'])
soup = BeautifulSoup(data) credits['producer'] = getNames(raw_credits['producers'])
cast = soup('table', {'class': 'cast'}) credits['cast'] = [(stripTags(c[0]),stripTags(c[1])) for c in raw_credits['cast']]
if cast:
cast = str(cast[0]).replace(u'\xa0', ' ')
names = re.compile('<a href="/name/nm.*?/">(.*?)</a>.*?</td><td class="char">(.*?)</td></tr>').findall(cast)
for name in names:
real_name = name[0]
role_name = name[1]
if role_name:
role_name = role_name.split('(')[0].replace('/ ...','')
credits['cast'].append((stripTags(real_name), stripTags(role_name)))
self.credits = credits self.credits = credits
return self.credits return self.credits
def getPlot(self, forcereload = False):
if forcereload or not self.plotSource:
self.plotSource = getUrlUnicode(self.plotUrl)
return self.plotSource
def parsePlot(self): def parsePlot(self):
soup = BeautifulSoup(self.getPlot()) data = getUrlUnicode(self.plotUrl)
soup = BeautifulSoup(data)
plot = soup('p', {'class':'plotpar'}) plot = soup('p', {'class':'plotpar'})
if plot: if plot:
plot = unicode(plot[0]).split('<i>')[0] plot = unicode(plot[0]).split('<i>')[0]
@ -302,15 +342,11 @@ class IMDb:
plot = stripTags(plot).strip() plot = stripTags(plot).strip()
self.plot = plot self.plot = plot
return plot return plot
def getEpisodes(self, forcereload = False):
if forcereload or not self.episodesSource:
self.episodesSource = getUrlUnicode(self.episodesUrl)
return self.episodesSource
def parseEpisodes(self): def parseEpisodes(self):
episodes = {} episodes = {}
cdata = self.getEpisodes().replace('\r\n', ' ') data = getUrlUnicode(self.episodesUrl)
cdata = data.replace('\r\n', ' ')
regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>''' regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
reg = re.compile(regexp, re.IGNORECASE) reg = re.compile(regexp, re.IGNORECASE)
m = reg.findall(cdata) m = reg.findall(cdata)
@ -340,26 +376,18 @@ class IMDb:
self.episodes = episodes self.episodes = episodes
return self.episodes return self.episodes
def getLocations(self, forcereload = False):
if forcereload or not self.locationSource:
self.keywordSource = getUrlUnicode(self.locationUrl)
return self.keywordSource
def parseLocations(self): def parseLocations(self):
soup = BeautifulSoup(self.getLocations()) data = getUrlUnicode(self.locationUrl)
soup = BeautifulSoup(data)
locations = [] locations = []
for key in soup('a', {'href': re.compile('^/List')}): for key in soup('a', {'href': re.compile('^/List')}):
locations.append(htmldecode(key.string)) locations.append(htmldecode(key.string))
self.locations = locations self.locations = locations
return self.locations return self.locations
def getKeywords(self, forcereload = False):
if forcereload or not self.keywordSource:
self.keywordSource = getUrlUnicode(self.keywordUrl)
return self.keywordSource
def parseKeywords(self): def parseKeywords(self):
soup = BeautifulSoup(self.getKeywords()) data = getUrlUnicode(self.keywordUrl)
soup = BeautifulSoup(data)
keywords = [] keywords = []
for key in soup('a', {'href': re.compile('^/keyword/')}): for key in soup('a', {'href': re.compile('^/keyword/')}):
k = htmldecode(key.string) k = htmldecode(key.string)
@ -368,28 +396,23 @@ class IMDb:
self.keywords = keywords self.keywords = keywords
return self.keywords return self.keywords
def getTrivia(self, forcereload = False):
if forcereload or not self.triviaSource:
self.triviaSource = getUrlUnicode(self.triviaUrl)
return self.triviaSource
def parseTrivia(self): def parseTrivia(self):
data = getUrlUnicode(self.triviaUrl)
soup = BeautifulSoup(data)
trivia = [] trivia = []
soup = BeautifulSoup(self.getTrivia())
triviaList = [] triviaList = []
for i in soup('ul', {'class': "trivia"}): for i in soup('ul', {'class': "trivia"}):
for t in i('li'): for t in i('li'):
t = str(t).replace('<br />', '').strip() t = str(t).replace('<br />', '').strip()
if t.startswith('<li>') and t.endswith('</li>'): if t.startswith('<li>') and t.endswith('</li>'):
t = t[4:-5].strip() t = t[4:-5].strip()
trivia.append(t) trivia.append(t)
self.trivia = trivia self.trivia = trivia
return self.trivia return self.trivia
def getConnections(self, forcereload = False): def getConnections(self):
if forcereload or not self.connectionsSource: return getUrlUnicode(self.connectionsUrl)
self.connectionsSource = getUrlUnicode(self.connectionsUrl)
return self.connectionsSource
def parseConnections(self): def parseConnections(self):
connections = {} connections = {}
@ -404,10 +427,8 @@ class IMDb:
connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})] connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})]
return connections return connections
def getReleaseinfo(self, forcereload = False): def getReleaseinfo(self):
if forcereload or not self.releaseinfoSource: return getUrlUnicode(self.releaseinfoUrl)
self.releaseinfoSource = getUrlUnicode(self.releaseinfoUrl)
return self.releaseinfoSource
def parseReleaseinfo(self): def parseReleaseinfo(self):
soup = BeautifulSoup(self.getReleaseinfo()) soup = BeautifulSoup(self.getReleaseinfo())
@ -424,12 +445,10 @@ class IMDb:
except: except:
pass pass
return None return None
def getBusiness(self, forcereload = False): def getBusiness(self):
if forcereload or not self.businessSource: return getUrlUnicode(self.businessUrl)
self.businessSource = getUrlUnicode(self.businessUrl)
return self.businessSource
def parseBusiness(self): def parseBusiness(self):
soup = BeautifulSoup(self.getBusiness()) soup = BeautifulSoup(self.getBusiness())
business = {'budget': 0, 'gross': 0, 'profit': 0} business = {'budget': 0, 'gross': 0, 'profit': 0}
@ -449,12 +468,10 @@ class IMDb:
if business['budget'] and business['gross']: if business['budget'] and business['gross']:
business['profit'] = business['gross'] - business['budget'] business['profit'] = business['gross'] - business['budget']
return business return business
def getExternalreviews(self, forcereload = False): def getExternalreviews(self):
if forcereload or not self.externalreviewsSource: return getUrlUnicode(self.externalreviewsUrl)
self.externalreviewsSource = getUrlUnicode(self.externalreviewsUrl)
return self.externalreviewsSource
def parseExternalreviews(self): def parseExternalreviews(self):
soup = BeautifulSoup(self.getExternalreviews()) soup = BeautifulSoup(self.getExternalreviews())
ol = soup('ol') ol = soup('ol')
@ -471,12 +488,10 @@ class IMDb:
pass pass
return ret return ret
return {} return {}
def getTrailer(self, forcereload = False): def getTrailer(self):
if forcereload or not self.trailerSource: return getUrlUnicode(self.trailerUrl)
self.trailerSource = getUrlUnicode(self.trailerUrl)
return self.trailerSource
def parseTrailer(self): def parseTrailer(self):
ret = {} ret = {}
soup = BeautifulSoup(self.getTrailer()) soup = BeautifulSoup(self.getTrailer())
@ -519,8 +534,8 @@ def guess(title, director=''):
return None return None
if return_url.startswith('http://www.imdb.com/title/tt'): if return_url.startswith('http://www.imdb.com/title/tt'):
return return_url[28:35] return return_url[28:35]
if data: if data:
imdb_id = _getTerm(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)') imdb_id = findRegexp(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
if imdb_id: if imdb_id:
return imdb_id return imdb_id
@ -538,7 +553,7 @@ def guess(title, director=''):
def getEpisodeData(title, episode, show_url = None): def getEpisodeData(title, episode, show_url = None):
''' '''
Collect information about an episode. Collect information about an episode.
Returns dict with title, show, description and episode Returns dict with title, show, description and episode
''' '''
episodeData = { episodeData = {
@ -559,13 +574,13 @@ def getEpisodeData(title, episode, show_url = None):
episodeData['imdb'] = i['episodes'][episode]['imdb'] episodeData['imdb'] = i['episodes'][episode]['imdb']
return episodeData return episodeData
def getMovieStills(id): def getMovieStills(imdbId):
data = getUrl("http://imdb.com/gallery/ss/%s" % id) data = getUrl("http://imdb.com/gallery/ss/%s" % imdbId)
s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://i.imdb.com/Photos/Ss/%s/th-(.*?).jpg"''' % id).findall(data) s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://i.imdb.com/Photos/Ss/%s/th-(.*?).jpg"''' % imdbId).findall(data)
stills = [] stills = []
for s in s_: for s in s_:
if int(s[0]) > int(s[1]): if int(s[0]) > int(s[1]):
stills.append("http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (id, s[2])) stills.append("http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (imdbId, s[2]))
if not stills: if not stills:
s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://(.*?)p.jpg"''').findall(data) s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://(.*?)p.jpg"''').findall(data)
stills = [] stills = []