- changes to imdb.py

* use more oxutils functions
  * start migrating to a raw dict, first part, parse full cast with names from imdb
  * add getMovieId
This commit is contained in:
j 2008-04-29 18:12:27 +02:00
parent 7a53ee62b9
commit 69adaeee00
3 changed files with 165 additions and 149 deletions

View file

@ -4,5 +4,7 @@
__version__ = '0.1.0' __version__ = '0.1.0'
from net import * import imdb
import wikipedia
import google

View file

@ -17,7 +17,6 @@ from oxutils import stripTags
usage: usage:
import google import google
google.find(query) google.find(query)
<generator object at 0x833aeac>
for result in google.find(query): result for result in google.find(query): result

View file

@ -12,19 +12,102 @@ import time
from BeautifulSoup import BeautifulSoup from BeautifulSoup import BeautifulSoup
import chardet import chardet
import oxutils import oxutils
from oxutils import stripTags, htmldecode from oxutils import stripTags, htmldecode, findRegexp
from oxutils.cache import getUrl, getUrlUnicode from oxutils.cache import getUrl, getUrlUnicode
from oxutils.normalize import normalizeTitle from oxutils.normalize import normalizeTitle, normalizeImdbId
import google import google
def _get_data(url): def getMovieId(title, director='', year=''):
data = None if year:
try: title = "%s (%s)" % (title, year)
data = getUrl(url) if director:
except: query = 'site:imdb.com %s "%s"' % (director, title)
print "error reading data from", url else:
return data query = 'site:imdb.com "%s"' % title
for (name, url, desc) in google.find(query, 3):
if url.startswith('http://www.imdb.com/title/tt'):
return url[28:35]
def getMovieData(imdbId):
    """Return whatever IMDb(imdbId).parse() produces for the given id."""
    movie = IMDb(imdbId)
    return movie.parse()
# internal functions below
def getUrlBase(imdbId):
    """Build the title URL (no trailing slash) for an IMDb id given without 'tt'."""
    url_template = "http://www.imdb.com/title/tt%s"
    return url_template % imdbId
def getRawMovieData(imdbId):
    """Collect the raw data for one IMDb title into a dict.

    imdbId is passed through normalizeImdbId before it is used.

    Returns a dict with the keys:
      'credits' -- the result of parseCredits(imdbId)
      'poster'  -- the findRegexp match for the poster <img> src on the
                   title page
    """
    imdbId = normalizeImdbId(imdbId)
    # The poster pattern must be matched against the page markup; the
    # original handed the result dict itself to findRegexp.
    page = getUrlUnicode(getUrlBase(imdbId))
    data = dict()
    data['credits'] = parseCredits(imdbId)
    data['poster'] = findRegexp(page, 'name="poster".*?<img .*?src="(.*?)"')
    # The original fell off the end and returned None, discarding the dict.
    return data
def parseBase(imdbId):
    """Parse the labelled info blocks of a title's main page.

    Fetches the title page and, for every <div class="info"> that contains
    an <h5> heading, stores the text of the element found by findNext() on
    that heading under the heading's text (one trailing ':' removed).

    Returns a dict mapping heading text to tag-stripped, HTML-decoded text.
    """
    data = getUrl(getUrlBase(imdbId))
    soup = BeautifulSoup(data)
    info = dict()
    for block in soup('div', {'class': 'info'}):
        headings = block('h5')
        if not headings:
            continue
        heading = headings[0]
        # findNext() has to be called on the tag, before it is reduced to text.
        value = heading.findNext()
        label = stripTags(unicode(heading))
        if label.endswith(':'):
            label = label[:-1]
        info[label] = htmldecode(stripTags(unicode(value)))
    # A second, unreachable "return soup" followed this line in the original.
    return info
def getTitle(imdbId):
    """Return the bare title of an IMDb entry, or '' when none is found.

    The title is read from <div id="tn15title">, falling back to the page's
    <title> element. Year markers such as "(1999)" or "(1999/I)", the type
    markers listed below, anything from the first non-breaking space on,
    and one pair of surrounding double quotes are removed.
    """
    title = ''
    data = getUrl(getUrlBase(imdbId))
    soup = BeautifulSoup(data)
    html_title = soup('div', {'id': 'tn15title'})
    if not html_title:
        html_title = soup('title')
    if not html_title:
        return title

    html_title = str(html_title[0])
    # NOTE(review): as rendered, the second replace swaps a space for a space
    # (a no-op); presumably it once collapsed double spaces -- confirm
    # against the real file before changing it.
    html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
    title = htmldecode(html_title)
    title = stripTags(title)
    # Raw strings with \d{4}, matching the form this commit introduces for
    # the same patterns in the IMDb class.
    title = re.sub(r'\(\d{4}\)', '', title)
    title = re.sub(r'\(\d{4}/I*\)', '', title)
    for marker in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
        title = title.replace(marker, '')
    title = title.strip()
    # Keep only the part before the first non-breaking space.
    nbsp = title.find(u'\xa0')
    if nbsp > -1:
        title = title[:nbsp]
    if title.startswith('"') and title.endswith('"'):
        title = title[1:-1]
    return title
def creditList(data, section=None):
    """Extract [name, role] pairs from one section of a full-credits page.

    data is the HTML of a single credits section. section selects the row
    pattern ('cast' rows use the nm/ddd/char cell classes, every other
    section uses the valign="top" cells). For 'writers' the role is also
    cleaned of '<br>', parentheses and a trailing ' and'.

    Returns a list of two-item lists; both items are stripped of
    surrounding whitespace but may still contain markup.
    """
    if section == 'cast':
        row_pattern = '''<tr .*?<td class="nm">(.*?)</td><td class="ddd">.*?</td><td class="char">(.*?)</td></tr>'''
    else:
        row_pattern = '''<tr>.*?<td valign="top">(.*?)</td><td.*?</td><td valign="top">(.*?)</td></tr>'''
    rows = re.compile(row_pattern).findall(data)

    is_writers = (section == 'writers')
    result = []
    for name, role in rows:
        name = name.strip()
        role = role.strip()
        if is_writers:
            role = role.replace('<br>', '').strip().replace(')', '').replace('(', '')
            # NOTE(review): the ' and' check is assumed to belong to the
            # writers branch; indentation was lost in this view -- confirm.
            if role.endswith(' and'):
                role = role[:-4]
        result.append([name, role])
    return result
def parseCredits(imdbId):
    """Fetch a title's full-credits page and parse it section by section.

    The page is split on '<h5>'; each chunk whose markup carries a
    name="..." attribute followed by a /Glossary link is handed to
    creditList together with that name.

    Returns a dict mapping section name to the list creditList returns.
    """
    section_name = re.compile('''name="(.*?)".*? href="/Glossary''')
    page = getUrlUnicode("%s/fullcredits" % getUrlBase(imdbId))
    result = dict()
    for chunk in page.split('<h5>'):
        names = section_name.findall(chunk)
        if names:
            result[names[0]] = creditList(chunk, names[0])
    return result
'''the old code below'''
def get_image(url): def get_image(url):
return getUrl(url) return getUrl(url)
@ -42,62 +125,36 @@ def _castList(data, regexp):
return names return names
return [] return []
def _getTerm(data, regexp):
term = ''
try:
reg = re.compile(regexp, re.IGNORECASE)
m = reg.search(data)
if m:
term = stripTags(m.group(1)).strip()
except:
print "waring, parsing failed for", regexp
return term.encode('utf8')
class IMDb: class IMDb:
def __init__(self, imdb): def __init__(self, imdbId):
self.imdb = imdb self.imdb = imdbId
self.pageSource = None
self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
self.businessSource = None
self.businessUrl = "%sbusiness" % self.pageUrl self.businessUrl = "%sbusiness" % self.pageUrl
self.connectionsSource = None
self.connectionsUrl = "%smovieconnections" % self.pageUrl self.connectionsUrl = "%smovieconnections" % self.pageUrl
self.creditsSource = None
self.creditsUrl = "%sfullcredits" % self.pageUrl self.creditsUrl = "%sfullcredits" % self.pageUrl
self.episodesSource = None
self.episodesUrl = "%sepisodes" % self.pageUrl self.episodesUrl = "%sepisodes" % self.pageUrl
self.keywordSource = None
self.keywordUrl = "%skeywords" % self.pageUrl self.keywordUrl = "%skeywords" % self.pageUrl
self.plotSource = None
self.plotUrl = "%splotsummary" % self.pageUrl self.plotUrl = "%splotsummary" % self.pageUrl
self.releaseinfoSource = None
self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
self.triviaSource = None
self.triviaUrl = "%strivia" % self.pageUrl self.triviaUrl = "%strivia" % self.pageUrl
self.locationSource = None
self.locationUrl = "%slocations" % self.pageUrl self.locationUrl = "%slocations" % self.pageUrl
self.externalreviewsSource = None
self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
self.trailerSource = None
self.trailerUrl = "%strailers" % self.pageUrl self.trailerUrl = "%strailers" % self.pageUrl
def getPage(self, forcereload = False): def getPage(self):
if forcereload or not self.pageSource: return getUrlUnicode(self.pageUrl)
self.pageSource = getUrlUnicode(self.pageUrl)
return self.pageSource
def parse_raw_value(self, key, value): def parse_raw_value(self, key, value):
if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'): if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
value = unicode(value, 'utf-8') value = unicode(value, 'utf-8')
value = stripTags(value).strip() value = stripTags(value).strip()
if key == 'runtime': if key == 'runtime':
parsed_value = _getTerm(value, '(.*?) min') parsed_value = findRegexp(value, '(.*?) min')
parsed_value = _getTerm(parsed_value, '([0-9]+)') parsed_value = findRegexp(parsed_value, '([0-9]+)')
if not parsed_value: if not parsed_value:
parsed_value = _getTerm(value, '(.*?) sec') parsed_value = findRegexp(value, '(.*?) sec')
parsed_value = _getTerm(parsed_value, '([0-9]+)') parsed_value = findRegexp(parsed_value, '([0-9]+)')
if not parsed_value: if not parsed_value:
parsed_value = 0 parsed_value = 0
else: else:
@ -153,8 +210,8 @@ class IMDb:
html_title = str(html_title[0]) html_title = str(html_title[0])
html_title = html_title.replace('<br />', ' ').replace(' ', ' ') html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
title = stripTags(html_title) title = stripTags(html_title)
title = re.sub('\(\d\d\d\d\)', '', title) title = re.sub('\(\d{4}\)', '', title)
title = re.sub('\(\d\d\d\d/I*\)', '', title) title = re.sub('\(\d{4}/I*\)', '', title)
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'): for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
title = title.replace(t, '') title = title.replace(t, '')
if title.find(u'\xa0') > -1: if title.find(u'\xa0') > -1:
@ -183,9 +240,9 @@ class IMDb:
if html_title: if html_title:
html_title = str(html_title[0]) html_title = str(html_title[0])
html_title = stripTags(html_title) html_title = stripTags(html_title)
year = re.compile('\((\d\d\d\d)\)').findall(html_title) year = re.compile('\((\d{4})\)').findall(html_title)
if not year: if not year:
year = re.compile('\((\d\d\d\d)/').findall(html_title) year = re.compile('\((\d{4})/').findall(html_title)
if year: if year:
year = year[0] year = year[0]
else: year = '' else: year = ''
@ -195,7 +252,7 @@ class IMDb:
data = self.getPage() data = self.getPage()
IMDbDict ={} IMDbDict ={}
#Poster #Poster
IMDbDict['poster'] = _getTerm(data, 'name="poster".*?<img .*?src="(.*?)"') IMDbDict['poster'] = findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')
if not IMDbDict['poster']: if not IMDbDict['poster']:
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif' IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
#Title, Year #Title, Year
@ -259,41 +316,24 @@ class IMDb:
IMDbDict[key] = episode_of[key] IMDbDict[key] = episode_of[key]
return self.IMDbDict return self.IMDbDict
def getCredits(self, forcereload = False):
if forcereload or not self.creditsSource:
self.creditsSource = getUrlUnicode(self.creditsUrl)
return self.creditsSource
def parseCredits(self): def parseCredits(self):
data = self.getCredits() raw_credits = parseCredits(self.imdb)
credits = {} credits = {}
credits['director'] = _castList(data, 'Directed by.*?(<tr>.*?)</table>')
credits['writer'] = _castList(data, 'Writing credits.*?(<tr>.*?)</table>') def getNames(creditList):
credits['producer'] = _castList(data, 'Produced by.*?(<tr>.*?)</table>') return [stripTags(c[0]) for c in creditList]
#credits['cast'] = _castList(data, 'Cast</b>.*?(<tr.*?)</table>')
credits['cast'] = [] credits['director'] = getNames(raw_credits['directors'])
soup = re.compile('Cast</b>.*?(<tr.*?)</table>').findall(data) credits['writer'] = getNames(raw_credits['writers'])
soup = BeautifulSoup(data) credits['producer'] = getNames(raw_credits['producers'])
cast = soup('table', {'class': 'cast'}) credits['cast'] = [(stripTags(c[0]),stripTags(c[1])) for c in raw_credits['cast']]
if cast:
cast = str(cast[0]).replace(u'\xa0', ' ')
names = re.compile('<a href="/name/nm.*?/">(.*?)</a>.*?</td><td class="char">(.*?)</td></tr>').findall(cast)
for name in names:
real_name = name[0]
role_name = name[1]
if role_name:
role_name = role_name.split('(')[0].replace('/ ...','')
credits['cast'].append((stripTags(real_name), stripTags(role_name)))
self.credits = credits self.credits = credits
return self.credits return self.credits
def getPlot(self, forcereload = False):
if forcereload or not self.plotSource:
self.plotSource = getUrlUnicode(self.plotUrl)
return self.plotSource
def parsePlot(self): def parsePlot(self):
soup = BeautifulSoup(self.getPlot()) data = getUrlUnicode(self.plotUrl)
soup = BeautifulSoup(data)
plot = soup('p', {'class':'plotpar'}) plot = soup('p', {'class':'plotpar'})
if plot: if plot:
plot = unicode(plot[0]).split('<i>')[0] plot = unicode(plot[0]).split('<i>')[0]
@ -303,14 +343,10 @@ class IMDb:
self.plot = plot self.plot = plot
return plot return plot
def getEpisodes(self, forcereload = False):
if forcereload or not self.episodesSource:
self.episodesSource = getUrlUnicode(self.episodesUrl)
return self.episodesSource
def parseEpisodes(self): def parseEpisodes(self):
episodes = {} episodes = {}
cdata = self.getEpisodes().replace('\r\n', ' ') data = getUrlUnicode(self.episodesUrl)
cdata = data.replace('\r\n', ' ')
regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>''' regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
reg = re.compile(regexp, re.IGNORECASE) reg = re.compile(regexp, re.IGNORECASE)
m = reg.findall(cdata) m = reg.findall(cdata)
@ -340,26 +376,18 @@ class IMDb:
self.episodes = episodes self.episodes = episodes
return self.episodes return self.episodes
def getLocations(self, forcereload = False):
if forcereload or not self.locationSource:
self.keywordSource = getUrlUnicode(self.locationUrl)
return self.keywordSource
def parseLocations(self): def parseLocations(self):
soup = BeautifulSoup(self.getLocations()) data = getUrlUnicode(self.locationUrl)
soup = BeautifulSoup(data)
locations = [] locations = []
for key in soup('a', {'href': re.compile('^/List')}): for key in soup('a', {'href': re.compile('^/List')}):
locations.append(htmldecode(key.string)) locations.append(htmldecode(key.string))
self.locations = locations self.locations = locations
return self.locations return self.locations
def getKeywords(self, forcereload = False):
if forcereload or not self.keywordSource:
self.keywordSource = getUrlUnicode(self.keywordUrl)
return self.keywordSource
def parseKeywords(self): def parseKeywords(self):
soup = BeautifulSoup(self.getKeywords()) data = getUrlUnicode(self.keywordUrl)
soup = BeautifulSoup(data)
keywords = [] keywords = []
for key in soup('a', {'href': re.compile('^/keyword/')}): for key in soup('a', {'href': re.compile('^/keyword/')}):
k = htmldecode(key.string) k = htmldecode(key.string)
@ -368,14 +396,11 @@ class IMDb:
self.keywords = keywords self.keywords = keywords
return self.keywords return self.keywords
def getTrivia(self, forcereload = False):
if forcereload or not self.triviaSource:
self.triviaSource = getUrlUnicode(self.triviaUrl)
return self.triviaSource
def parseTrivia(self): def parseTrivia(self):
data = getUrlUnicode(self.triviaUrl)
soup = BeautifulSoup(data)
trivia = [] trivia = []
soup = BeautifulSoup(self.getTrivia())
triviaList = [] triviaList = []
for i in soup('ul', {'class': "trivia"}): for i in soup('ul', {'class': "trivia"}):
for t in i('li'): for t in i('li'):
@ -386,10 +411,8 @@ class IMDb:
self.trivia = trivia self.trivia = trivia
return self.trivia return self.trivia
def getConnections(self, forcereload = False): def getConnections(self):
if forcereload or not self.connectionsSource: return getUrlUnicode(self.connectionsUrl)
self.connectionsSource = getUrlUnicode(self.connectionsUrl)
return self.connectionsSource
def parseConnections(self): def parseConnections(self):
connections = {} connections = {}
@ -404,10 +427,8 @@ class IMDb:
connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})] connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})]
return connections return connections
def getReleaseinfo(self, forcereload = False): def getReleaseinfo(self):
if forcereload or not self.releaseinfoSource: return getUrlUnicode(self.releaseinfoUrl)
self.releaseinfoSource = getUrlUnicode(self.releaseinfoUrl)
return self.releaseinfoSource
def parseReleaseinfo(self): def parseReleaseinfo(self):
soup = BeautifulSoup(self.getReleaseinfo()) soup = BeautifulSoup(self.getReleaseinfo())
@ -425,10 +446,8 @@ class IMDb:
pass pass
return None return None
def getBusiness(self, forcereload = False): def getBusiness(self):
if forcereload or not self.businessSource: return getUrlUnicode(self.businessUrl)
self.businessSource = getUrlUnicode(self.businessUrl)
return self.businessSource
def parseBusiness(self): def parseBusiness(self):
soup = BeautifulSoup(self.getBusiness()) soup = BeautifulSoup(self.getBusiness())
@ -450,10 +469,8 @@ class IMDb:
business['profit'] = business['gross'] - business['budget'] business['profit'] = business['gross'] - business['budget']
return business return business
def getExternalreviews(self, forcereload = False): def getExternalreviews(self):
if forcereload or not self.externalreviewsSource: return getUrlUnicode(self.externalreviewsUrl)
self.externalreviewsSource = getUrlUnicode(self.externalreviewsUrl)
return self.externalreviewsSource
def parseExternalreviews(self): def parseExternalreviews(self):
soup = BeautifulSoup(self.getExternalreviews()) soup = BeautifulSoup(self.getExternalreviews())
@ -472,10 +489,8 @@ class IMDb:
return ret return ret
return {} return {}
def getTrailer(self, forcereload = False): def getTrailer(self):
if forcereload or not self.trailerSource: return getUrlUnicode(self.trailerUrl)
self.trailerSource = getUrlUnicode(self.trailerUrl)
return self.trailerSource
def parseTrailer(self): def parseTrailer(self):
ret = {} ret = {}
@ -520,7 +535,7 @@ def guess(title, director=''):
if return_url.startswith('http://www.imdb.com/title/tt'): if return_url.startswith('http://www.imdb.com/title/tt'):
return return_url[28:35] return return_url[28:35]
if data: if data:
imdb_id = _getTerm(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)') imdb_id = findRegexp(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
if imdb_id: if imdb_id:
return imdb_id return imdb_id
@ -559,13 +574,13 @@ def getEpisodeData(title, episode, show_url = None):
episodeData['imdb'] = i['episodes'][episode]['imdb'] episodeData['imdb'] = i['episodes'][episode]['imdb']
return episodeData return episodeData
def getMovieStills(id): def getMovieStills(imdbId):
data = getUrl("http://imdb.com/gallery/ss/%s" % id) data = getUrl("http://imdb.com/gallery/ss/%s" % imdbId)
s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://i.imdb.com/Photos/Ss/%s/th-(.*?).jpg"''' % id).findall(data) s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://i.imdb.com/Photos/Ss/%s/th-(.*?).jpg"''' % imdbId).findall(data)
stills = [] stills = []
for s in s_: for s in s_:
if int(s[0]) > int(s[1]): if int(s[0]) > int(s[1]):
stills.append("http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (id, s[2])) stills.append("http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (imdbId, s[2]))
if not stills: if not stills:
s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://(.*?)p.jpg"''').findall(data) s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://(.*?)p.jpg"''').findall(data)
stills = [] stills = []