more raw values from imdb
This commit is contained in:
parent
abf263af13
commit
8b58b4824b
1 changed files with 105 additions and 51 deletions
156
ox/imdb.py
156
ox/imdb.py
|
@ -43,32 +43,39 @@ def getRawMovieData(imdbId):
|
||||||
data['credits'] = getCredits(imdbId)
|
data['credits'] = getCredits(imdbId)
|
||||||
data['poster'] = getPoster(imdbId)
|
data['poster'] = getPoster(imdbId)
|
||||||
data['trailers'] = getMovieTrailers(imdbId)
|
data['trailers'] = getMovieTrailers(imdbId)
|
||||||
|
data['companyCredits'] = getMovieCompanyCredits(imdbId)
|
||||||
|
|
||||||
def parseBase(imdbId):
|
def parseBase(imdbId):
|
||||||
data = getUrl(getUrlBase(imdbId))
|
data = getUrl(getUrlBase(imdbId))
|
||||||
soup = BeautifulSoup(data)
|
soup = BeautifulSoup(data)
|
||||||
info = dict()
|
info = dict()
|
||||||
for i in soup('div', {'class':'info'}):
|
info['poster'] = findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')
|
||||||
|
|
||||||
title = i('h5')
|
for i in re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL).findall(data):
|
||||||
if title:
|
title = stripTags(i[0]).strip().lower()
|
||||||
title=title[0]
|
txt= stripTags(i[1]).strip()
|
||||||
txt = title.findNext()
|
def cleanUp(k):
|
||||||
title = stripTags(unicode(title))
|
k = htmldecode(k).replace(u'\xa0', ' ').strip()
|
||||||
if title.endswith(':'):
|
if k.endswith('more'): k=k[:-len('more')].strip()
|
||||||
title = title[:-1]
|
return k
|
||||||
info[title] = htmldecode(stripTags(unicode(txt)))
|
txt = cleanUp(txt)
|
||||||
return info
|
if title not in ('plot', 'trivia', 'filming locations', 'mpaa'):
|
||||||
return soup
|
if '|' in txt:
|
||||||
|
txt = [cleanUp(k) for k in txt.split('|')]
|
||||||
|
elif ', ' in txt:
|
||||||
|
txt = [cleanUp(k) for k in txt.split(', ')]
|
||||||
|
if not title.startswith('moviemeter'):
|
||||||
|
info[title] = txt
|
||||||
|
for key in ('user comments', 'writers (wga)'):
|
||||||
|
if key in info:
|
||||||
|
del info[key]
|
||||||
|
if 'release date' in info:
|
||||||
|
info['release date'] = info['release date'].split('\n')[0]
|
||||||
|
if 'plot' in info:
|
||||||
|
info['plot'] = info['plot'].split('| add synopsis')[0].strip()
|
||||||
|
|
||||||
def getPoster(imdbId):
|
#get Title
|
||||||
data = getUrl(getUrlBase(imdbId))
|
|
||||||
return findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')
|
|
||||||
|
|
||||||
def getTitle(imdbId):
|
|
||||||
title = ''
|
title = ''
|
||||||
data = getUrl(getUrlBase(imdbId))
|
|
||||||
soup = BeautifulSoup(data)
|
|
||||||
html_title = soup('div', {'id': 'tn15title'})
|
html_title = soup('div', {'id': 'tn15title'})
|
||||||
if not html_title:
|
if not html_title:
|
||||||
html_title = soup('title')
|
html_title = soup('title')
|
||||||
|
@ -83,10 +90,19 @@ def getTitle(imdbId):
|
||||||
title = title.replace(t, '')
|
title = title.replace(t, '')
|
||||||
title = title.strip()
|
title = title.strip()
|
||||||
if title.find(u'\xa0') > -1:
|
if title.find(u'\xa0') > -1:
|
||||||
title = title[:title.find(u'\xa0')]
|
title = title[:title.find(u'\xa0')].strip()
|
||||||
if title.startswith('"') and title.endswith('"'):
|
if title.startswith('"') and title.endswith('"'):
|
||||||
title = title[1:-1]
|
title = title[1:-1]
|
||||||
return title
|
info['title'] = title
|
||||||
|
return info
|
||||||
|
|
||||||
|
def getPoster(imdbId):
|
||||||
|
info = parseBase(imdbId)
|
||||||
|
return info['poster']
|
||||||
|
|
||||||
|
def getTitle(imdbId):
|
||||||
|
info = parseBase(imdbId)
|
||||||
|
return info['title']
|
||||||
|
|
||||||
def creditList(data, section=None):
|
def creditList(data, section=None):
|
||||||
if section == 'cast':
|
if section == 'cast':
|
||||||
|
@ -130,21 +146,75 @@ def getMovieTrailers(imdbId):
|
||||||
trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
|
trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
|
||||||
return trailers
|
return trailers
|
||||||
|
|
||||||
def getMovieStills(imdbId):
|
def getMovieQuotes(imdbId):
|
||||||
url = "http://www.imdb.com/gallery/ss/%s" % imdbId
|
url = "%s/quotes" % getUrlBase(imdbId)
|
||||||
data = getUrlUnicode(url)
|
data = getUrlUnicode(url)
|
||||||
s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://i.imdb.com/Photos/Ss/%s/th-(.*?).jpg"''' % imdbId).findall(data)
|
quotes = re.compile('<b>(.*?)</b>:(.*?)<br>', re.DOTALL).findall(data)
|
||||||
stills = []
|
quotes = [(q[0].strip(),q[1].strip()) for q in quotes]
|
||||||
for s in s_:
|
return quotes
|
||||||
if int(s[0]) > int(s[1]):
|
|
||||||
stills.append("http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (imdbId, s[2]))
|
def getMovieTechnical(imdbId):
|
||||||
if not stills:
|
url = "%s/technical" % getUrlBase(imdbId)
|
||||||
s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://(.*?)p.jpg"''').findall(data)
|
data = getUrlUnicode(url)
|
||||||
stills = []
|
results = {}
|
||||||
for s in s_:
|
for t in re.compile('<h5>(.*?)</h5>(.*?)<br/>', re.DOTALL).findall(data):
|
||||||
if int(s[0]) > int(s[1]):
|
results[t[0].strip()] = t[1].strip()
|
||||||
stills.append("http://%sf.jpg" % s[2])
|
return results
|
||||||
return stills
|
|
||||||
|
def getMovieCompanyCredits(imdbId):
|
||||||
|
url = "%s/companycredits" % getUrlBase(imdbId)
|
||||||
|
data = getUrlUnicode(url)
|
||||||
|
results = {}
|
||||||
|
for field, c in re.compile('<h2>(.*?)</h2><ul>(.*?)</ul>').findall(data):
|
||||||
|
results[field.strip()] = []
|
||||||
|
for company in re.compile('<li>(.*?)</li>').findall(c):
|
||||||
|
results[field.strip()].append(company)
|
||||||
|
return results
|
||||||
|
|
||||||
|
def getMovieLocations(imdbId):
|
||||||
|
url = "%s/locations" % getUrlBase(imdbId)
|
||||||
|
data = getUrlUnicode(url)
|
||||||
|
soup = BeautifulSoup(data)
|
||||||
|
locations = []
|
||||||
|
for key in soup('a', {'href': re.compile('^/List')}):
|
||||||
|
locations.append(htmldecode(key.string))
|
||||||
|
return locations
|
||||||
|
|
||||||
|
def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
|
||||||
|
photos = {}
|
||||||
|
for key in keys:
|
||||||
|
url = "%s/mediaindex?refine=%s" % (getUrlBase(imdbId), key)
|
||||||
|
data = getUrlUnicode(url)
|
||||||
|
photos[key] = {}
|
||||||
|
for s in re.compile('''<img alt="(.*?)".*?src="(http://ia.media-imdb.com/.*?.jpg)''').findall(data):
|
||||||
|
img= "%s.jpg" % s[1].split('._V')[0]
|
||||||
|
title = s[0]
|
||||||
|
if key=='still_frame':
|
||||||
|
if not "_CR0" in s[1]:
|
||||||
|
photos[key][img] = title
|
||||||
|
else:
|
||||||
|
photos[key][img] = title
|
||||||
|
return photos
|
||||||
|
|
||||||
|
def getMovieStills(imdbId):
|
||||||
|
return getMovieImages(imdbId, ['still_frame'])['still_frame']
|
||||||
|
|
||||||
|
def getMoviePosters(imdbId):
|
||||||
|
return getMovieImages(imdbId, ['poster'])['poster']
|
||||||
|
|
||||||
|
def getMovieTrivia(imdbId):
|
||||||
|
url = "%s/trivia" % getUrlBase(imdbId)
|
||||||
|
data = getUrlUnicode(url)
|
||||||
|
soup = BeautifulSoup(data)
|
||||||
|
trivia = []
|
||||||
|
triviaList = []
|
||||||
|
for i in soup('ul', {'class': "trivia"}):
|
||||||
|
for t in i('li'):
|
||||||
|
t = str(t).replace('<br />', '').strip()
|
||||||
|
if t.startswith('<li>') and t.endswith('</li>'):
|
||||||
|
t = t[4:-5].strip()
|
||||||
|
trivia.append(t)
|
||||||
|
return trivia
|
||||||
|
|
||||||
'''the old code below'''
|
'''the old code below'''
|
||||||
|
|
||||||
|
@ -160,7 +230,6 @@ class IMDb:
|
||||||
self.keywordUrl = "%skeywords" % self.pageUrl
|
self.keywordUrl = "%skeywords" % self.pageUrl
|
||||||
self.plotUrl = "%splotsummary" % self.pageUrl
|
self.plotUrl = "%splotsummary" % self.pageUrl
|
||||||
self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
|
self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
|
||||||
self.triviaUrl = "%strivia" % self.pageUrl
|
|
||||||
self.locationUrl = "%slocations" % self.pageUrl
|
self.locationUrl = "%slocations" % self.pageUrl
|
||||||
self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
|
self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
|
||||||
|
|
||||||
|
@ -306,7 +375,7 @@ class IMDb:
|
||||||
IMDbDict['plot'] = self.parsePlot()
|
IMDbDict['plot'] = self.parsePlot()
|
||||||
IMDbDict['keywords'] = self.parseKeywords()
|
IMDbDict['keywords'] = self.parseKeywords()
|
||||||
|
|
||||||
IMDbDict['trivia'] = self.parseTrivia()
|
IMDbDict['trivia'] = getMovieTrivia(self.imdb)
|
||||||
IMDbDict['connections'] = self.parseConnections()
|
IMDbDict['connections'] = self.parseConnections()
|
||||||
IMDbDict['locations'] = self.parseLocations()
|
IMDbDict['locations'] = self.parseLocations()
|
||||||
IMDbDict['release_date'] = self.parseReleaseinfo()
|
IMDbDict['release_date'] = self.parseReleaseinfo()
|
||||||
|
@ -403,21 +472,6 @@ class IMDb:
|
||||||
self.keywords = keywords
|
self.keywords = keywords
|
||||||
return self.keywords
|
return self.keywords
|
||||||
|
|
||||||
def parseTrivia(self):
|
|
||||||
data = getUrlUnicode(self.triviaUrl)
|
|
||||||
soup = BeautifulSoup(data)
|
|
||||||
|
|
||||||
trivia = []
|
|
||||||
triviaList = []
|
|
||||||
for i in soup('ul', {'class': "trivia"}):
|
|
||||||
for t in i('li'):
|
|
||||||
t = str(t).replace('<br />', '').strip()
|
|
||||||
if t.startswith('<li>') and t.endswith('</li>'):
|
|
||||||
t = t[4:-5].strip()
|
|
||||||
trivia.append(t)
|
|
||||||
self.trivia = trivia
|
|
||||||
return self.trivia
|
|
||||||
|
|
||||||
def getConnections(self):
|
def getConnections(self):
|
||||||
return getUrlUnicode(self.connectionsUrl)
|
return getUrlUnicode(self.connectionsUrl)
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue