more raw values from imdb

This commit is contained in:
j 2008-04-30 00:15:28 +02:00
parent abf263af13
commit 8b58b4824b
1 changed files with 105 additions and 51 deletions

View File

@ -43,32 +43,39 @@ def getRawMovieData(imdbId):
data['credits'] = getCredits(imdbId)
data['poster'] = getPoster(imdbId)
data['trailers'] = getMovieTrailers(imdbId)
data['companyCredits'] = getMovieCompanyCredits(imdbId)
def parseBase(imdbId):
data = getUrl(getUrlBase(imdbId))
soup = BeautifulSoup(data)
info = dict()
for i in soup('div', {'class':'info'}):
info['poster'] = findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')
title = i('h5')
if title:
title=title[0]
txt = title.findNext()
title = stripTags(unicode(title))
if title.endswith(':'):
title = title[:-1]
info[title] = htmldecode(stripTags(unicode(txt)))
return info
return soup
for i in re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL).findall(data):
title = stripTags(i[0]).strip().lower()
txt= stripTags(i[1]).strip()
def cleanUp(k):
k = htmldecode(k).replace(u'\xa0', ' ').strip()
if k.endswith('more'): k=k[:-len('more')].strip()
return k
txt = cleanUp(txt)
if title not in ('plot', 'trivia', 'filming locations', 'mpaa'):
if '|' in txt:
txt = [cleanUp(k) for k in txt.split('|')]
elif ', ' in txt:
txt = [cleanUp(k) for k in txt.split(', ')]
if not title.startswith('moviemeter'):
info[title] = txt
for key in ('user comments', 'writers (wga)'):
if key in info:
del info[key]
if 'release date' in info:
info['release date'] = info['release date'].split('\n')[0]
if 'plot' in info:
info['plot'] = info['plot'].split('| add synopsis')[0].strip()
def getPoster(imdbId):
data = getUrl(getUrlBase(imdbId))
return findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')
def getTitle(imdbId):
#get Title
title = ''
data = getUrl(getUrlBase(imdbId))
soup = BeautifulSoup(data)
html_title = soup('div', {'id': 'tn15title'})
if not html_title:
html_title = soup('title')
@ -83,10 +90,19 @@ def getTitle(imdbId):
title = title.replace(t, '')
title = title.strip()
if title.find(u'\xa0') > -1:
title = title[:title.find(u'\xa0')]
title = title[:title.find(u'\xa0')].strip()
if title.startswith('"') and title.endswith('"'):
title = title[1:-1]
return title
info['title'] = title
return info
def getPoster(imdbId):
info = parseBase(imdbId)
return info['poster']
def getTitle(imdbId):
info = parseBase(imdbId)
return info['title']
def creditList(data, section=None):
if section == 'cast':
@ -130,21 +146,75 @@ def getMovieTrailers(imdbId):
trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
return trailers
def getMovieStills(imdbId):
url = "http://www.imdb.com/gallery/ss/%s" % imdbId
def getMovieQuotes(imdbId):
url = "%s/quotes" % getUrlBase(imdbId)
data = getUrlUnicode(url)
s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://i.imdb.com/Photos/Ss/%s/th-(.*?).jpg"''' % imdbId).findall(data)
stills = []
for s in s_:
if int(s[0]) > int(s[1]):
stills.append("http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (imdbId, s[2]))
if not stills:
s_ = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://(.*?)p.jpg"''').findall(data)
stills = []
for s in s_:
if int(s[0]) > int(s[1]):
stills.append("http://%sf.jpg" % s[2])
return stills
quotes = re.compile('<b>(.*?)</b>:(.*?)<br>', re.DOTALL).findall(data)
quotes = [(q[0].strip(),q[1].strip()) for q in quotes]
return quotes
def getMovieTechnical(imdbId):
url = "%s/technical" % getUrlBase(imdbId)
data = getUrlUnicode(url)
results = {}
for t in re.compile('<h5>(.*?)</h5>(.*?)<br/>', re.DOTALL).findall(data):
results[t[0].strip()] = t[1].strip()
return results
def getMovieCompanyCredits(imdbId):
url = "%s/companycredits" % getUrlBase(imdbId)
data = getUrlUnicode(url)
results = {}
for field, c in re.compile('<h2>(.*?)</h2><ul>(.*?)</ul>').findall(data):
results[field.strip()] = []
for company in re.compile('<li>(.*?)</li>').findall(c):
results[field.strip()].append(company)
return results
def getMovieLocations(imdbId):
url = "%s/locations" % getUrlBase(imdbId)
data = getUrlUnicode(url)
soup = BeautifulSoup(data)
locations = []
for key in soup('a', {'href': re.compile('^/List')}):
locations.append(htmldecode(key.string))
return locations
def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
photos = {}
for key in keys:
url = "%s/mediaindex?refine=%s" % (getUrlBase(imdbId), key)
data = getUrlUnicode(url)
photos[key] = {}
for s in re.compile('''<img alt="(.*?)".*?src="(http://ia.media-imdb.com/.*?.jpg)''').findall(data):
img= "%s.jpg" % s[1].split('._V')[0]
title = s[0]
if key=='still_frame':
if not "_CR0" in s[1]:
photos[key][img] = title
else:
photos[key][img] = title
return photos
def getMovieStills(imdbId):
return getMovieImages(imdbId, ['still_frame'])['still_frame']
def getMoviePosters(imdbId):
return getMovieImages(imdbId, ['poster'])['poster']
def getMovieTrivia(imdbId):
url = "%s/trivia" % getUrlBase(imdbId)
data = getUrlUnicode(url)
soup = BeautifulSoup(data)
trivia = []
triviaList = []
for i in soup('ul', {'class': "trivia"}):
for t in i('li'):
t = str(t).replace('<br />', '').strip()
if t.startswith('<li>') and t.endswith('</li>'):
t = t[4:-5].strip()
trivia.append(t)
return trivia
'''the old code below'''
@ -160,7 +230,6 @@ class IMDb:
self.keywordUrl = "%skeywords" % self.pageUrl
self.plotUrl = "%splotsummary" % self.pageUrl
self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
self.triviaUrl = "%strivia" % self.pageUrl
self.locationUrl = "%slocations" % self.pageUrl
self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
@ -306,7 +375,7 @@ class IMDb:
IMDbDict['plot'] = self.parsePlot()
IMDbDict['keywords'] = self.parseKeywords()
IMDbDict['trivia'] = self.parseTrivia()
IMDbDict['trivia'] = getMovieTrivia(self.imdb)
IMDbDict['connections'] = self.parseConnections()
IMDbDict['locations'] = self.parseLocations()
IMDbDict['release_date'] = self.parseReleaseinfo()
@ -403,21 +472,6 @@ class IMDb:
self.keywords = keywords
return self.keywords
def parseTrivia(self):
data = getUrlUnicode(self.triviaUrl)
soup = BeautifulSoup(data)
trivia = []
triviaList = []
for i in soup('ul', {'class': "trivia"}):
for t in i('li'):
t = str(t).replace('<br />', '').strip()
if t.startswith('<li>') and t.endswith('</li>'):
t = t[4:-5].strip()
trivia.append(t)
self.trivia = trivia
return self.trivia
def getConnections(self):
return getUrlUnicode(self.connectionsUrl)