get Trailers

This commit is contained in:
j 2008-04-29 21:09:10 +02:00
parent 69adaeee00
commit abf263af13
1 changed file with 61 additions and 85 deletions

View File

@ -4,7 +4,7 @@
from oxutils import *
import urllib2
from urllib import quote
from urllib import quote, unquote
import re, time
import os
import time
@ -39,8 +39,10 @@ def getUrlBase(imdbId):
def getRawMovieData(imdbId):
    """Collect the scraped data for one movie.

    Normalizes *imdbId*, then gathers title, credits, poster URL and
    trailer list via the dedicated get* helpers.

    Returns a dict with the keys 'title', 'credits', 'poster'
    and 'trailers'.
    """
    imdbId = normalizeImdbId(imdbId)
    data = dict()
    data['title'] = getTitle(imdbId)
    data['credits'] = getCredits(imdbId)
    data['poster'] = getPoster(imdbId)
    data['trailers'] = getMovieTrailers(imdbId)
    # Bug fix: the function previously fell off the end and returned
    # None, discarding everything collected above.
    return data
def parseBase(imdbId):
data = getUrl(getUrlBase(imdbId))
@ -59,6 +61,10 @@ def parseBase(imdbId):
return info
return soup
def getPoster(imdbId):
    """Return the poster image URL scraped from the movie's main page."""
    main_page = getUrl(getUrlBase(imdbId))
    poster_pattern = 'name="poster".*?<img .*?src="(.*?)"'
    return findRegexp(main_page, poster_pattern)
def getTitle(imdbId):
title = ''
data = getUrl(getUrlBase(imdbId))
@ -96,7 +102,7 @@ def creditList(data, section=None):
credits.append(c)
return credits
def parseCredits(imdbId):
def getCredits(imdbId):
credits = dict()
url = "%s/fullcredits" % getUrlBase(imdbId)
data = getUrlUnicode(url)
@ -107,24 +113,41 @@ def parseCredits(imdbId):
credits[section[0]] = creditList(g, section[0])
return credits
def getMovieTrailers(imdbId):
    """Scrape the movie's trailers page.

    Returns a list of dicts, one per trailer, each with the keys
    'title', 'url', 'iframe' (player page URL) and 'flv' (video file URL).
    Returns an empty list when no video gallery is present.
    """
    trailers_page = getUrlUnicode("%s/trailers" % getUrlBase(imdbId))
    gallery = BeautifulSoup(trailers_page)('div', {'class':"video-gallery"})
    result = []
    if not gallery:
        return result
    for link in gallery[0]('a'):
        trailer_title = stripTags(unicode(link)).strip()
        trailer_url = 'http://www.imdb.com' + link['href']
        # video id (vi...) is embedded in the link target
        videoId = findRegexp(trailer_url, '/(vi\d*?)/')
        player_url = "http://www.imdb.com/video/trailer/%s/player" % videoId
        player_page = getUrlUnicode(player_url)
        # the flash player receives the .flv location via addVariable("file", ...)
        flv_url = unquote(findRegexp(player_page, 'addVariable\("file", "(.*?)"'))
        result.append({'title': trailer_title, 'url': trailer_url,
                       'iframe': player_url, 'flv': flv_url})
    return result
def getMovieStills(imdbId):
    """Return URLs of landscape screenshots from the IMDb gallery.

    Only images wider than they are tall are kept. Falls back to a
    generic thumbnail pattern when the id-specific pattern matches nothing.
    """
    url = "http://www.imdb.com/gallery/ss/%s" % imdbId
    gallery_page = getUrlUnicode(url)
    thumb_re = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://i.imdb.com/Photos/Ss/%s/th-(.*?).jpg"''' % imdbId)
    stills = [
        "http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (imdbId, still_id)
        for width, height, still_id in thumb_re.findall(gallery_page)
        if int(width) > int(height)  # landscape only
    ]
    if not stills:
        generic_re = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://(.*?)p.jpg"''')
        stills = [
            "http://%sf.jpg" % base_url
            for width, height, base_url in generic_re.findall(gallery_page)
            if int(width) > int(height)
        ]
    return stills
'''the old code below'''
def get_image(url):
    """Fetch the raw image data at *url* (thin wrapper around getUrl)."""
    image_data = getUrl(url)
    return image_data
def _castList(data, regexp):
    """Extract a de-duplicated, order-preserving list of person names.

    *regexp* selects the HTML section of *data* to search; names are
    taken from links whose href points at /name/nm... pages.
    Returns [] when the section is not found.
    """
    section = re.compile(regexp).findall(data)
    if not section:
        return []
    names = []
    for link in BeautifulSoup(section[0])('a', {'href': re.compile('/name/nm')}):
        if not link.string:
            continue
        name = stripTags(link.string)
        if name not in names:
            names.append(name)
    return names
class IMDb:
def __init__(self, imdbId):
self.imdb = imdbId
@ -140,7 +163,6 @@ class IMDb:
self.triviaUrl = "%strivia" % self.pageUrl
self.locationUrl = "%slocations" % self.pageUrl
self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
self.trailerUrl = "%strailers" % self.pageUrl
def getPage(self):
    """Download this movie's main IMDb page and return it as unicode."""
    main_url = self.pageUrl
    return getUrlUnicode(main_url)
@ -163,8 +185,10 @@ class IMDb:
parsed_value = int(parsed_value) * 60
elif key in ('country', 'language'):
parsed_value = value.split(' / ')
parsed_value = [v.strip() for v in parsed_value]
elif key == 'genre':
parsed_value = value.replace('more', '').strip().split(' / ')
parsed_value = [v.strip() for v in parsed_value]
elif key == 'tagline':
parsed_value = value.replace('more', '').strip()
elif key == 'plot_outline':
@ -187,7 +211,7 @@ class IMDb:
if m:
parsed_value = m[0]
parsed_value = parsed_value.split('<br />')[-1].split('(')[0]
director = self.parseCredits().get('director', None)
director = self.getCredits().get('director', None)
if director:
director = director[0]
parsed_value = parsed_value.replace(director, '')
@ -200,34 +224,17 @@ class IMDb:
return parsed_value
def parseTitle(self):
title = ''
data = self.getPage()
soup = BeautifulSoup(data)
html_title = soup('div', {'id': 'tn15title'})
if not html_title:
html_title = soup('title')
if html_title:
html_title = str(html_title[0])
html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
title = stripTags(html_title)
title = re.sub('\(\d{4}\)', '', title)
title = re.sub('\(\d{4}/I*\)', '', title)
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
title = title.replace(t, '')
if title.find(u'\xa0') > -1:
title = title[:title.find(u'\xa0')]
title = normalizeTitle(title.strip())
if title.startswith('"') and title.endswith('"'):
title = normalizeTitle(title[1:-1])
elif title.startswith('"') and title.find('"',1) > 0 and \
title.find('"',1) == title.rfind('"'):
se = re.compile("Season (\d*), Episode (\d*)\)").findall(data)
if se:
se = se[0]
se = ' (S%02dE%02d)' % (int(se[0]), int(se[1]))
title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:]
else:
title = normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:]
title = getTitle(self.imdb)
title = normalizeTitle(title)
if title.startswith('"') and title.find('"',1) > 0 and \
title.find('"',1) == title.rfind('"'):
se = re.compile("Season (\d*), Episode (\d*)\)").findall(data)
if se:
se = se[0]
se = ' (S%02dE%02d)' % (int(se[0]), int(se[1]))
title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:]
else:
title = normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:]
return normalizeTitle(title)
def parseYear(self):
@ -252,7 +259,7 @@ class IMDb:
data = self.getPage()
IMDbDict ={}
#Poster
IMDbDict['poster'] = findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')
IMDbDict['poster'] = getPoster(self.imdb)
if not IMDbDict['poster']:
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
#Title, Year
@ -295,7 +302,7 @@ class IMDb:
IMDbDict['tvshow'] = True
else:
IMDbDict['tvshow'] = False
IMDbDict['credits'] = self.parseCredits()
IMDbDict['credits'] = self.getCredits()
IMDbDict['plot'] = self.parsePlot()
IMDbDict['keywords'] = self.parseKeywords()
@ -316,8 +323,8 @@ class IMDb:
IMDbDict[key] = episode_of[key]
return self.IMDbDict
def parseCredits(self):
raw_credits = parseCredits(self.imdb)
def getCredits(self):
raw_credits = getCredits(self.imdb)
credits = {}
def getNames(creditList):
@ -489,22 +496,6 @@ class IMDb:
return ret
return {}
def getTrailer(self):
    """Download this movie's trailers listing page as unicode."""
    trailer_page_url = self.trailerUrl
    return getUrlUnicode(trailer_page_url)
def parseTrailer(self):
    """Map external trailer URLs to their link titles.

    Scans the trailers page for paragraphs containing a link; keeps
    links whose href starts with 'http' and strips a leading 'www.'
    from the title. Returns a {href: title} dict.
    """
    trailers = {}
    page = BeautifulSoup(self.getTrailer())
    for paragraph in page('p'):
        anchors = paragraph('a')
        if not anchors or not paragraph.firstText():
            continue
        anchor = anchors[0]
        href = anchor['href']
        if href and href.startswith('http'):
            trailers[href] = anchor.string.replace('www.', '')
    return trailers
def guess(title, director=''):
#FIXME: proper file -> title
title = title.split('-')[0]
@ -574,21 +565,6 @@ def getEpisodeData(title, episode, show_url = None):
episodeData['imdb'] = i['episodes'][episode]['imdb']
return episodeData
def getMovieStills(imdbId):
    """Return URLs of landscape screenshots from the IMDb gallery.

    Only images wider than tall are kept; falls back to a generic
    thumbnail pattern when the id-specific pattern finds nothing.
    """
    gallery_page = getUrl("http://imdb.com/gallery/ss/%s" % imdbId)
    matches = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://i.imdb.com/Photos/Ss/%s/th-(.*?).jpg"''' % imdbId).findall(gallery_page)
    stills = [
        "http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (imdbId, still_id)
        for width, height, still_id in matches
        if int(width) > int(height)  # landscape only
    ]
    if not stills:
        matches = re.compile('''<img width="(\d*?)" height="(\d*?)" src="http://(.*?)p.jpg"''').findall(gallery_page)
        stills = [
            "http://%sf.jpg" % base_url
            for width, height, base_url in matches
            if int(width) > int(height)
        ]
    return stills
if __name__ == '__main__':
import sys
#print parse(sys.argv[1])