get Trailers
This commit is contained in:
parent
69adaeee00
commit
abf263af13
1 changed files with 61 additions and 85 deletions
146
ox/imdb.py
146
ox/imdb.py
|
@ -4,7 +4,7 @@
|
|||
|
||||
from oxutils import *
|
||||
import urllib2
|
||||
from urllib import quote
|
||||
from urllib import quote, unquote
|
||||
import re, time
|
||||
import os
|
||||
import time
|
||||
|
@ -39,8 +39,10 @@ def getUrlBase(imdbId):
|
|||
def getRawMovieData(imdbId):
    """Collect the scraped IMDb data for one title into a single dict.

    imdbId is passed through normalizeImdbId() first. The result has the
    keys 'title', 'credits', 'poster' and 'trailers', each filled by the
    corresponding module-level getter (getTitle, getCredits, getPoster,
    getMovieTrailers). Every getter fetches its own page, so one call
    triggers several requests.

    Returns the populated dict.
    """
    imdbId = normalizeImdbId(imdbId)
    data = dict()
    data['title'] = getTitle(imdbId)
    data['credits'] = getCredits(imdbId)
    data['poster'] = getPoster(imdbId)
    data['trailers'] = getMovieTrailers(imdbId)
    # Hand the collected dict back to the caller; without this return the
    # function did all the fetching and then yielded None.
    return data
|
||||
|
||||
def parseBase(imdbId):
|
||||
data = getUrl(getUrlBase(imdbId))
|
||||
|
@ -59,6 +61,10 @@ def parseBase(imdbId):
|
|||
return info
|
||||
return soup
|
||||
|
||||
def getPoster(imdbId):
    """Look up the poster image URL on the title's main IMDb page.

    The page at getUrlBase(imdbId) is fetched with getUrl() and searched
    for the src attribute of the first <img> that follows the
    name="poster" marker. The return value is whatever findRegexp()
    yields for that pattern.
    """
    page = getUrl(getUrlBase(imdbId))
    posterPattern = 'name="poster".*?<img .*?src="(.*?)"'
    return findRegexp(page, posterPattern)
|
||||
|
||||
def getTitle(imdbId):
|
||||
title = ''
|
||||
data = getUrl(getUrlBase(imdbId))
|
||||
|
@ -96,7 +102,7 @@ def creditList(data, section=None):
|
|||
credits.append(c)
|
||||
return credits
|
||||
|
||||
def parseCredits(imdbId):
|
||||
def getCredits(imdbId):
|
||||
credits = dict()
|
||||
url = "%s/fullcredits" % getUrlBase(imdbId)
|
||||
data = getUrlUnicode(url)
|
||||
|
@ -107,24 +113,41 @@ def parseCredits(imdbId):
|
|||
credits[section[0]] = creditList(g, section[0])
|
||||
return credits
|
||||
|
||||
def getMovieTrailers(imdbId):
    """List the trailers offered on a title's IMDb trailers page.

    Fetches "<base url>/trailers" and inspects the first
    <div class="video-gallery"> element. For every link inside it, the
    matching player page is downloaded as well (one extra request per
    trailer) to extract the video file address.

    Returns a list of dicts with the keys 'title' (link text), 'url'
    (trailer page), 'iframe' (player page) and 'flv' (unquoted file
    URL). The list is empty when the page has no video gallery.
    """
    page = getUrlUnicode("%s/trailers" % getUrlBase(imdbId))
    galleries = BeautifulSoup(page)('div', {'class': "video-gallery"})
    trailers = []
    if not galleries:
        return trailers
    for link in galleries[0]('a'):
        name = stripTags(unicode(link)).strip()
        pageUrl = 'http://www.imdb.com' + link['href']
        # The video id (vi...) is the path component used by the player URL.
        videoId = findRegexp(pageUrl, r'/(vi\d*?)/')
        playerUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
        player = getUrlUnicode(playerUrl)
        # The player passes the url-quoted file location via addVariable("file", ...).
        fileUrl = unquote(findRegexp(player, r'addVariable\("file", "(.*?)"'))
        trailers.append({
            'title': name,
            'url': pageUrl,
            'iframe': playerUrl,
            'flv': fileUrl,
        })
    return trailers
|
||||
|
||||
def getMovieStills(imdbId):
    """Return full-size URLs of the landscape stills in a title's gallery.

    The gallery page http://www.imdb.com/gallery/ss/<imdbId> is scanned
    for thumbnail <img> tags. Only thumbnails whose width is greater
    than their height are kept. Thumbnails hosted under
    i.imdb.com/Photos/Ss/<imdbId>/ are tried first; if none of those
    qualify, any thumbnail whose address ends in "p.jpg" is used and
    mapped to its "f.jpg" counterpart.
    """
    page = getUrlUnicode("http://www.imdb.com/gallery/ss/%s" % imdbId)

    thumbs = re.findall(
        r'''<img width="(\d*?)" height="(\d*?)" src="http://i.imdb.com/Photos/Ss/%s/th-(.*?).jpg"''' % imdbId,
        page)
    stills = ["http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (imdbId, name)
              for (width, height, name) in thumbs
              if int(width) > int(height)]
    if stills:
        return stills

    # Fallback layout: thumbnails end in "p.jpg", full-size images in "f.jpg".
    thumbs = re.findall(
        r'''<img width="(\d*?)" height="(\d*?)" src="http://(.*?)p.jpg"''',
        page)
    return ["http://%sf.jpg" % base
            for (width, height, base) in thumbs
            if int(width) > int(height)]
|
||||
|
||||
'''the old code below'''
|
||||
|
||||
def get_image(url):
    """Fetch url through getUrl() and return its result unchanged."""
    content = getUrl(url)
    return content
|
||||
|
||||
def _castList(data, regexp):
|
||||
soup = re.compile(regexp).findall(data)
|
||||
if soup:
|
||||
soup = BeautifulSoup(soup[0])
|
||||
names = []
|
||||
for i in soup('a', {'href': re.compile('/name/nm')}):
|
||||
if i.string:
|
||||
cast = stripTags(i.string)
|
||||
if cast not in names:
|
||||
names.append(cast)
|
||||
return names
|
||||
return []
|
||||
|
||||
class IMDb:
|
||||
def __init__(self, imdbId):
|
||||
self.imdb = imdbId
|
||||
|
@ -140,7 +163,6 @@ class IMDb:
|
|||
self.triviaUrl = "%strivia" % self.pageUrl
|
||||
self.locationUrl = "%slocations" % self.pageUrl
|
||||
self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
|
||||
self.trailerUrl = "%strailers" % self.pageUrl
|
||||
|
||||
def getPage(self):
    """Return the content of self.pageUrl as fetched by getUrlUnicode()."""
    page = getUrlUnicode(self.pageUrl)
    return page
|
||||
|
@ -163,8 +185,10 @@ class IMDb:
|
|||
parsed_value = int(parsed_value) * 60
|
||||
elif key in ('country', 'language'):
|
||||
parsed_value = value.split(' / ')
|
||||
parsed_value = [v.strip() for v in parsed_value]
|
||||
elif key == 'genre':
|
||||
parsed_value = value.replace('more', '').strip().split(' / ')
|
||||
parsed_value = [v.strip() for v in parsed_value]
|
||||
elif key == 'tagline':
|
||||
parsed_value = value.replace('more', '').strip()
|
||||
elif key == 'plot_outline':
|
||||
|
@ -187,7 +211,7 @@ class IMDb:
|
|||
if m:
|
||||
parsed_value = m[0]
|
||||
parsed_value = parsed_value.split('<br />')[-1].split('(')[0]
|
||||
director = self.parseCredits().get('director', None)
|
||||
director = self.getCredits().get('director', None)
|
||||
if director:
|
||||
director = director[0]
|
||||
parsed_value = parsed_value.replace(director, '')
|
||||
|
@ -200,34 +224,17 @@ class IMDb:
|
|||
return parsed_value
|
||||
|
||||
def parseTitle(self):
    """Return the normalized display title for this IMDb entry.

    The raw title comes from the module-level getTitle() and is passed
    through normalizeTitle(). Titles of the form '"Quoted part" rest'
    (exactly one closing quote after the opening one) are rewritten:
    when the main page contains a "Season N, Episode M)" marker the
    result is 'Quoted part (SnnEmm)rest', otherwise 'Quoted part:rest'.
    NOTE(review): the quoted form presumably denotes TV episodes --
    confirm against getTitle() output.
    """
    title = normalizeTitle(getTitle(self.imdb))
    closing = title.rfind('"')
    quotedPrefix = (title.startswith('"')
                    and title.find('"', 1) > 0
                    and title.find('"', 1) == closing)
    if quotedPrefix:
        # The season/episode marker only exists in the page markup, so the
        # page has to be fetched here; `data` was referenced without ever
        # being assigned, which raised NameError for every quoted title.
        data = self.getPage()
        se = re.compile(r"Season (\d*), Episode (\d*)\)").findall(data)
        if se:
            se = se[0]
            se = ' (S%02dE%02d)' % (int(se[0]), int(se[1]))
            title = normalizeTitle(title[1:closing]) + se + title[closing+1:]
        else:
            title = normalizeTitle(title[1:closing]) + ':' + title[closing+1:]
    return normalizeTitle(title)
|
||||
|
||||
def parseYear(self):
|
||||
|
@ -252,7 +259,7 @@ class IMDb:
|
|||
data = self.getPage()
|
||||
IMDbDict ={}
|
||||
#Poster
|
||||
IMDbDict['poster'] = findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')
|
||||
IMDbDict['poster'] = getPoster(self.imdb)
|
||||
if not IMDbDict['poster']:
|
||||
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
|
||||
#Title, Year
|
||||
|
@ -295,7 +302,7 @@ class IMDb:
|
|||
IMDbDict['tvshow'] = True
|
||||
else:
|
||||
IMDbDict['tvshow'] = False
|
||||
IMDbDict['credits'] = self.parseCredits()
|
||||
IMDbDict['credits'] = self.getCredits()
|
||||
IMDbDict['plot'] = self.parsePlot()
|
||||
IMDbDict['keywords'] = self.parseKeywords()
|
||||
|
||||
|
@ -316,8 +323,8 @@ class IMDb:
|
|||
IMDbDict[key] = episode_of[key]
|
||||
return self.IMDbDict
|
||||
|
||||
def parseCredits(self):
|
||||
raw_credits = parseCredits(self.imdb)
|
||||
def getCredits(self):
|
||||
raw_credits = getCredits(self.imdb)
|
||||
credits = {}
|
||||
|
||||
def getNames(creditList):
|
||||
|
@ -489,22 +496,6 @@ class IMDb:
|
|||
return ret
|
||||
return {}
|
||||
|
||||
def getTrailer(self):
    """Return the content of self.trailerUrl as fetched by getUrlUnicode()."""
    trailerPage = getUrlUnicode(self.trailerUrl)
    return trailerPage
|
||||
|
||||
def parseTrailer(self):
    """Map absolute link targets on the trailer page to their link text.

    Every <p> of the page returned by self.getTrailer() that contains a
    link and has text is examined. Only the first link of the paragraph
    is used, and only when its href starts with 'http'. The link text
    has 'www.' removed before it is stored.

    Returns a dict of href -> cleaned link text.
    """
    links = {}
    for paragraph in BeautifulSoup(self.getTrailer())('p'):
        if not (paragraph('a') and paragraph.firstText()):
            continue
        anchor = paragraph('a')[0]
        target = anchor['href']
        if target and target.startswith('http'):
            links[target] = anchor.string.replace('www.', '')
    return links
|
||||
|
||||
def guess(title, director=''):
|
||||
#FIXME: proper file -> title
|
||||
title = title.split('-')[0]
|
||||
|
@ -574,21 +565,6 @@ def getEpisodeData(title, episode, show_url = None):
|
|||
episodeData['imdb'] = i['episodes'][episode]['imdb']
|
||||
return episodeData
|
||||
|
||||
def getMovieStills(imdbId):
    """Return full-size URLs of the landscape stills in a title's gallery.

    The gallery page http://imdb.com/gallery/ss/<imdbId> is fetched with
    getUrl() and scanned for thumbnail <img> tags. Only thumbnails wider
    than they are tall are kept. Thumbnails under
    i.imdb.com/Photos/Ss/<imdbId>/ are preferred; when none of them
    qualify, thumbnails ending in "p.jpg" are mapped to "f.jpg" instead.
    """
    html = getUrl("http://imdb.com/gallery/ss/%s" % imdbId)

    def isLandscape(match):
        # match is (width, height, name) as captured by the patterns below.
        return int(match[0]) > int(match[1])

    galleryPattern = re.compile(
        r'''<img width="(\d*?)" height="(\d*?)" src="http://i.imdb.com/Photos/Ss/%s/th-(.*?).jpg"''' % imdbId)
    stills = []
    for match in filter(isLandscape, galleryPattern.findall(html)):
        stills.append("http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (imdbId, match[2]))

    if not stills:
        fallbackPattern = re.compile(
            r'''<img width="(\d*?)" height="(\d*?)" src="http://(.*?)p.jpg"''')
        for match in filter(isLandscape, fallbackPattern.findall(html)):
            stills.append("http://%sf.jpg" % match[2])
    return stills
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
#print parse(sys.argv[1])
|
||||
|
|
Loading…
Reference in a new issue