get Trailers
This commit is contained in:
parent
69adaeee00
commit
abf263af13
1 changed file with 61 additions and 85 deletions
130
ox/imdb.py
130
ox/imdb.py
|
@ -4,7 +4,7 @@
|
||||||
|
|
||||||
from oxutils import *
|
from oxutils import *
|
||||||
import urllib2
|
import urllib2
|
||||||
from urllib import quote
|
from urllib import quote, unquote
|
||||||
import re, time
|
import re, time
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
|
@ -39,8 +39,10 @@ def getUrlBase(imdbId):
|
||||||
def getRawMovieData(imdbId):
    """Collect the raw IMDb data for one title into a dictionary.

    imdbId is passed through normalizeImdbId() before it is used.

    Returns a dict with the keys 'title', 'credits', 'poster' and 'trailers',
    filled from getTitle(), getCredits(), getPoster() and getMovieTrailers().
    """
    imdbId = normalizeImdbId(imdbId)
    data = dict()
    data['title'] = getTitle(imdbId)
    data['credits'] = getCredits(imdbId)
    data['poster'] = getPoster(imdbId)
    data['trailers'] = getMovieTrailers(imdbId)
    # Hand the assembled dictionary back; without this the caller got None.
    return data
|
||||||
|
|
||||||
def parseBase(imdbId):
|
def parseBase(imdbId):
|
||||||
data = getUrl(getUrlBase(imdbId))
|
data = getUrl(getUrlBase(imdbId))
|
||||||
|
@ -59,6 +61,10 @@ def parseBase(imdbId):
|
||||||
return info
|
return info
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
def getPoster(imdbId):
    """Look up the poster image address on the title's base IMDb page.

    The page at getUrlBase(imdbId) is fetched with getUrl() and searched with
    findRegexp(); whatever findRegexp() yields is returned unchanged.
    """
    # Captures the src attribute of the <img> that follows name="poster".
    posterPattern = 'name="poster".*?<img .*?src="(.*?)"'
    page = getUrl(getUrlBase(imdbId))
    return findRegexp(page, posterPattern)
|
||||||
|
|
||||||
def getTitle(imdbId):
|
def getTitle(imdbId):
|
||||||
title = ''
|
title = ''
|
||||||
data = getUrl(getUrlBase(imdbId))
|
data = getUrl(getUrlBase(imdbId))
|
||||||
|
@ -96,7 +102,7 @@ def creditList(data, section=None):
|
||||||
credits.append(c)
|
credits.append(c)
|
||||||
return credits
|
return credits
|
||||||
|
|
||||||
def parseCredits(imdbId):
|
def getCredits(imdbId):
|
||||||
credits = dict()
|
credits = dict()
|
||||||
url = "%s/fullcredits" % getUrlBase(imdbId)
|
url = "%s/fullcredits" % getUrlBase(imdbId)
|
||||||
data = getUrlUnicode(url)
|
data = getUrlUnicode(url)
|
||||||
|
@ -107,24 +113,41 @@ def parseCredits(imdbId):
|
||||||
credits[section[0]] = creditList(g, section[0])
|
credits[section[0]] = creditList(g, section[0])
|
||||||
return credits
|
return credits
|
||||||
|
|
||||||
|
def getMovieTrailers(imdbId):
    """Return the trailers linked from the title's IMDb "trailers" page.

    Every anchor inside the first <div class="video-gallery"> is resolved to
    its player page, and the video file address is read from that page.

    Returns a list of dicts with the keys:
        'title'  -- the anchor markup passed through stripTags(), stripped
        'url'    -- absolute address of the video page
        'iframe' -- address of the player page that was fetched
        'flv'    -- unquoted value of the player's "file" variable
    The list is empty when the page has no video gallery.
    """
    trailers = []
    galleryUrl = "%s/trailers" % getUrlBase(imdbId)
    soup = BeautifulSoup(getUrlUnicode(galleryUrl))
    videos = soup('div', {'class': "video-gallery"})
    if not videos:
        return trailers
    for a in videos[0]('a'):
        # Anchors without an href (e.g. named anchors) cannot be trailers.
        href = a.get('href')
        if not href:
            continue
        url = 'http://www.imdb.com' + href
        videoId = findRegexp(url, r'/(vi\d*?)/')
        if not videoId:
            # No video id in the link: skip it rather than request a
            # malformed ".../trailer//player" address.
            continue
        title = stripTags(unicode(a)).strip()
        iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
        iframe = getUrlUnicode(iframeUrl)
        # NOTE(review): findRegexp() may return None on a miss -- confirm in
        # oxutils; "or ''" keeps unquote() from failing in that case.
        videoUrl = unquote(findRegexp(iframe, r'addVariable\("file", "(.*?)"') or '')
        trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv': videoUrl})
    return trailers
|
||||||
|
|
||||||
|
def getMovieStills(imdbId):
    """Return the addresses of the landscape stills in the IMDb gallery.

    The gallery page for imdbId is fetched and scanned for thumbnail <img>
    tags. Only thumbnails whose width exceeds their height are kept, and each
    is mapped to its full-size address. If the first thumbnail layout
    (".../Photos/Ss/<id>/th-<name>.jpg") yields nothing, the second layout
    ("...p.jpg" thumbnails with "...f.jpg" full-size images) is tried.

    Returns a list of URL strings, possibly empty.
    """
    url = "http://www.imdb.com/gallery/ss/%s" % imdbId
    data = getUrlUnicode(url)

    def landscape(matches):
        # Keep the captured name of every thumbnail that is wider than tall.
        return [name for width, height, name in matches
                if int(width) > int(height)]

    # \d+ instead of \d*? so int() never receives an empty size attribute;
    # the id and the literal dots are escaped so they only match themselves.
    thumbs = re.compile(
        r'<img width="(\d+)" height="(\d+)" '
        r'src="http://i\.imdb\.com/Photos/Ss/%s/th-(.*?)\.jpg"'
        % re.escape("%s" % imdbId)
    ).findall(data)
    stills = ["http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (imdbId, name)
              for name in landscape(thumbs)]
    if not stills:
        thumbs = re.compile(
            r'<img width="(\d+)" height="(\d+)" src="http://(.*?)p\.jpg"'
        ).findall(data)
        stills = ["http://%sf.jpg" % name for name in landscape(thumbs)]
    return stills
|
||||||
|
|
||||||
'''the old code below'''
|
'''the old code below'''
|
||||||
|
|
||||||
def get_image(url):
    """Fetch url with getUrl() and return the result unchanged."""
    image = getUrl(url)
    return image
|
|
||||||
|
|
||||||
def _castList(data, regexp):
    """List the distinct person names linked in the first match of regexp.

    regexp is searched in data; the first match is parsed with BeautifulSoup
    and every anchor whose href matches '/name/nm' and that has a .string
    contributes its stripTags()-cleaned text once, in order of appearance.

    Returns a list of names, or an empty list when regexp does not match.
    """
    found = re.compile(regexp).findall(data)
    if not found:
        return []
    fragment = BeautifulSoup(found[0])
    names = []
    for link in fragment('a', {'href': re.compile('/name/nm')}):
        text = link.string
        if not text:
            # Anchors without a plain string are skipped.
            continue
        name = stripTags(text)
        if name not in names:
            names.append(name)
    return names
|
|
||||||
|
|
||||||
class IMDb:
|
class IMDb:
|
||||||
def __init__(self, imdbId):
|
def __init__(self, imdbId):
|
||||||
self.imdb = imdbId
|
self.imdb = imdbId
|
||||||
|
@ -140,7 +163,6 @@ class IMDb:
|
||||||
self.triviaUrl = "%strivia" % self.pageUrl
|
self.triviaUrl = "%strivia" % self.pageUrl
|
||||||
self.locationUrl = "%slocations" % self.pageUrl
|
self.locationUrl = "%slocations" % self.pageUrl
|
||||||
self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
|
self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
|
||||||
self.trailerUrl = "%strailers" % self.pageUrl
|
|
||||||
|
|
||||||
def getPage(self):
|
def getPage(self):
|
||||||
return getUrlUnicode(self.pageUrl)
|
return getUrlUnicode(self.pageUrl)
|
||||||
|
@ -163,8 +185,10 @@ class IMDb:
|
||||||
parsed_value = int(parsed_value) * 60
|
parsed_value = int(parsed_value) * 60
|
||||||
elif key in ('country', 'language'):
|
elif key in ('country', 'language'):
|
||||||
parsed_value = value.split(' / ')
|
parsed_value = value.split(' / ')
|
||||||
|
parsed_value = [v.strip() for v in parsed_value]
|
||||||
elif key == 'genre':
|
elif key == 'genre':
|
||||||
parsed_value = value.replace('more', '').strip().split(' / ')
|
parsed_value = value.replace('more', '').strip().split(' / ')
|
||||||
|
parsed_value = [v.strip() for v in parsed_value]
|
||||||
elif key == 'tagline':
|
elif key == 'tagline':
|
||||||
parsed_value = value.replace('more', '').strip()
|
parsed_value = value.replace('more', '').strip()
|
||||||
elif key == 'plot_outline':
|
elif key == 'plot_outline':
|
||||||
|
@ -187,7 +211,7 @@ class IMDb:
|
||||||
if m:
|
if m:
|
||||||
parsed_value = m[0]
|
parsed_value = m[0]
|
||||||
parsed_value = parsed_value.split('<br />')[-1].split('(')[0]
|
parsed_value = parsed_value.split('<br />')[-1].split('(')[0]
|
||||||
director = self.parseCredits().get('director', None)
|
director = self.getCredits().get('director', None)
|
||||||
if director:
|
if director:
|
||||||
director = director[0]
|
director = director[0]
|
||||||
parsed_value = parsed_value.replace(director, '')
|
parsed_value = parsed_value.replace(director, '')
|
||||||
|
@ -200,26 +224,9 @@ class IMDb:
|
||||||
return parsed_value
|
return parsed_value
|
||||||
|
|
||||||
def parseTitle(self):
|
def parseTitle(self):
|
||||||
title = ''
|
title = getTitle(self.imdb)
|
||||||
data = self.getPage()
|
title = normalizeTitle(title)
|
||||||
soup = BeautifulSoup(data)
|
if title.startswith('"') and title.find('"',1) > 0 and \
|
||||||
html_title = soup('div', {'id': 'tn15title'})
|
|
||||||
if not html_title:
|
|
||||||
html_title = soup('title')
|
|
||||||
if html_title:
|
|
||||||
html_title = str(html_title[0])
|
|
||||||
html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
|
|
||||||
title = stripTags(html_title)
|
|
||||||
title = re.sub('\(\d{4}\)', '', title)
|
|
||||||
title = re.sub('\(\d{4}/I*\)', '', title)
|
|
||||||
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
|
|
||||||
title = title.replace(t, '')
|
|
||||||
if title.find(u'\xa0') > -1:
|
|
||||||
title = title[:title.find(u'\xa0')]
|
|
||||||
title = normalizeTitle(title.strip())
|
|
||||||
if title.startswith('"') and title.endswith('"'):
|
|
||||||
title = normalizeTitle(title[1:-1])
|
|
||||||
elif title.startswith('"') and title.find('"',1) > 0 and \
|
|
||||||
title.find('"',1) == title.rfind('"'):
|
title.find('"',1) == title.rfind('"'):
|
||||||
se = re.compile("Season (\d*), Episode (\d*)\)").findall(data)
|
se = re.compile("Season (\d*), Episode (\d*)\)").findall(data)
|
||||||
if se:
|
if se:
|
||||||
|
@ -252,7 +259,7 @@ class IMDb:
|
||||||
data = self.getPage()
|
data = self.getPage()
|
||||||
IMDbDict ={}
|
IMDbDict ={}
|
||||||
#Poster
|
#Poster
|
||||||
IMDbDict['poster'] = findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')
|
IMDbDict['poster'] = getPoster(self.imdb)
|
||||||
if not IMDbDict['poster']:
|
if not IMDbDict['poster']:
|
||||||
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
|
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
|
||||||
#Title, Year
|
#Title, Year
|
||||||
|
@ -295,7 +302,7 @@ class IMDb:
|
||||||
IMDbDict['tvshow'] = True
|
IMDbDict['tvshow'] = True
|
||||||
else:
|
else:
|
||||||
IMDbDict['tvshow'] = False
|
IMDbDict['tvshow'] = False
|
||||||
IMDbDict['credits'] = self.parseCredits()
|
IMDbDict['credits'] = self.getCredits()
|
||||||
IMDbDict['plot'] = self.parsePlot()
|
IMDbDict['plot'] = self.parsePlot()
|
||||||
IMDbDict['keywords'] = self.parseKeywords()
|
IMDbDict['keywords'] = self.parseKeywords()
|
||||||
|
|
||||||
|
@ -316,8 +323,8 @@ class IMDb:
|
||||||
IMDbDict[key] = episode_of[key]
|
IMDbDict[key] = episode_of[key]
|
||||||
return self.IMDbDict
|
return self.IMDbDict
|
||||||
|
|
||||||
def parseCredits(self):
|
def getCredits(self):
|
||||||
raw_credits = parseCredits(self.imdb)
|
raw_credits = getCredits(self.imdb)
|
||||||
credits = {}
|
credits = {}
|
||||||
|
|
||||||
def getNames(creditList):
|
def getNames(creditList):
|
||||||
|
@ -489,22 +496,6 @@ class IMDb:
|
||||||
return ret
|
return ret
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
def getTrailer(self):
    """Fetch the page at self.trailerUrl with getUrlUnicode() and return it."""
    page = getUrlUnicode(self.trailerUrl)
    return page
|
|
||||||
|
|
||||||
def parseTrailer(self):
    """Map external trailer links on the trailer page to their titles.

    Each <p> that has at least one anchor and some text is inspected; when
    the href of its first anchor starts with 'http', the anchor's string
    (with 'www.' removed) is stored under that href.

    Returns a dict of href -> title, empty when nothing qualifies.
    """
    soup = BeautifulSoup(self.getTrailer())
    links = {}
    for paragraph in soup('p'):
        anchors = paragraph('a')
        if not (anchors and paragraph.firstText()):
            continue
        anchor = anchors[0]
        href = anchor['href']
        # Only absolute http links are kept.
        if not href or not href.startswith('http'):
            continue
        links[href] = anchor.string.replace('www.', '')
    return links
|
|
||||||
|
|
||||||
def guess(title, director=''):
|
def guess(title, director=''):
|
||||||
#FIXME: proper file -> title
|
#FIXME: proper file -> title
|
||||||
title = title.split('-')[0]
|
title = title.split('-')[0]
|
||||||
|
@ -574,21 +565,6 @@ def getEpisodeData(title, episode, show_url = None):
|
||||||
episodeData['imdb'] = i['episodes'][episode]['imdb']
|
episodeData['imdb'] = i['episodes'][episode]['imdb']
|
||||||
return episodeData
|
return episodeData
|
||||||
|
|
||||||
def getMovieStills(imdbId):
    """Return the addresses of the landscape stills in the IMDb gallery.

    The gallery page for imdbId is fetched with getUrl() and scanned for
    thumbnail <img> tags. Only thumbnails whose width exceeds their height
    are kept, and each is mapped to its full-size address. If the first
    thumbnail layout (".../Photos/Ss/<id>/th-<name>.jpg") yields nothing, the
    second layout ("...p.jpg" thumbnails, "...f.jpg" full size) is tried.

    Returns a list of URL strings, possibly empty.
    """
    data = getUrl("http://imdb.com/gallery/ss/%s" % imdbId)

    def landscape(matches):
        # Keep the captured name of every thumbnail that is wider than tall.
        return [name for width, height, name in matches
                if int(width) > int(height)]

    # \d+ instead of \d*? so int() never receives an empty size attribute;
    # the id and the literal dots are escaped so they only match themselves.
    thumbs = re.compile(
        r'<img width="(\d+)" height="(\d+)" '
        r'src="http://i\.imdb\.com/Photos/Ss/%s/th-(.*?)\.jpg"'
        % re.escape("%s" % imdbId)
    ).findall(data)
    stills = ["http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (imdbId, name)
              for name in landscape(thumbs)]
    if not stills:
        thumbs = re.compile(
            r'<img width="(\d+)" height="(\d+)" src="http://(.*?)p\.jpg"'
        ).findall(data)
        stills = ["http://%sf.jpg" % name for name in landscape(thumbs)]
    return stills
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
import sys
|
import sys
|
||||||
#print parse(sys.argv[1])
|
#print parse(sys.argv[1])
|
||||||
|
|
Loading…
Reference in a new issue