From abf263af138679121d01aed5268d10198cb9c92d Mon Sep 17 00:00:00 2001 From: j Date: Tue, 29 Apr 2008 21:09:10 +0200 Subject: [PATCH] get Trailers --- ox/imdb.py | 146 ++++++++++++++++++++++------------------------------- 1 file changed, 61 insertions(+), 85 deletions(-) diff --git a/ox/imdb.py b/ox/imdb.py index bcb77e1..8fc9ade 100644 --- a/ox/imdb.py +++ b/ox/imdb.py @@ -4,7 +4,7 @@ from oxutils import * import urllib2 -from urllib import quote +from urllib import quote, unquote import re, time import os import time @@ -39,8 +39,10 @@ def getUrlBase(imdbId): def getRawMovieData(imdbId): imdbId = normalizeImdbId(imdbId) data = dict() - data['credits'] = parseCredits(imdbId) - data['poster'] = findRegexp(data, 'name="poster".*? int(s[1]): + stills.append("http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (imdbId, s[2])) + if not stills: + s_ = re.compile(''' int(s[1]): + stills.append("http://%sf.jpg" % s[2]) + return stills + '''the old code below''' -def get_image(url): - return getUrl(url) - -def _castList(data, regexp): - soup = re.compile(regexp).findall(data) - if soup: - soup = BeautifulSoup(soup[0]) - names = [] - for i in soup('a', {'href': re.compile('/name/nm')}): - if i.string: - cast = stripTags(i.string) - if cast not in names: - names.append(cast) - return names - return [] - class IMDb: def __init__(self, imdbId): self.imdb = imdbId @@ -140,7 +163,6 @@ class IMDb: self.triviaUrl = "%strivia" % self.pageUrl self.locationUrl = "%slocations" % self.pageUrl self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl - self.trailerUrl = "%strailers" % self.pageUrl def getPage(self): return getUrlUnicode(self.pageUrl) @@ -163,8 +185,10 @@ class IMDb: parsed_value = int(parsed_value) * 60 elif key in ('country', 'language'): parsed_value = value.split(' / ') + parsed_value = [v.strip() for v in parsed_value] elif key == 'genre': parsed_value = value.replace('more', '').strip().split(' / ') + parsed_value = [v.strip() for v in parsed_value] elif key == 'tagline': parsed_value = value.replace('more', '').strip() elif key == 'plot_outline': @@ -187,7 +211,7 @@ class IMDb: if m: parsed_value = m[0] parsed_value = parsed_value.split('
')[-1].split('(')[0] - director = self.parseCredits().get('director', None) + director = self.getCredits().get('director', None) if director: director = director[0] parsed_value = parsed_value.replace(director, '') @@ -200,34 +224,17 @@ class IMDb: return parsed_value def parseTitle(self): - title = '' - data = self.getPage() - soup = BeautifulSoup(data) - html_title = soup('div', {'id': 'tn15title'}) - if not html_title: - html_title = soup('title') - if html_title: - html_title = str(html_title[0]) - html_title = html_title.replace('
', ' ').replace(' ', ' ') - title = stripTags(html_title) - title = re.sub('\(\d{4}\)', '', title) - title = re.sub('\(\d{4}/I*\)', '', title) - for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'): - title = title.replace(t, '') - if title.find(u'\xa0') > -1: - title = title[:title.find(u'\xa0')] - title = normalizeTitle(title.strip()) - if title.startswith('"') and title.endswith('"'): - title = normalizeTitle(title[1:-1]) - elif title.startswith('"') and title.find('"',1) > 0 and \ - title.find('"',1) == title.rfind('"'): - se = re.compile("Season (\d*), Episode (\d*)\)").findall(data) - if se: - se = se[0] - se = ' (S%02dE%02d)' % (int(se[0]), int(se[1])) - title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:] - else: - title = normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:] + title = getTitle(self.imdb) + title = normalizeTitle(title) + if title.startswith('"') and title.find('"',1) > 0 and \ + title.find('"',1) == title.rfind('"'): + se = re.compile("Season (\d*), Episode (\d*)\)").findall(data) + if se: + se = se[0] + se = ' (S%02dE%02d)' % (int(se[0]), int(se[1])) + title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:] + else: + title = normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:] return normalizeTitle(title) def parseYear(self): @@ -252,7 +259,7 @@ class IMDb: data = self.getPage() IMDbDict ={} #Poster - IMDbDict['poster'] = findRegexp(data, 'name="poster".*? title title = title.split('-')[0] @@ -574,21 +565,6 @@ def getEpisodeData(title, episode, show_url = None): episodeData['imdb'] = i['episodes'][episode]['imdb'] return episodeData -def getMovieStills(imdbId): - data = getUrl("http://imdb.com/gallery/ss/%s" % imdbId) - s_ = re.compile(''' int(s[1]): - stills.append("http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (imdbId, s[2])) - if not stills: - s_ = re.compile(''' int(s[1]): - stills.append("http://%sf.jpg" % s[2]) - return stills - if __name__ == '__main__': import sys #print parse(sys.argv[1])