From cde21b03633d207491988c4757beeb732ceb2322 Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Sat, 5 Jul 2008 15:35:46 +0200 Subject: [PATCH] parse title and add some tests --- oxweb/dailymotion.py | 8 ++--- oxweb/imdb.py | 76 ++++++++++++++++++++++++++++++-------------- 2 files changed, 57 insertions(+), 27 deletions(-) diff --git a/oxweb/dailymotion.py b/oxweb/dailymotion.py index 7f9f3cb..c3bc2e7 100644 --- a/oxweb/dailymotion.py +++ b/oxweb/dailymotion.py @@ -7,11 +7,11 @@ from oxlib.cache import getUrl def getVideoUrl(url): ''' - >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms') - 'http://www.dailymotion.com/get/16/320x240/flv/6191379.flv?key=0a710ad6ffbfe980b1252569d16f957313399d0' + >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?key')[0] + 'http://www.dailymotion.com/get/16/320x240/flv/6191379.flv' - >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms') - 'http://www.dailymotion.com/get/15/320x240/flv/6197800.flv?key=08a18365ca6962c5ff7526f69872c36813399d4' + >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?key')[0] + 'http://www.dailymotion.com/get/15/320x240/flv/6197800.flv' ''' data = getUrl(url) video = re.compile('''video", "(.*?)"''').findall(data) diff --git a/oxweb/imdb.py b/oxweb/imdb.py index 66fc503..08a315b 100644 --- a/oxweb/imdb.py +++ b/oxweb/imdb.py @@ -74,14 +74,16 @@ def getMovieInfo(imdbId): if k.endswith('more'): k=k[:-len('more')].strip() return k txt = cleanUp(txt) - if title not in ('plot', 'trivia', 'filming locations', 'mpaa', 'tagline'): + if title not in ('plot', 'trivia', 'filming locations', 'mpaa', 'tagline', 'original air date'): if '|' in txt: txt = [cleanUp(k) for k in txt.split('|')] elif ', ' in txt: txt = [cleanUp(k) for k in txt.split(', ')] + if title == 'original air date': + txt = txt.split('\n')[0].strip() if not title.startswith('moviemeter'): info[title] = txt - for key in ('user comments', 'writers (wga)'): + for key in ('user comments', 'writers (wga)', 'plot keywords'): if key in info: del info[key] if 'release date' in info: @@ -115,8 +117,30 @@ def getMovieInfo(imdbId): title = title[:title.find(u'\xa0')].strip() if title.startswith('"') and title.endswith('"'): title = title[1:-1] - info['title'] = title + info['title'] = normalizeTitle(title) info['year'] = year + if title.startswith('"') and title.find('"',1) > 0 and \ + title.find('"',1) == title.rfind('"'): + episode_title = title[title.rfind('"')+1:] + episode_title = re.sub("\?{4}", "", episode_title).strip() + episode_title = re.sub("\d{4}", "", episode_title).strip() + if episode_title == '-': episode_title='' + title = normalizeTitle(title[1:title.rfind('"')]) + if episode_title: + info['episode title'] = episode_title + info['series title'] = title + info['title'] = "%s: %s" % (title, episode_title) + else: + info['title'] = title + + #Series + se = re.compile("Season (\d*), Episode (\d*)\)").findall(data) + if se: + info['season'] = int(se[0][0]) + info['episode'] = int(se[0][1]) + info['title'] = "%s (S%02dE%02d) %s" % ( + info['series title'], info['season'], info['episode'], info['episode title']) + info['title'] = info['title'].strip() #Rating rating = findRe(data, '([\d\.]*?)/10') @@ -131,17 +155,42 @@ def getMovieInfo(imdbId): info['votes'] = int(votes.replace(',', '')) else: info['votes'] = -1 + return info + def getMoviePoster(imdbId): info = getMovieInfo(imdbId) return info['poster'] def getMovieYear(imdbId): + ''' + >>> getMovieYear('0315404') + u'1964' + + >>> getMovieYear('0734840') + u'1990' + + >>> getMovieYear('0815352') + u'1964' + ''' info = getMovieInfo(imdbId) return info['year'] def getMovieTitle(imdbId): + ''' + >>> getMovieTitle('0306414') + u'The Wire' + + >>> getMovieTitle('0734840') + u'Twin Peaks (S01E02) Episode #1.2' + + >>> getMovieTitle('0734840') + u'Twin Peaks (S01E02) Episode #1.2' + + >>> getMovieTitle('0749451') + u'The Wire (S01E01) The Target' + ''' info = getMovieInfo(imdbId) return info['title'] @@ -474,25 +523,6 @@ class IMDb: parsed_value = value return parsed_value - def parseTitle(self): - title = getMovieTitle(self.imdb) - title = normalizeTitle(title) - if title.startswith('"') and title.find('"',1) > 0 and \ - title.find('"',1) == title.rfind('"'): - data = self.getPage() - se = re.compile("Season (\d*), Episode (\d*)\)").findall(data) - if se: - se = se[0] - se = ' (S%02dE%02d) ' % (int(se[0]), int(se[1])) - title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:].strip() - else: - part2 = title[title.rfind('"')+1:] - part2 = re.sub("[\d\?-]", "", part2).strip() - title = normalizeTitle(title[1:title.rfind('"')]) - if part2: - title += ':' + part2 - return normalizeTitle(title) - def parseYear(self): year = '' data = self.getPage() @@ -520,7 +550,7 @@ class IMDb: IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif' #Title, Year IMDbDict['year'] = self.parseYear() - IMDbDict['title'] = self.parseTitle() + IMDbDict['title'] = getMovieTitle(self.imdb) #Rating m = re.compile('(.*?)/10', re.IGNORECASE).search(data)