From ec63d9fb7c18cc4c3a5b7e318da9cfe53916e46d Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Fri, 22 Jun 2007 14:14:29 +0000 Subject: [PATCH] year and title again --- scrapeit/imdb.py | 58 ++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/scrapeit/imdb.py b/scrapeit/imdb.py index 5f83ad7..4e6f1a4 100644 --- a/scrapeit/imdb.py +++ b/scrapeit/imdb.py @@ -147,7 +147,35 @@ class IMDb: print value parsed_value = value return parsed_value - + + def parseTitle(self): + title = '' + data = self.getPage() + soup = BeautifulSoup(data) + html_title = soup('div', {'id': 'tn15title'}) + if not html_title: + html_title = soup('title') + if html_title: + html_title = str(html_title[0]) + title = stripTags(html_title) + title = re.sub('\(\d\d\d\d\)', '', title) + return title.strip() + + def parseYear(self): + year = '' + data = self.getPage() + soup = BeautifulSoup(data) + html_title = soup('div', {'id': 'tn15title'}) + if not html_title: + html_title = soup('title') + if html_title: + html_title = str(html_title[0]) + year = re.compile('(\d\d\d\d)').findall(html_title) + if year: + year = year[0] + else: year = '' + return year + def parse(self): data = self.getPage() IMDbDict ={} @@ -156,32 +184,8 @@ class IMDb: if not IMDbDict['poster']: IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif' #Title, Year - title = u'' - year = u'' - soup = BeautifulSoup(data) - html_title = soup('div', {'id': 'tn15title'}) - if html_title: html_title = html_title[0]('h1') - if html_title: html_title = html_title[0].contents - if html_title: - title = html_title[0] - year = re.compile('(\d\d\d\d)').findall(str(html_title[1])) - if year: year = year[0] - else: year = '' - IMDbDict['year'] = year - IMDbDict['title'] = stripTags(title).strip() - else: - title = _getTerm(data, '(.*?)') - m = re.compile('\((\d+)\)').findall(title) - if m: - year = m[0] - else: - year = title.split('(')[-1].split(')')[0].strip() - title = title.split('(')[0].strip().decode('utf-8') - IMDbDict['title'] = title - IMDbDict['year'] = year - IMDbDict['title'] = htmldecode(IMDbDict['title']) - if IMDbDict['title'][0] == '"' and IMDbDict['title'][-1] == '"': - IMDbDict['title'] = IMDbDict['title'][1:-1] + IMDbDict['year'] = self.parseYear() + IMDbDict['title'] = self.parseTitle() #Rating m = re.compile('(.*?)/10', re.IGNORECASE).search(data)