year and title again

This commit is contained in:
j 2007-06-22 14:14:29 +00:00
parent f666c4f61d
commit ec63d9fb7c

View file

@ -147,7 +147,35 @@ class IMDb:
print value print value
parsed_value = value parsed_value = value
return parsed_value return parsed_value
def parseTitle(self):
    """Return the title found on the page, without a "(YYYY)" year marker.

    The <div id="tn15title"> element is preferred; when it is absent the
    document's <title> element is used instead. The chosen element is
    passed through stripTags, any "(YYYY)" group is removed, and the
    result is whitespace-stripped. Returns '' when neither element exists.
    """
    data = self.getPage()
    soup = BeautifulSoup(data)
    html_title = soup('div', {'id': 'tn15title'})
    if not html_title:
        html_title = soup('title')
    if not html_title:
        return ''
    title = stripTags(str(html_title[0]))
    # Raw string: '\(' and '\d' are not valid escapes in a plain literal.
    title = re.sub(r'\(\d\d\d\d\)', '', title)
    return title.strip()
def parseYear(self):
    """Return the four-digit year found in the page's title element, or ''.

    The <div id="tn15title"> element is preferred; when it is absent the
    document's <title> element is used instead. A year written in
    parentheses, e.g. "(1968)" or "(1968/I)", wins over any other digits so
    that numeric titles such as "2001: A Space Odyssey (1968)" do not
    yield "2001". If no parenthesised year exists, the first run of four
    digits in the element's HTML is returned.
    """
    data = self.getPage()
    soup = BeautifulSoup(data)
    html_title = soup('div', {'id': 'tn15title'})
    if not html_title:
        html_title = soup('title')
    if not html_title:
        return ''
    html_title = str(html_title[0])

    # Remove markup first so a linked year, "(<a ...>1968</a>)", reads "(1968)".
    text = re.sub(r'<[^>]*>', '', html_title)
    match = re.search(r'\(\s*(\d{4})\s*[/)]', text)
    if match:
        return match.group(1)

    # Fallback: first four-digit run anywhere in the element's HTML.
    years = re.findall(r'\d{4}', html_title)
    if years:
        return years[0]
    return ''
def parse(self): def parse(self):
data = self.getPage() data = self.getPage()
IMDbDict ={} IMDbDict ={}
@ -156,32 +184,8 @@ class IMDb:
if not IMDbDict['poster']: if not IMDbDict['poster']:
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif' IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
#Title, Year #Title, Year
title = u'' IMDbDict['year'] = self.parseYear()
year = u'' IMDbDict['title'] = self.parseTitle()
soup = BeautifulSoup(data)
html_title = soup('div', {'id': 'tn15title'})
if html_title: html_title = html_title[0]('h1')
if html_title: html_title = html_title[0].contents
if html_title:
title = html_title[0]
year = re.compile('(\d\d\d\d)').findall(str(html_title[1]))
if year: year = year[0]
else: year = ''
IMDbDict['year'] = year
IMDbDict['title'] = stripTags(title).strip()
else:
title = _getTerm(data, '<title>(.*?)</title>')
m = re.compile('\((\d+)\)').findall(title)
if m:
year = m[0]
else:
year = title.split('(')[-1].split(')')[0].strip()
title = title.split('(')[0].strip().decode('utf-8')
IMDbDict['title'] = title
IMDbDict['year'] = year
IMDbDict['title'] = htmldecode(IMDbDict['title'])
if IMDbDict['title'][0] == '"' and IMDbDict['title'][-1] == '"':
IMDbDict['title'] = IMDbDict['title'][1:-1]
#Rating #Rating
m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data) m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)