year and title again
This commit is contained in:
parent
f666c4f61d
commit
ec63d9fb7c
1 changed files with 31 additions and 27 deletions
|
@ -147,7 +147,35 @@ class IMDb:
|
||||||
print value
|
print value
|
||||||
parsed_value = value
|
parsed_value = value
|
||||||
return parsed_value
|
return parsed_value
|
||||||
|
|
||||||
|
def parseTitle(self):
    """Return the title found on the page from self.getPage(), without its year.

    The <div id="tn15title"> element is tried first; when it is absent the
    document's <title> element is used instead.  Markup is removed with
    stripTags and the parenthesised release year is dropped.

    Returns an empty string when neither element is present.
    """
    soup = BeautifulSoup(self.getPage())
    html_title = soup('div', {'id': 'tn15title'})
    if not html_title:
        html_title = soup('title')
    if not html_title:
        return ''
    title = stripTags(str(html_title[0]))
    # Remove "(1999)" and also the disambiguated form "(1999/I)"; the
    # original pattern only handled the plain form and left the suffixed
    # variant inside the title.
    title = re.sub(r'\(\d{4}(?:/[IVX]+)?\)', '', title)
    return title.strip()
|
||||||
|
|
||||||
|
def parseYear(self):
    """Return the four-digit year found on the page from self.getPage().

    The <div id="tn15title"> element is tried first; when it is absent the
    document's <title> element is used instead.

    Returns the year as a string, or an empty string when no title element
    or no four-digit number is found.
    """
    soup = BeautifulSoup(self.getPage())
    html_title = soup('div', {'id': 'tn15title'})
    if not html_title:
        html_title = soup('title')
    if not html_title:
        return ''
    html_title = str(html_title[0])
    # Prefer a year that directly follows an opening parenthesis in the
    # tag-stripped text, so digits that belong to the title itself
    # (e.g. "2001: A Space Odyssey (1968)") are not mistaken for the year.
    in_parens = re.search(r'\(\s*(\d{4})', stripTags(html_title))
    if in_parens:
        return in_parens.group(1)
    # Fall back to the first four-digit run anywhere in the markup.
    candidates = re.findall(r'(\d\d\d\d)', html_title)
    if candidates:
        return candidates[0]
    return ''
|
||||||
|
|
||||||
def parse(self):
|
def parse(self):
|
||||||
data = self.getPage()
|
data = self.getPage()
|
||||||
IMDbDict ={}
|
IMDbDict ={}
|
||||||
|
@ -156,32 +184,8 @@ class IMDb:
|
||||||
if not IMDbDict['poster']:
|
if not IMDbDict['poster']:
|
||||||
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
|
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
|
||||||
#Title, Year
|
#Title, Year
|
||||||
title = u''
|
IMDbDict['year'] = self.parseYear()
|
||||||
year = u''
|
IMDbDict['title'] = self.parseTitle()
|
||||||
soup = BeautifulSoup(data)
|
|
||||||
html_title = soup('div', {'id': 'tn15title'})
|
|
||||||
if html_title: html_title = html_title[0]('h1')
|
|
||||||
if html_title: html_title = html_title[0].contents
|
|
||||||
if html_title:
|
|
||||||
title = html_title[0]
|
|
||||||
year = re.compile('(\d\d\d\d)').findall(str(html_title[1]))
|
|
||||||
if year: year = year[0]
|
|
||||||
else: year = ''
|
|
||||||
IMDbDict['year'] = year
|
|
||||||
IMDbDict['title'] = stripTags(title).strip()
|
|
||||||
else:
|
|
||||||
title = _getTerm(data, '<title>(.*?)</title>')
|
|
||||||
m = re.compile('\((\d+)\)').findall(title)
|
|
||||||
if m:
|
|
||||||
year = m[0]
|
|
||||||
else:
|
|
||||||
year = title.split('(')[-1].split(')')[0].strip()
|
|
||||||
title = title.split('(')[0].strip().decode('utf-8')
|
|
||||||
IMDbDict['title'] = title
|
|
||||||
IMDbDict['year'] = year
|
|
||||||
IMDbDict['title'] = htmldecode(IMDbDict['title'])
|
|
||||||
if IMDbDict['title'][0] == '"' and IMDbDict['title'][-1] == '"':
|
|
||||||
IMDbDict['title'] = IMDbDict['title'][1:-1]
|
|
||||||
|
|
||||||
#Rating
|
#Rating
|
||||||
m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
|
m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
|
||||||
|
|
Loading…
Reference in a new issue