year and title again

2007-06-22 14:14:29 +00:00 · 2007-06-22 14:14:29 +00:00 · ec63d9fb7c
commit ec63d9fb7c
parent f666c4f61d
1 changed files with 31 additions and 27 deletions
--- a/scrapeit/imdb.py
+++ b/scrapeit/imdb.py
@ -148,6 +148,34 @@ class IMDb:
      parsed_value = value
    return parsed_value
  def parseTitle(self):
    """Return the title text scraped from the fetched page.

    The title is taken from the first ``<div id="tn15title">`` element, or
    from the document's ``<title>`` element when no such div is present.
    Markup is removed with ``stripTags`` and any parenthesised four-digit
    number (e.g. ``(1999)``) is dropped. Returns ``''`` when neither element
    is found.
    """
    title = ''
    data = self.getPage()
    soup = BeautifulSoup(data)
    # Prefer the dedicated title block; fall back to the document <title>.
    html_title = soup('div', {'id': 'tn15title'})
    if not html_title:
      html_title = soup('title')
    if html_title:
      html_title = str(html_title[0])
      title = stripTags(html_title)
      # Raw string so the backslashes reach the regex engine unchanged
      # instead of relying on unrecognised string escapes.
      title = re.sub(r'\(\d\d\d\d\)', '', title)
    return title.strip()
  def parseYear(self):
    """Return the four-digit year found in the page's title element.

    The title element is the first ``<div id="tn15title">``, or the
    document's ``<title>`` when no such div is present. A four-digit number
    enclosed in parentheses is preferred, so that digits that are part of
    the title itself (e.g. ``2001: A Space Odyssey (1968)``) are not mistaken
    for the year. If no parenthesised year is found, the first four-digit
    run in the element is returned. Returns ``''`` when nothing matches or
    no title element exists.
    """
    data = self.getPage()
    soup = BeautifulSoup(data)
    html_title = soup('div', {'id': 'tn15title'})
    if not html_title:
      html_title = soup('title')
    if not html_title:
      return ''
    html_title = str(html_title[0])
    # This runs on the raw HTML, so tolerate tags/whitespace between the
    # parentheses and the digits in case the year is wrapped in markup.
    # A trailing '/' is accepted as well as ')' to cover forms like (1999/I).
    in_parens = re.search(
      r'\((?:\s|<[^>]*>)*(\d\d\d\d)(?:\s|<[^>]*>)*[/)]', html_title)
    if in_parens:
      return in_parens.group(1)
    # Fallback: first four-digit run anywhere in the element, which was the
    # only strategy before the parenthesised match was added.
    candidates = re.findall(r'\d\d\d\d', html_title)
    if candidates:
      return candidates[0]
    return ''
  def parse(self):
    data = self.getPage()
    IMDbDict ={}
@ -156,32 +184,8 @@ class IMDb:
    if not IMDbDict['poster']:
      IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'  
    #Title, Year
-    title = u''
+    IMDbDict['year'] = self.parseYear()
-    year  = u''
+    IMDbDict['title'] = self.parseTitle()
    soup = BeautifulSoup(data)
    html_title = soup('div', {'id': 'tn15title'})
    if html_title: html_title = html_title[0]('h1')
    if html_title: html_title = html_title[0].contents
    if html_title:
      title = html_title[0]
      year = re.compile('(\d\d\d\d)').findall(str(html_title[1]))
      if year: year = year[0]
      else: year = ''
      IMDbDict['year'] = year
      IMDbDict['title'] = stripTags(title).strip()
    else:
      title = _getTerm(data, '<title>(.*?)</title>')
      m = re.compile('\((\d+)\)').findall(title)
      if m:
        year = m[0]
      else:
        year = title.split('(')[-1].split(')')[0].strip()
      title = title.split('(')[0].strip().decode('utf-8')
      IMDbDict['title'] = title
      IMDbDict['year']  = year
    IMDbDict['title'] = htmldecode(IMDbDict['title'])
    if IMDbDict['title'][0] == '"' and  IMDbDict['title'][-1] == '"':
      IMDbDict['title'] =  IMDbDict['title'][1:-1]
    #Rating
    m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)