year and title again

2007-06-22 14:14:29 +00:00 · 2007-06-22 14:14:29 +00:00 · ec63d9fb7c
commit ec63d9fb7c
parent f666c4f61d
1 changed files with 31 additions and 27 deletions
--- a/scrapeit/imdb.py
+++ b/scrapeit/imdb.py
@ -147,7 +147,35 @@ class IMDb:
      print value
      parsed_value = value
    return parsed_value
-      
+  
+  def parseTitle(self):
+    """Return the title text of the fetched page.
+
+    The title is read from the 'tn15title' div, or from the <title> tag
+    when that div is not present. Markup is removed with stripTags, every
+    parenthesised four digit group such as '(1999)' is dropped, and the
+    result is stripped of surrounding whitespace. Returns '' when neither
+    element is found.
+    """
+    soup = BeautifulSoup(self.getPage())
+    # Prefer the dedicated title div; fall back to the <title> tag.
+    matches = soup('div', {'id': 'tn15title'}) or soup('title')
+    if not matches:
+      return ''
+    text = stripTags(str(matches[0]))
+    return re.sub('\(\d\d\d\d\)', '', text).strip()
+    
+  def parseYear(self):
+    """Return the year found in the title element of the fetched page.
+
+    The title element is the 'tn15title' div, or the <title> tag when that
+    div is not present. Returns the year as a four character string, or ''
+    when no title element or no four digit number is found.
+    """
+    year = ''
+    data = self.getPage()
+    soup = BeautifulSoup(data)
+    html_title = soup('div', {'id': 'tn15title'})
+    if not html_title:
+      html_title = soup('title')
+    if html_title:
+      html_title = str(html_title[0])
+      # Prefer a four digit number that directly follows '(' (optionally
+      # separated by tags or whitespace), so digits that are part of the
+      # title itself, as in '2001: A Space Odyssey (1968)', are not
+      # mistaken for the year.
+      found = re.findall(r'\((?:\s*<[^>]*>)*\s*(\d\d\d\d)(?!\d)', html_title)
+      if not found:
+        # No parenthesised year: fall back to the first four digit run
+        # anywhere in the element, which is what this method did before.
+        found = re.findall(r'(\d\d\d\d)', html_title)
+      if found:
+        year = found[0]
+    return year
+  
  def parse(self):
    data = self.getPage()
    IMDbDict ={}
@ -156,32 +184,8 @@ class IMDb:
    if not IMDbDict['poster']:
      IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'  
    #Title, Year
-    title = u''
-    year  = u''
-    soup = BeautifulSoup(data)
-    html_title = soup('div', {'id': 'tn15title'})
-    if html_title: html_title = html_title[0]('h1')
-    if html_title: html_title = html_title[0].contents
-    if html_title:
-      title = html_title[0]
-      year = re.compile('(\d\d\d\d)').findall(str(html_title[1]))
-      if year: year = year[0]
-      else: year = ''
-      IMDbDict['year'] = year
-      IMDbDict['title'] = stripTags(title).strip()
-    else:
-      title = _getTerm(data, '<title>(.*?)</title>')
-      m = re.compile('\((\d+)\)').findall(title)
-      if m:
-        year = m[0]
-      else:
-        year = title.split('(')[-1].split(')')[0].strip()
-      title = title.split('(')[0].strip().decode('utf-8')
-      IMDbDict['title'] = title
-      IMDbDict['year']  = year
-    IMDbDict['title'] = htmldecode(IMDbDict['title'])
-    if IMDbDict['title'][0] == '"' and  IMDbDict['title'][-1] == '"':
-      IMDbDict['title'] =  IMDbDict['title'][1:-1]
+    IMDbDict['year'] = self.parseYear()
+    IMDbDict['title'] = self.parseTitle()
    
    #Rating
    m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)