parse Original Air Date

2007-09-15 17:22:03 +00:00 · 2007-09-15 17:22:03 +00:00 · 0cf8d6f330
commit 0cf8d6f330
parent 3e91998dfb
1 changed files with 9 additions and 4 deletions
--- a/scrapeit/imdb.py
+++ b/scrapeit/imdb.py
@@ -6,6 +6,7 @@ import urllib2
 from urllib import quote
 import re, time
 import os
+import time

 from elementtree.ElementTree import parse, tostring
 from BeautifulSoup import BeautifulSoup
@@ -331,9 +332,8 @@ class IMDb:
    
  def parseEpisodes(self):
    episodes = {}
-    cdata = self.getEpisodes().replace('\r\n',' ')
-    regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>.*?</b><br>(.*?)<br/>'''
-    #regexp = r'''Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></span><br>.*?<br>(.*?)</td>'''
+    cdata = self.getEpisodes().replace('\r\n', ' ')
+    regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
    reg = re.compile(regexp, re.IGNORECASE)
    m = reg.findall(cdata)
    for match in m:
@@ -344,9 +344,14 @@ class IMDb:
        episodes[episode]['title'] = match[3].strip()
        if episodes[episode]['title'].startswith('Episode #%d'%int(match[0])):
          episodes[episode]['title'] = u''
-        description = htmldecode(match[4])
+        description = htmldecode(match[5])
        description = stripTags(description.split('Next US airings:')[0])
        episodes[episode]['description'] = description
+        episodes[episode]['date'] = ''
+        d = stripTags(match[4])
+        d = d.replace('Original Air Date: ', '')
+        d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
+        episodes[episode]['date'] = d
      except:
        import traceback
        print traceback.print_exc()