From 0cf8d6f330f1f096bbf277d0251d4e8a8f49b762 Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Sat, 15 Sep 2007 17:22:03 +0000 Subject: [PATCH] parse Original Air Date --- scrapeit/imdb.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/scrapeit/imdb.py b/scrapeit/imdb.py index 7de45d7..dc198d3 100644 --- a/scrapeit/imdb.py +++ b/scrapeit/imdb.py @@ -6,6 +6,7 @@ import urllib2 from urllib import quote import re, time import os +import time from elementtree.ElementTree import parse, tostring from BeautifulSoup import BeautifulSoup @@ -331,9 +332,8 @@ class IMDb: def parseEpisodes(self): episodes = {} - cdata = self.getEpisodes().replace('\r\n',' ') - regexp = r'''

Season (.*?), Episode (.*?): (.*?)

.*?
(.*?)
''' - #regexp = r'''Season (.*?), Episode (.*?): (.*?)
.*?
(.*?)''' + cdata = self.getEpisodes().replace('\r\n', ' ') + regexp = r'''

Season (.*?), Episode (.*?): (.*?)

(.*?)
(.*?)
''' reg = re.compile(regexp, re.IGNORECASE) m = reg.findall(cdata) for match in m: @@ -344,9 +344,14 @@ class IMDb: episodes[episode]['title'] = match[3].strip() if episodes[episode]['title'].startswith('Episode #%d'%int(match[0])): episodes[episode]['title'] = u'' - description = htmldecode(match[4]) + description = htmldecode(match[5]) description = stripTags(description.split('Next US airings:')[0]) episodes[episode]['description'] = description + episodes[episode]['date'] = '' + d = stripTags(match[4]) + d = d.replace('Original Air Date: ', '') + d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y')) + episodes[episode]['date'] = d except: import traceback print traceback.print_exc()