parse Original Air Date

This commit is contained in:
j 2007-09-15 17:22:03 +00:00
parent 3e91998dfb
commit 0cf8d6f330
1 changed file with 9 additions and 4 deletions

View File

@@ -6,6 +6,7 @@ import urllib2
from urllib import quote
import re, time
import os
import time
from elementtree.ElementTree import parse, tostring
from BeautifulSoup import BeautifulSoup
@@ -331,9 +332,8 @@ class IMDb:
def parseEpisodes(self):
episodes = {}
cdata = self.getEpisodes().replace('\r\n',' ')
regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>.*?</b><br>(.*?)<br/>'''
#regexp = r'''Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></span><br>.*?<br>(.*?)</td>'''
cdata = self.getEpisodes().replace('\r\n', ' ')
regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
reg = re.compile(regexp, re.IGNORECASE)
m = reg.findall(cdata)
for match in m:
@@ -344,9 +344,14 @@ class IMDb:
episodes[episode]['title'] = match[3].strip()
if episodes[episode]['title'].startswith('Episode #%d'%int(match[0])):
episodes[episode]['title'] = u''
description = htmldecode(match[4])
description = htmldecode(match[5])
description = stripTags(description.split('Next US airings:')[0])
episodes[episode]['description'] = description
episodes[episode]['date'] = ''
d = stripTags(match[4])
d = d.replace('Original Air Date: ', '')
d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
episodes[episode]['date'] = d
except:
import traceback
print traceback.print_exc()