parse Original Air Date

This commit is contained in:
j 2007-09-15 17:22:03 +00:00
parent 3e91998dfb
commit 0cf8d6f330

View file

@ -6,6 +6,7 @@ import urllib2
from urllib import quote from urllib import quote
import re, time import re, time
import os import os
import time
from elementtree.ElementTree import parse, tostring from elementtree.ElementTree import parse, tostring
from BeautifulSoup import BeautifulSoup from BeautifulSoup import BeautifulSoup
@ -332,8 +333,7 @@ class IMDb:
def parseEpisodes(self): def parseEpisodes(self):
episodes = {} episodes = {}
cdata = self.getEpisodes().replace('\r\n', ' ') cdata = self.getEpisodes().replace('\r\n', ' ')
regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>.*?</b><br>(.*?)<br/>''' regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
#regexp = r'''Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></span><br>.*?<br>(.*?)</td>'''
reg = re.compile(regexp, re.IGNORECASE) reg = re.compile(regexp, re.IGNORECASE)
m = reg.findall(cdata) m = reg.findall(cdata)
for match in m: for match in m:
@ -344,9 +344,14 @@ class IMDb:
episodes[episode]['title'] = match[3].strip() episodes[episode]['title'] = match[3].strip()
if episodes[episode]['title'].startswith('Episode #%d'%int(match[0])): if episodes[episode]['title'].startswith('Episode #%d'%int(match[0])):
episodes[episode]['title'] = u'' episodes[episode]['title'] = u''
description = htmldecode(match[4]) description = htmldecode(match[5])
description = stripTags(description.split('Next US airings:')[0]) description = stripTags(description.split('Next US airings:')[0])
episodes[episode]['description'] = description episodes[episode]['description'] = description
episodes[episode]['date'] = ''
d = stripTags(match[4])
d = d.replace('Original Air Date: ', '')
d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
episodes[episode]['date'] = d
except: except:
import traceback import traceback
print traceback.print_exc() print traceback.print_exc()