.*?: (.*?) - TV.com

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import time

from oxlib import stripTags, findRe
from oxlib.cache import readUrlUnicode


def getEpisodeData(url):
    """Parse the information on a TV.com episode page.

    The page at ``url`` is fetched with ``readUrlUnicode`` and the fields
    are pulled out of the returned text with regular expressions.

    Args:
        url: address of a TV.com episode summary page.

    Returns:
        A dict that always has the keys ``description``, ``title`` and
        ``episode score``. The keys ``season`` (int), ``episode`` (int) and
        ``air date`` (``YYYY-MM-DD`` string) are added only when the
        "Episode Number / Season Num / First Aired" line is found.

    Raises:
        ValueError: if the "First Aired" text is found but is not in the
            ``'Wednesday September 29, 2004'`` format (``time.strptime``).

    NOTE(review): earlier documentation of this function also listed a
    ``show`` key, but nothing in this function sets it -- confirm against
    callers.

    Example:
        getEpisodeData('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
    """
    data = readUrlUnicode(url)
    r = {}

    # NOTE(review): the three patterns below look as if HTML markup has been
    # stripped out of them (the description pattern even contains a blank
    # line and two capture groups). They are kept exactly as found; verify
    # them against the upstream source before relying on the results.
    r['description'] = stripTags(
        findRe(data, 'div id="main-col">.*?\n\n(.*?)(.*?)'))
    r['title'] = findRe(data, '.*?: (.*?) - TV.com ')
    # episode score
    r['episode score'] = findRe(data, '(.*?)')

    # Raw string so that ``\d`` reaches the regex engine untouched instead of
    # being an invalid string escape; the pattern text itself is unchanged.
    info = re.search(
        r'Episode Number: (\d*?) Season Num: (\d*?) First Aired: (.*?) ',
        data)
    if info:
        episode, season, first_aired = info.groups()
        r['season'] = int(season)
        r['episode'] = int(episode)
        # 'Wednesday September 29, 2004' -> 2004-09-29
        r['air date'] = time.strftime(
            '%Y-%m-%d', time.strptime(first_aired, '%A %B %d, %Y'))
    return r