2010-07-07 23:25:57 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
# vi:si:et:sw=4:sts=4:ts=4
|
|
|
|
import re
|
|
|
|
import time
|
|
|
|
|
2012-08-14 14:12:43 +00:00
|
|
|
from ox import strip_tags, find_re
|
2012-08-14 13:58:05 +00:00
|
|
|
from ox.cache import read_url
|
2010-07-07 23:25:57 +00:00
|
|
|
|
|
|
|
|
2012-08-15 15:15:40 +00:00
|
|
|
def get_episode_data(url):
    '''
    Parse information from a tv.com episode page.

    The page at url is fetched with read_url and the fields are pulled out
    with regular expressions.

    Returns a dict with the keys 'description', 'show', 'title' and
    'episode score'. If the page contains an
    "Episode Number: N Season Num: N First Aired: <date>" line, the dict
    additionally has 'season' and 'episode' (ints) and 'air date'
    (a 'YYYY-MM-DD' string).

    Raises ValueError (from time.strptime) if the first-aired date is not
    of the form 'Wednesday September 29, 2004'.

    example:
    get_episode_data('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
    '''
    data = read_url(url, unicode=True)
    r = {}
    # Only the text before the first carriage return of the matched <div>
    # is kept as the description.
    r['description'] = strip_tags(find_re(data, r'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
    r['show'] = find_re(data, r'<h1>(.*?)</h1>')
    r['title'] = find_re(data, r'<title>.*?: (.*?) - TV.com </title>')
    # episode score
    r['episode score'] = find_re(data, r'<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')

    # \d+ rather than \d*? so that an empty number can never be captured
    # and passed to int(); a line without numbers simply does not match and
    # the season/episode/air date keys are left out.
    # The date group is terminated by two consecutive spaces.
    match = re.search(r'Episode Number: (\d+) Season Num: (\d+) First Aired: (.*?)  ', data)
    if match:
        episode, season, first_aired = match.groups()
        r['season'] = int(season)
        r['episode'] = int(episode)
        # 'Wednesday September 29, 2004' -> 2004-09-29
        # NOTE(review): %A and %B are locale dependent; this presumably
        # expects an English locale - confirm.
        r['air date'] = time.strftime('%Y-%m-%d', time.strptime(first_aired, '%A %B %d, %Y'))
    return r
|
|
|
|
|