scrapeit/scrapeit/tvcom.py
2007-03-01 15:11:35 +00:00

34 lines
984 B
Python

# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import re
from BeautifulSoup import BeautifulSoup
from utils import read_url_utf8, stripTags
def getEpisodeData(url):
  '''Parse the information on a tv.com episode page.

  url is handed to read_url_utf8 to obtain the page text (presumably the
  page is fetched and decoded as UTF-8; the helper lives in utils).

  Returns a dict that always contains 'description' (u'' when the
  description markup is not found). 'show', 'title' and 'score' are
  optional: each one is present only when its element exists on the page,
  and a missing one does not prevent the others from being filled in.
  '''
  tvcom = {
    'description': u''
  }
  # Newlines are flattened to spaces so that '.' in the pattern below can
  # run across what used to be line breaks without needing re.DOTALL.
  data = read_url_utf8(url).replace('\n', ' ')
  regexp = r'''<div id="main-col">.*?<div>(.*?)<div class="ta-r mt-10 f-bold">'''
  matches = re.compile(regexp, re.IGNORECASE).findall(data)
  if matches:
    # If the pattern matches more than once, the last match is the one kept.
    description = stripTags(matches[-1].strip()).replace('Watch Video', '')
    tvcom['description'] = description.strip()

  def first_content(tags, index):
    # First child of tags[index], or None when either the tag or its
    # content is absent. Only IndexError is expected here; anything else
    # is a real error and is allowed to propagate.
    try:
      return tags[index].contents[0]
    except IndexError:
      return None

  soup = BeautifulSoup(data)
  headings = soup('h1')
  scores = soup("span", {'class': "f-28 f-bold mt-10 mb-10 f-FF9 db lh-18"})
  # optional data: the first <h1> is taken as the show, the second as the
  # episode title, and the first matching <span> as the score.
  optional = (
    ('show', first_content(headings, 0)),
    ('title', first_content(headings, 1)),
    ('score', first_content(scores, 0)),
  )
  for key, value in optional:
    if value is not None:
      tvcom[key] = value
  return tvcom