# scrapeit/scrapeit/tvcom.py

# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

import re

from BeautifulSoup import BeautifulSoup

from utils import read_url_utf8, stripTags

def getEpisodeData(url):
  ''' Parse information from a tv.com episode page.

      url -- address of the episode page; passed straight to read_url_utf8
             (presumably fetches the page and returns it as text -- confirm
             against utils).

      Returns a dict. 'description' is always present (u'' when the
      description markup is not found). 'show', 'title' and 'score' are
      optional: each one is added only when its element exists in the page,
      and its value is the first child node of that element.
  '''
  tvcom = {
    'description': u''
  }
  # Newlines are flattened to spaces so the non-greedy '.*?' groups below can
  # span what were separate lines in the page without needing re.DOTALL.
  data = read_url_utf8(url).replace('\n', ' ')
  regexp = r'''<div id="main-col">.*?<div>(.*?)<div class="ta-r mt-10 f-bold">'''
  matches = re.compile(regexp, re.IGNORECASE).findall(data)
  if matches:
    # If the pattern matches more than once, the last match wins.
    # stripTags presumably removes the HTML markup -- confirm against utils.
    description = stripTags(matches[-1].strip()).replace('Watch Video', '')
    tvcom['description'] = description.strip()

  soup = BeautifulSoup(data)
  headings = soup('h1')
  scores = soup("span", {'class': "f-28 f-bold mt-10 mb-10 f-FF9 db lh-18"})
  # Optional data: every field is looked up on its own, so one missing
  # element (e.g. no second <h1>) does not discard the others.
  optional = (
    ('show', headings, 0),
    ('title', headings, 1),
    ('score', scores, 0),
  )
  for key, tags, index in optional:
    try:
      tvcom[key] = tags[index].contents[0]
    except (IndexError, AttributeError):
      # Element missing or empty: leave the key out of the result. Only
      # these lookup errors are ignored, so KeyboardInterrupt and genuine
      # bugs still propagate.
      pass
  return tvcom