trailer

2007-11-30 11:39:50 +00:00 · 2007-11-30 11:39:50 +00:00 · 482599169b
commit 482599169b
parent 73ec7e7aeb
2 changed files with 49 additions and 3 deletions
--- a/scrapeit/imdb.py
+++ b/scrapeit/imdb.py
@ -110,6 +110,8 @@ class IMDb:
    self.locationUrl = "%slocations" % self.pageUrl
    self.externalreviewsSource = None
    self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
    self.trailerSource = None
    self.trailerUrl = "%strailers" % self.pageUrl
  def getPage(self, forcereload = False):
    if forcereload or not self.pageSource:
@ -272,6 +274,7 @@ class IMDb:
    IMDbDict['release_date'] = self.parseReleaseinfo()
    IMDbDict['business'] = self.parseBusiness()
    IMDbDict['reviews'] = self.parseExternalreviews()
    #IMDbDict['trailers'] = self.parseTrailers()
    self.IMDbDict = IMDbDict
    if IMDbDict['episode_of']:
@ -492,6 +495,24 @@ class IMDb:
      return ret
    return {}
  def getTrailers(self, forcereload = False):
    """Return the source of the trailers page found at self.trailerUrl.

    The value fetched with read_url_utf8 is stored in self.trailerSource and
    handed back on later calls. A fetch happens when nothing truthy is stored
    yet, or when forcereload is true.
    """
    cached = self.trailerSource
    if cached and not forcereload:
      return cached
    # Nothing usable stored (or a reload was requested): fetch and remember it.
    self.trailerSource = read_url_utf8(self.trailerUrl)
    return self.trailerSource
  def parseTrailers(self):
    """Collect external links from the trailers page.

    Parses the page returned by getTrailers() with BeautifulSoup and, for
    every <p> that contains at least one <a> and for which p.firstText() is
    truthy, inspects the first <a>. Only links whose href starts with 'http'
    are kept.

    Returns a dict mapping href -> link title with 'www.' removed. When the
    anchor has no single string child (a.string is None), the href itself is
    used as the title.
    """
    ret = {}
    soup = BeautifulSoup(self.getTrailers())
    for p in soup('p'):
      anchors = p('a')
      if not anchors or not p.firstText():
        continue
      a = anchors[0]
      # a['href'] raises KeyError for anchors without an href attribute
      # (e.g. named anchors); .get() lets us skip them instead of crashing.
      href = a.get('href')
      if not href or not href.startswith('http'):
        continue
      title = a.string
      if title is None:
        # .string is None when the link wraps nested markup; fall back to
        # the URL so the entry is kept rather than raising AttributeError.
        title = href
      ret[href] = title.replace('www.', '')
    return ret
 def guess(title, director=''):
  #FIXME: proper file -> title
  title = title.split('-')[0]
--- a/scrapeit/yahootrailers.py
+++ b/scrapeit/yahootrailers.py
@ -0,0 +1,25 @@
 # -*- coding: utf-8 -*-
 # -*- Mode: Python; -*-
 # vi:si:et:sw=2:sts=2:ts=2
 from urllib import quote
 import re
 from BeautifulSoup import BeautifulSoup
 from utils import read_url, stripTags
 def trailerByTitle(title):
   """Look up a trailer URL on Yahoo! Movies for the given title.

   Searches movies.yahoo.com for the stripped title and takes the first link
   whose href matches the movie URL pattern. If that link's text contains the
   title, its href is used as the info page and the trailer URL is derived by
   replacing '/info' with '/video'. The trailer URL is only returned when the
   info page's content contains it.

   Returns the trailer URL, or '' when the title is empty, nothing matches,
   or no trailer URL can be derived and confirmed.
   """
   title = title.strip()
   if not title:
     # '' is a substring of every string, so an empty title would otherwise
     # "match" whatever the first search result happens to be.
     return ''
   url = "http://movies.yahoo.com/mv/search?p=%s" % quote(title)
   data = read_url(url)
   soup = BeautifulSoup(data)
   movies = soup('a', {'href': re.compile('http://movies.yahoo.com/movie.*?')})
   if not movies:
     return ''
   first_text = movies[0].firstText()
   if not first_text or title not in first_text:
     return ''
   info = movies[0]['href']
   trailer = info.replace('/info', '/video')
   if trailer == info:
     # No '/info' segment to rewrite: the containment check below would only
     # confirm that the info page mentions its own URL, not a trailer.
     return ''
   data = read_url(info)
   if trailer in data:
     return trailer
   return ''