trailer

2007-11-30 11:39:50 +00:00 · 2007-11-30 11:39:50 +00:00 · 482599169b
commit 482599169b
parent 73ec7e7aeb
2 changed files with 49 additions and 3 deletions
--- a/scrapeit/imdb.py
+++ b/scrapeit/imdb.py
@ -110,6 +110,8 @@ class IMDb:
    self.locationUrl = "%slocations" % self.pageUrl
    self.externalreviewsSource = None
    self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
+    self.trailerSource = None
+    self.trailerUrl = "%strailers" % self.pageUrl
    
  def getPage(self, forcereload = False):
    if forcereload or not self.pageSource:
@ -272,6 +274,7 @@ class IMDb:
    IMDbDict['release_date'] = self.parseReleaseinfo()
    IMDbDict['business'] = self.parseBusiness()
    IMDbDict['reviews'] = self.parseExternalreviews()
+    #IMDbDict['trailers'] = self.parseTrailers()
    self.IMDbDict = IMDbDict
    
    if IMDbDict['episode_of']:
@ -492,6 +495,24 @@ class IMDb:
      return ret
    return {}
  
+  def getTrailers(self, forcereload = False):
+    """
+    Return the source of the trailers page, fetching it on demand.
+
+    The page at self.trailerUrl is downloaded with read_url_utf8 when
+    forcereload is true or when self.trailerSource holds nothing truthy;
+    the result is stored in self.trailerSource. Otherwise the cached
+    value is returned unchanged.
+    """
+    cached = self.trailerSource
+    if not forcereload and cached:
+      return cached
+    self.trailerSource = read_url_utf8(self.trailerUrl)
+    return self.trailerSource
+  
+  def parseTrailers(self):
+    """
+    Map external trailer links on the trailers page to their link text.
+
+    Every <p> that has some text and at least one <a> contributes its
+    first anchor, provided that anchor's href starts with 'http'.
+    Returns a dict of href -> link text with any 'www.' removed, or {}
+    when no paragraph qualifies.
+    """
+    ret = {}
+    soup = BeautifulSoup(self.getTrailers())
+    for p in soup('p'):
+      anchors = p('a')
+      if not anchors or not p.firstText():
+        continue
+      a = anchors[0]
+      # a['href'] raises KeyError for anchors without an href attribute
+      # (e.g. <a name="...">); get() lets such anchors be skipped instead.
+      href = a.get('href')
+      if not href or not href.startswith('http'):
+        continue
+      # a.string can be None when the anchor wraps nested markup such as
+      # <a><b>site</b></a>; fall back to the anchor's concatenated text
+      # rather than failing on None.replace().
+      title = a.string or ''.join(a.findAll(text=True))
+      ret[href] = title.replace('www.', '')
+    return ret
+
 def guess(title, director=''):
  #FIXME: proper file -> title
  title = title.split('-')[0]
--- a/scrapeit/yahootrailers.py
+++ b/scrapeit/yahootrailers.py
@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+# -*- Mode: Python; -*-
+# vi:si:et:sw=2:sts=2:ts=2
+from urllib import quote
+import re
+
+from BeautifulSoup import BeautifulSoup
+
+from utils import read_url, stripTags
+
+
+def trailerByTitle(title):
+  """
+  Search Yahoo! Movies for title and return the trailer URL of the first hit.
+
+  The first search result link is accepted only if its text contains
+  title. Its href has '/info' replaced by '/video', and that URL is
+  returned only if it occurs in the content fetched from the original
+  href. Returns '' in every other case, including an empty title.
+  """
+  title = title.strip()
+  if not title:
+    # '' is a substring of every link text, so an empty query would
+    # otherwise accept whichever movie happens to be listed first.
+    return ''
+  # urllib.quote raises KeyError on non-ASCII unicode input (Python 2),
+  # so give it UTF-8 bytes; the untouched title is kept for the text match.
+  query = title
+  if isinstance(query, unicode):
+    query = query.encode('utf-8')
+  url = "http://movies.yahoo.com/mv/search?p=%s" % quote(query)
+  data = read_url(url)
+  soup = BeautifulSoup(data)
+  movies = soup('a', {'href': re.compile('http://movies.yahoo.com/movie.*?')})
+  if not movies:
+    return ''
+  first = movies[0]
+  text = first.firstText()
+  if text and title in text:
+    info = first['href']
+    trailer = info.replace('/info', '/video')
+    # Only report the trailer URL if the info page actually links to it.
+    data = read_url(info)
+    if trailer in data:
+      return trailer
+  return ''
+