This commit is contained in:
j 2007-11-30 11:39:50 +00:00
parent 73ec7e7aeb
commit 482599169b
2 changed files with 49 additions and 3 deletions

View File

@ -110,7 +110,9 @@ class IMDb:
self.locationUrl = "%slocations" % self.pageUrl
self.externalreviewsSource = None
self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
self.trailerSource = None
self.trailerUrl = "%strailers" % self.pageUrl
def getPage(self, forcereload = False):
if forcereload or not self.pageSource:
self.pageSource = read_url_utf8(self.pageUrl)
@ -272,6 +274,7 @@ class IMDb:
IMDbDict['release_date'] = self.parseReleaseinfo()
IMDbDict['business'] = self.parseBusiness()
IMDbDict['reviews'] = self.parseExternalreviews()
#IMDbDict['trailers'] = self.parseTrailers()
self.IMDbDict = IMDbDict
if IMDbDict['episode_of']:
@ -474,7 +477,7 @@ class IMDb:
if forcereload or not self.externalreviewsSource:
self.externalreviewsSource = read_url_utf8(self.externalreviewsUrl)
return self.externalreviewsSource
def parseExternalreviews(self):
soup = BeautifulSoup(self.getExternalreviews())
ol = soup('ol')
@ -491,7 +494,25 @@ class IMDb:
pass
return ret
return {}
def getTrailers(self, forcereload = False):
  """Return the raw source of the trailers page, fetching it on demand.

  The page at ``self.trailerUrl`` is downloaded with ``read_url_utf8`` and
  remembered in ``self.trailerSource``.  A remembered, non-empty source is
  reused unless ``forcereload`` is true.
  """
  cached = self.trailerSource
  if cached and not forcereload:
    return cached
  # Nothing usable cached (or a reload was requested): fetch and remember.
  self.trailerSource = read_url_utf8(self.trailerUrl)
  return self.trailerSource
def parseTrailers(self):
  """Collect external trailer links from the trailers page.

  Parses the markup returned by ``self.getTrailers()`` and looks at every
  ``<p>`` element that contains some text and at least one ``<a>``.  Only
  the first anchor of each such paragraph is considered, and only when its
  ``href`` starts with ``http``.

  Returns a dict mapping each accepted href to its link text with every
  ``'www.'`` occurrence removed; the dict is empty when nothing qualifies.
  """
  ret = {}
  soup = BeautifulSoup(self.getTrailers())
  for p in soup('p'):
    anchors = p('a')
    if not anchors or not p.firstText():
      continue
    a = anchors[0]
    # Use .get(): anchors without an href (e.g. <a name="...">) would make
    # a['href'] raise KeyError and abort the whole parse.
    href = a.get('href')
    if not href or not href.startswith('http'):
      continue
    # a.string is only set when the anchor consists of a single text node;
    # fall back to the first nested text (or '') instead of crashing on None.
    title = a.string or a.firstText() or ''
    ret[href] = title.replace('www.', '')
  return ret
def guess(title, director=''):
#FIXME: proper file -> title
title = title.split('-')[0]

25
scrapeit/yahootrailers.py Normal file
View File

@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
# -*- Mode: Python; -*-
# vi:si:et:sw=2:sts=2:ts=2
from urllib import quote
import re
from BeautifulSoup import BeautifulSoup
from utils import read_url, stripTags
def trailerByTitle(title):
  """Look up a movie on Yahoo! Movies and return its trailer page URL.

  Searches movies.yahoo.com for ``title`` and takes the first result link
  whose text contains the title.  The trailer URL is derived from that link
  by replacing ``'/info'`` with ``'/video'`` and is returned only when the
  movie's info page references it.

  Returns the trailer URL, or ``''`` when the title is empty, no result
  matches, or no trailer link can be confirmed.
  """
  title = (title or '').strip()
  if not title:
    # '' is a substring of every string, so an empty title would "match"
    # whatever movie the search happens to list first.
    return ''
  # Python 2 urllib.quote raises KeyError on non-ASCII unicode input, so
  # hand it UTF-8 encoded bytes; the original title is kept for matching.
  query = title if isinstance(title, str) else title.encode('utf-8')
  url = "http://movies.yahoo.com/mv/search?p=%s" % quote(query)
  soup = BeautifulSoup(read_url(url))
  movies = soup('a', {'href': re.compile('http://movies.yahoo.com/movie.*?')})
  if not movies:
    return ''
  text = movies[0].firstText()
  if not text or title not in text:
    return ''
  info = movies[0]['href']
  trailer = info.replace('/info', '/video')
  if trailer == info:
    # The link had no '/info' part, so there is no derived trailer URL;
    # without this guard the info URL itself could be returned.
    return ''
  if trailer in read_url(info):
    return trailer
  return ''