trailer
This commit is contained in:
parent
73ec7e7aeb
commit
482599169b
2 changed files with 49 additions and 3 deletions
|
@ -110,7 +110,9 @@ class IMDb:
|
||||||
self.locationUrl = "%slocations" % self.pageUrl
|
self.locationUrl = "%slocations" % self.pageUrl
|
||||||
self.externalreviewsSource = None
|
self.externalreviewsSource = None
|
||||||
self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
|
self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
|
||||||
|
self.trailerSource = None
|
||||||
|
self.trailerUrl = "%strailers" % self.pageUrl
|
||||||
|
|
||||||
def getPage(self, forcereload = False):
|
def getPage(self, forcereload = False):
|
||||||
if forcereload or not self.pageSource:
|
if forcereload or not self.pageSource:
|
||||||
self.pageSource = read_url_utf8(self.pageUrl)
|
self.pageSource = read_url_utf8(self.pageUrl)
|
||||||
|
@ -272,6 +274,7 @@ class IMDb:
|
||||||
IMDbDict['release_date'] = self.parseReleaseinfo()
|
IMDbDict['release_date'] = self.parseReleaseinfo()
|
||||||
IMDbDict['business'] = self.parseBusiness()
|
IMDbDict['business'] = self.parseBusiness()
|
||||||
IMDbDict['reviews'] = self.parseExternalreviews()
|
IMDbDict['reviews'] = self.parseExternalreviews()
|
||||||
|
#IMDbDict['trailers'] = self.parseTrailers()
|
||||||
self.IMDbDict = IMDbDict
|
self.IMDbDict = IMDbDict
|
||||||
|
|
||||||
if IMDbDict['episode_of']:
|
if IMDbDict['episode_of']:
|
||||||
|
@ -474,7 +477,7 @@ class IMDb:
|
||||||
if forcereload or not self.externalreviewsSource:
|
if forcereload or not self.externalreviewsSource:
|
||||||
self.externalreviewsSource = read_url_utf8(self.externalreviewsUrl)
|
self.externalreviewsSource = read_url_utf8(self.externalreviewsUrl)
|
||||||
return self.externalreviewsSource
|
return self.externalreviewsSource
|
||||||
|
|
||||||
def parseExternalreviews(self):
|
def parseExternalreviews(self):
|
||||||
soup = BeautifulSoup(self.getExternalreviews())
|
soup = BeautifulSoup(self.getExternalreviews())
|
||||||
ol = soup('ol')
|
ol = soup('ol')
|
||||||
|
@ -491,7 +494,25 @@ class IMDb:
|
||||||
pass
|
pass
|
||||||
return ret
|
return ret
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
def getTrailers(self, forcereload = False):
  """
  Return the source of the trailers page, fetching it only when needed.

  The page is read from self.trailerUrl via read_url_utf8 and kept in
  self.trailerSource. A stored, non-empty page is returned as-is unless
  forcereload is true.
  """
  cached = self.trailerSource
  if cached and not forcereload:
    return cached
  # nothing usable stored (or a reload was requested): fetch and remember
  self.trailerSource = read_url_utf8(self.trailerUrl)
  return self.trailerSource
|
||||||
|
|
||||||
|
def parseTrailers(self):
  """
  Collect the external links listed on the trailers page.

  Every <p> element of the page returned by self.getTrailers() is
  inspected; for paragraphs that contain both a link and some text, the
  first <a> is taken. Only links whose href starts with 'http' are kept.

  Returns a dict mapping the link's href to its text with any 'www.'
  removed. The dict is empty when no matching link is found.
  """
  ret = {}
  soup = BeautifulSoup(self.getTrailers())
  for p in soup('p'):
    links = p('a')
    if not links or not p.firstText():
      continue
    a = links[0]
    try:
      href = a['href']
    except KeyError:
      # an anchor without an href attribute is not a usable link
      continue
    if not href or not href.startswith('http'):
      continue
    title = a.string
    if title is None:
      # .string can be None (e.g. the anchor wraps other tags rather than
      # plain text); skip instead of failing on None.replace
      continue
    ret[href] = title.replace('www.', '')
  return ret
|
||||||
|
|
||||||
def guess(title, director=''):
|
def guess(title, director=''):
|
||||||
#FIXME: proper file -> title
|
#FIXME: proper file -> title
|
||||||
title = title.split('-')[0]
|
title = title.split('-')[0]
|
||||||
|
|
25
scrapeit/yahootrailers.py
Normal file
25
scrapeit/yahootrailers.py
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# -*- Mode: Python; -*-
|
||||||
|
# vi:si:et:sw=2:sts=2:ts=2
|
||||||
|
from urllib import quote
|
||||||
|
import re
|
||||||
|
|
||||||
|
from BeautifulSoup import BeautifulSoup
|
||||||
|
|
||||||
|
from utils import read_url, stripTags
|
||||||
|
|
||||||
|
|
||||||
|
def trailerByTitle(title):
  """
  Look up the Yahoo! Movies video page for a movie title.

  Searches movies.yahoo.com for the title and takes the first result link
  whose href points at a movies.yahoo.com/movie... page. If that link's
  text contains the title, the link's '/info' part is replaced by '/video'
  and the resulting URL is returned, provided it actually occurs in the
  info page.

  Returns the video URL as a string, or '' when nothing suitable is found
  (including when the title is empty or only whitespace).
  """
  title = title.strip()
  if not title:
    # '' is a substring of every link text, so an empty query would
    # otherwise accept whatever movie happens to be listed first
    return ''
  url = "http://movies.yahoo.com/mv/search?p=%s" % quote(title)
  soup = BeautifulSoup(read_url(url))
  # dots are escaped so that only the literal host name matches
  movies = soup('a', {'href': re.compile(r'http://movies\.yahoo\.com/movie')})
  if not movies:
    return ''
  first = movies[0]
  text = first.firstText()
  if not text or title not in text:
    return ''
  info = first['href']
  trailer = info.replace('/info', '/video')
  if trailer == info:
    # no '/info' in the link: nothing was derived, and the info URL itself
    # must not be reported as a trailer
    return ''
  if trailer in read_url(info):
    return trailer
  return ''
|
||||||
|
|
Loading…
Reference in a new issue