Reduce parsing time drastically by updating encoding detection and removing BeautifulSoup
This commit is contained in:
parent
5cdd4ed63b
commit
8886cfe8d3
1 changed file with 20 additions and 18 deletions
38
ox/imdb.py
38
ox/imdb.py
|
@ -18,6 +18,18 @@ from oxutils.normalize import normalizeTitle, normalizeImdbId
|
||||||
|
|
||||||
import google
|
import google
|
||||||
|
|
||||||
|
_timer = -1
|
||||||
|
_timer_last = -1
|
||||||
|
def debugTime(message=''):
|
||||||
|
global _timer, _timer_last
|
||||||
|
if _timer == -1:
|
||||||
|
_timer = time.time()
|
||||||
|
if _timer_last == -1:
|
||||||
|
_timer_last = time.time()
|
||||||
|
now = time.time()
|
||||||
|
print message," since last: %0.2f total time: %0.2f" % (now-_timer_last, now-_timer)
|
||||||
|
_timer_last = now
|
||||||
|
|
||||||
def getMovieId(title, director='', year=''):
|
def getMovieId(title, director='', year=''):
|
||||||
'''
|
'''
|
||||||
>>> getMovieId('The Matrix')
|
>>> getMovieId('The Matrix')
|
||||||
|
@ -46,7 +58,6 @@ def getRawMovieData(imdbId):
|
||||||
data = getMovieInfo(imdbId)
|
data = getMovieInfo(imdbId)
|
||||||
data['credits'] = getMovieCredits(imdbId)
|
data['credits'] = getMovieCredits(imdbId)
|
||||||
data['poster'] = getMoviePoster(imdbId)
|
data['poster'] = getMoviePoster(imdbId)
|
||||||
data['connections'] = getMovieConnections(imdbId)
|
|
||||||
data['company credits'] = getMovieCompanyCredits(imdbId)
|
data['company credits'] = getMovieCompanyCredits(imdbId)
|
||||||
data['filming locations'] = getMovieLocations(imdbId)
|
data['filming locations'] = getMovieLocations(imdbId)
|
||||||
data['movie connections'] = getMovieConnections(imdbId)
|
data['movie connections'] = getMovieConnections(imdbId)
|
||||||
|
@ -272,28 +283,20 @@ def getMovieTrivia(imdbId):
|
||||||
|
|
||||||
def getMovieConnections(imdbId):
|
def getMovieConnections(imdbId):
|
||||||
url = "%s/movieconnections" % getUrlBase(imdbId)
|
url = "%s/movieconnections" % getUrlBase(imdbId)
|
||||||
data = getUrlUnicode(url)
|
data = getUrl(url)
|
||||||
soup = BeautifulSoup(data)
|
connections={}
|
||||||
connections = {}
|
for c in re.compile('''<h5>(.*?)</h5>(.*?)\n\n''', re.DOTALL).findall(data):
|
||||||
content = soup('div', {'id': 'tn15content'})[0]
|
connections[unicode(c[0])] = re.compile('''<a href="/title/tt(\d{7})/">''').findall(c[1])
|
||||||
blocks = unicode(content).split('<h5>')[1:]
|
|
||||||
for c in blocks:
|
|
||||||
connection = c.split('</h5>')[0]
|
|
||||||
cs = BeautifulSoup(c)
|
|
||||||
if connection:
|
|
||||||
#relation -> list of imdb ids
|
|
||||||
connections[connection] = [findRe(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})]
|
|
||||||
return connections
|
return connections
|
||||||
|
|
||||||
def getMovieKeywords(imdbId):
|
def getMovieKeywords(imdbId):
|
||||||
url = "%s/keywords" % getUrlBase(imdbId)
|
url = "%s/keywords" % getUrlBase(imdbId)
|
||||||
data = getUrlUnicode(url)
|
data = getUrlUnicode(url)
|
||||||
soup = BeautifulSoup(data)
|
|
||||||
keywords = []
|
keywords = []
|
||||||
for key in soup('a', {'href': re.compile('^/keyword/')}):
|
for keyword in re.compile('''<a.*?href="/keyword.*?>(.*?)</a>''').findall(data):
|
||||||
k = decodeHtml(key.string)
|
keyword = decodeHtml(keyword)
|
||||||
k = k.replace(u'\xa0', ' ')
|
keyword = keyword.replace(u'\xa0', ' ')
|
||||||
keywords.append(k)
|
keywords.append(keyword)
|
||||||
return keywords
|
return keywords
|
||||||
|
|
||||||
def getMovieExternalReviews(imdbId):
|
def getMovieExternalReviews(imdbId):
|
||||||
|
@ -591,7 +594,6 @@ class IMDb:
|
||||||
IMDbDict['credits'] = self.getCredits()
|
IMDbDict['credits'] = self.getCredits()
|
||||||
IMDbDict['plot'] = getMoviePlot(self.imdb)
|
IMDbDict['plot'] = getMoviePlot(self.imdb)
|
||||||
IMDbDict['keywords'] = getMovieKeywords(self.imdb)
|
IMDbDict['keywords'] = getMovieKeywords(self.imdb)
|
||||||
|
|
||||||
IMDbDict['trivia'] = getMovieTrivia(self.imdb)
|
IMDbDict['trivia'] = getMovieTrivia(self.imdb)
|
||||||
IMDbDict['connections'] = getMovieConnections(self.imdb)
|
IMDbDict['connections'] = getMovieConnections(self.imdb)
|
||||||
IMDbDict['locations'] = getMovieLocations(self.imdb)
|
IMDbDict['locations'] = getMovieLocations(self.imdb)
|
||||||
|
|
Loading…
Reference in a new issue