# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import urllib2
from urllib import quote, unquote
import re
import os
import time

import chardet
import oxlib
from oxlib import stripTags, decodeHtml, findRe, findString
import oxlib.cache
from oxlib.normalize import normalizeTitle, normalizeImdbId
from oxlib import *

import google


# Never time out IMDb data; to update the cache, remove the data from the
# cache folder.
def readUrlUnicode(url, data=None, headers=oxlib.cache.DEFAULT_HEADERS, timeout=-1):
    '''
    Forward to oxlib.cache.readUrlUnicode with a default timeout of -1.

    All four arguments are passed through positionally and the result of
    oxlib.cache.readUrlUnicode is returned unchanged.
    '''
    return oxlib.cache.readUrlUnicode(url, data, headers, timeout)


# Example: check if a result is valid while updating the cache.
#
#   def validate(result, header):
#       return header['status'] == u'200'
#
#   try:
#       d = oxlib.cache.readUrlUnicode(url, data, headers, timeout=0, valid=validate)
#   except oxlib.cache.InvalidResult, e:
#       print e.headers


def getMovieId(title, director='', year=''):
    '''
    Search google for an IMDb title page and return its id.

    title is quoted in the query; when year is given it is appended to the
    title as "title (year)", and when director is given it is added to the
    query in front of the quoted title.

    Returns the 7 characters that follow 'tt' in the first result url that
    starts with 'http://www.imdb.com/title/tt', or '' if no result matches.

    >>> getMovieId('The Matrix')
    '0133093'
    '''
    title_url_prefix = 'http://www.imdb.com/title/tt'
    id_length = 7

    if year:
        title = "%s (%s)" % (title, year)
    if director:
        query = 'site:imdb.com %s "%s"' % (director, title)
    else:
        query = 'site:imdb.com "%s"' % title

    # NOTE(review): the 3 is presumably the maximum number of results
    # requested from google.find -- confirm against the google module.
    for (_name, url, _desc) in google.find(query, 3, timeout=-1):
        if url.startswith(title_url_prefix):
            start = len(title_url_prefix)
            return url[start:start + id_length]
    return ''


def getMovieData(imdbId):
    '''
    Return IMDb(imdbId).parse().

    NOTE(review): IMDb is not defined in this part of the file; it is
    presumably defined further down or provided by "from oxlib import *".
    '''
    return IMDb(imdbId).parse()


# internal functions below
def getUrlBase(imdbId):
    '''
    Return the base url of the IMDb title page for imdbId,
    i.e. "http://www.imdb.com/title/tt<imdbId>/".
    '''
    return "http://www.imdb.com/title/tt%s/" % imdbId


def getRawMovieData(imdbId):
    # Collect the results of the individual getMovie* helpers into one dict,
    # starting from the dict returned by getMovieInfo.
    imdbId = normalizeImdbId(imdbId)
    data = getMovieInfo(imdbId)
    data['credits'] = getMovieCredits(imdbId)
    data['poster'] = getMoviePoster(imdbId)
    data['company credits'] = getMovieCompanyCredits(imdbId)
    data['filming locations'] = getMovieLocations(imdbId)
    data['movie connections'] = getMovieConnections(imdbId)
    data['external reviews'] = getMovieExternalReviews(imdbId)
    data['trivia'] = getMovieTrivia(imdbId)
    data['keywords'] = getMovieKeywords(imdbId)
    data['media'] = {}
    data['media']['images'] = getMovieImages(imdbId)
    data['media']['trailers'] = getMovieTrailers(imdbId)
    data['plotsummary'] = getMoviePlot(imdbId)
    # NOTE(review): the remaining statements of this function (release dates
    # and "return data") sit on the next physical line of the file, which is
    # still collapsed and has to be re-indented to rejoin this body.
data['release dates'] = getMovieReleaseDates(imdbId) data['release date'] = getMovieReleaseDate(imdbId) return data def getMovieInfo(imdbId, timeout=-1): data = readUrlUnicode(getUrlBase(imdbId), timeout=timeout) info = dict() info['poster'] = findRe(data, 'name="poster".*?(.*?):(.*?)