# -*- coding: utf-8 -*- # vi:si:et:sw=4:sts=4:ts=4 import urllib2 from urllib import quote, unquote import re import os import time from BeautifulSoup import BeautifulSoup import chardet import oxlib from oxlib import stripTags, decodeHtml, findRe, findString import oxlib.cache from oxlib.normalize import normalizeTitle, normalizeImdbId from oxlib import * import google ''' never timeout imdb data, to update cache remove data from cache folder ''' def readUrlUnicode(url, data=None, headers=oxlib.cache.DEFAULT_HEADERS, timeout=-1): return oxlib.cache.readUrlUnicode(url, data, headers, timeout) ''' check if result is valid while updating def validate(result, header): return header['status'] == u'200' try: d = oxlib.cache.readUrlUnicode(url, data, headers, timeout=0, valid=validate) except oxlib.cache.InvalidResult, e: print e.headers ''' def getMovieId(title, director='', year=''): ''' >>> getMovieId('The Matrix') '0133093' ''' if year: title = "%s (%s)" % (title, year) if director: query = 'site:imdb.com %s "%s"' % (director, title) else: query = 'site:imdb.com "%s"' % title for (name, url, desc) in google.find(query, 3, timeout=-1): if url.startswith('http://www.imdb.com/title/tt'): return url[28:35] return '' def getMovieData(imdbId): return IMDb(imdbId).parse() # internal functions below def getUrlBase(imdbId): return "http://www.imdb.com/title/tt%s/" % imdbId def getRawMovieData(imdbId): imdbId = normalizeImdbId(imdbId) data = getMovieInfo(imdbId) data['credits'] = getMovieCredits(imdbId) data['poster'] = getMoviePoster(imdbId) data['company credits'] = getMovieCompanyCredits(imdbId) data['filming locations'] = getMovieLocations(imdbId) data['movie connections'] = getMovieConnections(imdbId) data['external reviews'] = getMovieExternalReviews(imdbId) data['trivia'] = getMovieTrivia(imdbId) data['keywords'] = getMovieKeywords(imdbId) data['media'] = {} data['media']['images'] = getMovieImages(imdbId) data['media']['trailers'] = getMovieTrailers(imdbId) data['plotsummary'] = getMoviePlot(imdbId) data['release dates'] = getMovieReleaseDates(imdbId) data['release date'] = getMovieReleaseDate(imdbId) return data def getMovieInfo(imdbId, timeout=-1): data = readUrlUnicode(getUrlBase(imdbId), timeout=timeout) info = dict() info['poster'] = findRe(data, 'name="poster".*?(.*?):(.*?)
')[0] else: txt= i[1] txt = stripTags(txt).strip() def cleanUp(k): k = decodeHtml(k).replace(u'\xa0', ' ').strip() if k.endswith('more'): k=k[:-len('more')].strip() return k txt = cleanUp(txt) if title not in ('plot', 'trivia', 'filming locations', 'mpaa', 'tagline', 'original air date'): if '|' in txt: txt = [cleanUp(k) for k in txt.split('|')] elif ', ' in txt: txt = [cleanUp(k) for k in txt.split(', ')] elif title in ('country', 'language', 'genre'): txt = [cleanUp(txt), ] if title == 'tv series': info['series_imdb'] = findRe(i[1], 'tt(\d{7})') if title == 'original air date': info['series_episode_info'] = txt.split('\n')[-1].strip() txt = txt.split('\n')[0].strip() if not title.startswith('moviemeter'): info[title] = txt for key in ('user comments', 'writers (wga)', 'plot keywords'): if key in info: del info[key] if 'release date' in info: if isinstance(info['release date'], list): info['release date'] = info['release date'][0] info['release date'] = info['release date'].split('\n')[0] if 'plot' in info: info['plot'] = info['plot'].split('| add synopsis')[0].strip() info['plot'] = info['plot'].split('| full synopsis')[0].strip() if info['plot'] in ('add synopsis', 'full synopsis'): info['plot'] = '' #get Title title = '' year = '' html_title = findRe(data, '
(.*?)
') if not html_title: html_title = findRe(data, '(.*?)') else: html_title = html_title.split('')[0] if html_title: html_title = html_title.replace('
', ' ').replace(' ', ' ') title = stripTags(html_title) title = decodeHtml(title) year = findRe(title, '\((\d{4})\)') if not year: year = findRe(title, '\((\d{4})') _y = findRe(title, r'(\([0-9\?]{4}[/IVXLCDM]*?\))') if _y: title = title.replace(_y, '') for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'): title = title.replace(t, '') title = title.strip() if title.find(u'\xa0') > -1: title = title[:title.find(u'\xa0')].strip() if title.startswith('"') and title.endswith('"'): title = title[1:-1] info['title'] = normalizeTitle(title) info['year'] = year #Series if title.startswith('"') and title.find('"',1) > 0 and \ title.find('"',1) == title.rfind('"'): episode_title = title[title.rfind('"')+1:] episode_title = re.sub("\?{4}", "", episode_title).strip() episode_title = re.sub("\d{4}", "", episode_title).strip() if episode_title == '-': episode_title='' title = normalizeTitle(title[1:title.rfind('"')]) if episode_title: info['episode title'] = episode_title info['series title'] = title info['title'] = "%s: %s" % (title, episode_title) else: info['title'] = title se = re.compile("Season (\d*), Episode (\d*)\)").findall(info.get('series_episode_info', '')) if se: info['season'] = int(se[0][0]) info['episode'] = int(se[0][1]) info['title'] = "%s (S%02dE%02d) %s" % ( info['series title'], info['season'], info['episode'], info['episode title']) info['title'] = info['title'].strip() del info['series_episode_info'] #Rating rating = findRe(data, '([\d\.]*?)/10') if rating: info['rating'] = float(rating) else: info['rating'] = -1 #Votes info['votes'] = -1 if "user rating" in info: if isinstance(info['user rating'], list): info['user rating'] = ' '.join(info['user rating']) votes = findRe(info['user rating'], '([\d,]*?) votes') if votes: info['votes'] = int(votes.replace(',', '')) return info def getMovieRuntimeSeconds(imdbId): info = getMovieInfo(imdbId) if 'runtime' in info: value = info['runtime'][0] parsed_value = findRe(value, '(.*?) min') parsed_value = findRe(parsed_value, '([0-9]+)') if not parsed_value: parsed_value = findRe(value, '(.*?) sec') parsed_value = findRe(parsed_value, '([0-9]+)') if not parsed_value: parsed_value = 0 else: parsed_value = int(parsed_value) else: parsed_value = int(parsed_value) * 60 else: parsed_value = -1 return parsed_value def getMoviePoster(imdbId): info = getMovieInfo(imdbId) return info['poster'] def getMovieYear(imdbId): ''' >>> getMovieYear('0315404') u'1964' >>> getMovieYear('0734840') u'1990' >>> getMovieYear('0815352') u'1964' ''' info = getMovieInfo(imdbId) return info['year'] def getMovieTitle(imdbId): ''' >>> getMovieTitle('0306414') u'The Wire' >>> getMovieTitle('0734840') u'Twin Peaks (S01E02) Episode #1.2' >>> getMovieTitle('0734840') u'Twin Peaks (S01E02) Episode #1.2' >>> getMovieTitle('0749451') u'The Wire (S01E01) The Target' ''' info = getMovieInfo(imdbId) return info['title'] def getMovieAKATitles(imdbId): ''' >>> getMovieAKATitle('0040980') [(u'Frauen der Nacht', u'Germany'), (u'Les femmes de la nuit', u'France'), (u'Women of the Night', u'(undefined)')] ''' url = "%sreleaseinfo" % getUrlBase(imdbId) data = readUrlUnicode(url) titles = findRe(data, 'name="akas".*?(.*?)') titles = re.compile("td>(.*?)\n\n(.*)").findall(titles) return titles def creditList(data, section=None): if section == 'cast': credits_ = re.compile('''(.*?).*?(.*?)''').findall(data) else: credits_ = re.compile('''.*?(.*?)(.*?)''').findall(data) credits = [] for c_ in credits_: c = [stripTags(decodeHtml(c_[0]).strip()), stripTags(decodeHtml(c_[1]).strip())] if section=='writers': c[1] = c[1].replace('
', '').strip().replace(')', '').replace('(','') if c[1].endswith(' and'): c[1] = c[1][:-4] credits.append(c) return credits def getMovieCredits(imdbId): credits = dict() url = "%sfullcredits" % getUrlBase(imdbId) data = readUrlUnicode(url) groups = data.split('
') for g in groups: section = re.compile('''name="(.*?)".*? href="/Glossary''').findall(g) if section: credits[section[0]] = creditList(g, section[0]) return credits def getMovieTrailers(imdbId): url = "%strailers" % getUrlBase(imdbId) data = readUrlUnicode(url) soup = BeautifulSoup(data) videos = soup('div', {'class':"video-gallery"}) trailers = [] if videos: for a in videos[0]('a'): title = stripTags(unicode(a)).strip() url = 'http://www.imdb.com' + a['href'] videoId = findRe(url, '/(vi\d*?)/') iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId iframe = readUrlUnicode(iframeUrl) videoUrl = unquote(findRe(iframe, 'addVariable\("file", "(.*?)"')) trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl}) return trailers def getMovieQuotes(imdbId): url = "%squotes" % getUrlBase(imdbId) data = readUrlUnicode(url) quotes = re.compile('(.*?):(.*?)
', re.DOTALL).findall(findString(data, '(.*?)').split('

')[0] return plot.strip() def getMovieTechnical(imdbId): url = "%stechnical" % getUrlBase(imdbId) data = readUrlUnicode(url) results = {} for t in re.compile('
(.*?)
(.*?)
', re.DOTALL).findall(data): results[t[0].strip()] = t[1].strip() return results def getMovieCompanyCredits(imdbId): url = "%scompanycredits" % getUrlBase(imdbId) data = readUrlUnicode(url) results = {} for field, c in re.compile('

(.*?)

    (.*?)
').findall(data): results[field.strip()] = [] for company in re.compile('
  • (.*?)
  • ').findall(c): results[field.strip()].append(company) return results def getMovieLocations(imdbId): url = "%slocations" % getUrlBase(imdbId) data = readUrlUnicode(url) locations = re.compile('
    (.*?)') trivia = re.compile('
  • (.*?)
  • ', re.DOTALL).findall(data) def clean(t): t = decodeHtml(t) t = t.replace(u'”', '"') if t.endswith('

    '): t = t[:-8] if t.endswith('
    \n
    '): t = t[:-len('
    \n
    ')] return t.strip() trivia = [clean(t) for t in trivia] return trivia def getMovieConnections(imdbId): url = "%smovieconnections" % getUrlBase(imdbId) data = readUrlUnicode(url) connections={} for c in re.compile('''
    (.*?)
    (.*?)\n\n''', re.DOTALL).findall(data): connections[unicode(c[0])] = re.compile('''
    ''').findall(c[1]) return connections def getMovieKeywords(imdbId): url = "%skeywords" % getUrlBase(imdbId) data = readUrlUnicode(url) keywords = [] for keyword in re.compile('''(.*?)''').findall(data): keyword = decodeHtml(keyword) keyword = keyword.replace(u'\xa0', ' ') keywords.append(keyword) return keywords def getMovieExternalReviews(imdbId): url = "%sexternalreviews" % getUrlBase(imdbId) data = readUrlUnicode(url) data = findRe(data, '
      (.*?)
    ') _reviews = re.compile('
  • (.*?)
  • ').findall(data) reviews = {} for r in _reviews: reviews[r[0]] = r[1] return reviews def getMovieReleaseDate(imdbId): releasedates = getMovieReleaseDates(imdbId) first_release = None for r in releasedates: if not first_release or r[1] < first_release: first_release = r[1] return first_release def _parseDate(d): ''' >>>_parseDate('3 March 1972') '1972-03-03' ''' try: parsed_date = time.strptime(d, "%d %B %Y") parsed_date = '%s-%02d-%02d' % (parsed_date.tm_year, parsed_date.tm_mon, parsed_date.tm_mday) return parsed_date except: try: parsed_date = time.strptime(d, "%B %Y") parsed_date = '%s-%02d-01' % (parsed_date.tm_year, parsed_date.tm_mon) return parsed_date except: pass try: parsed_date = time.strptime(d, "%Y") parsed_date = '%s-01-01' % (parsed_date.tm_year) return parsed_date except: pass return d def getMovieReleaseDates(imdbId): url = "%sreleaseinfo" % getUrlBase(imdbId) data = readUrlUnicode(url) releasedates = [] regexp = '''(.*?).*?(.*?).*?(.*?)''' for r in re.compile(regexp, re.DOTALL).findall(data): r_ = (stripTags(r[0]).strip(), _parseDate(stripTags(r[1]).strip()), decodeHtml(stripTags(r[2]).strip())) releasedates.append(r_) return releasedates def getMovieBusinessSum(imdbId): business = getMovieBusiness(imdbId) b_ = {'budget': 0, 'gross': 0, 'profit': 0} if 'budget' in business: #b_['budget'] = sum([int(intValue(i.replace(',', ''))) for i in business['budget']]) budget = filter(lambda x: x.startswith('$'), business['budget']) if not budget: budget = business['budget'] b_['budget'] = int(intValue(budget[0].replace(',', ''))) if 'gross' in business: gross = filter(lambda x: x.startswith('$'), business['gross']) if gross: b_['gross'] = int(intValue(gross[0].replace(',', ''))) #b_['gross'] = sum([int(intValue(i.replace(',', ''))) for i in business['gross']]) #if 'weekend gross' in business: # b_['gross'] += sum([int(intValue(i.replace(',', ''))) for i in business['weekend gross']]) if b_['budget'] and b_['gross']: b_['profit'] = b_['gross'] - b_['budget'] return b_ def getMovieFlimingDates(imdbId): business = getMovieBusiness(imdbId) if 'filming dates' in business and business['filming dates']: return business['filming dates'][0] return '' def getMovieBusiness(imdbId): url = "%sbusiness" % getUrlBase(imdbId) data = readUrlUnicode(url) business = {} for r in re.compile('''
    (.*?)
    (.*?)
    .
    ''', re.DOTALL).findall(data): key = stripTags(r[0]).strip().lower() value = [decodeHtml(stripTags(b).strip()) for b in r[1].split('
    ')] business[key] = value return business def getMovieEpisodes(imdbId): url = "%sepisodes" % getUrlBase(imdbId) data = readUrlUnicode(url) episodes = {} regexp = r'''

    Season (.*?), Episode (.*?): (.*?)

    (.*?)
    (.*?)
    ''' for r in re.compile(regexp, re.DOTALL).findall(data): try: episode = "S%02dE%02d" % (int(r[0]), int(r[1])) episodes[episode] = {} episodes[episode]['imdb'] = r[2] episodes[episode]['title'] = r[3].strip() if episodes[episode]['title'].startswith('Episode #%d'%int(r[0])): episodes[episode]['title'] = u'' description = decodeHtml(r[5]) description = stripTags(description.split('Next US airings:')[0]) episodes[episode]['description'] = description.strip() episodes[episode]['date'] = '' try: d = stripTags(r[4]) d = d.replace('Original Air Date: ', '') d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y')) episodes[episode]['date'] = d except: pass except: import traceback print traceback.print_exc() pass return episodes '''the old code below''' class IMDb: def __init__(self, imdbId): self.imdb = imdbId self.pageUrl = getUrlBase(imdbId) def getPage(self): return readUrlUnicode(self.pageUrl) def parse_raw_value(self, key, value): if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'): value = stripTags(value).strip() if key == 'runtime': parsed_value = getMovieRuntimeSeconds(self.imdb) elif key in ('country', 'language'): parsed_value = value.split(' / ') if len(parsed_value) == 1: parsed_value = parsed_value[0].split(' | ') parsed_value = [v.strip() for v in parsed_value] elif key == 'genre': parsed_value = value.replace('more', '').strip().split(' / ') if len(parsed_value) == 1: parsed_value = parsed_value[0].split(' | ') parsed_value = [v.strip() for v in parsed_value] elif key == 'tagline': parsed_value = value.replace('more', '').strip() elif key == 'plot_outline': parsed_value = value.replace('(view trailer)', '').strip() if parsed_value.endswith('more'): parsed_value = parsed_value[:-4].strip() elif key == 'tv_series': m = re.compile('(.*?)').findall(value) if m: parsed_value = m[0][0] else: parsed_value = '' elif key == 'also_known_as': parsed_value = '' m = re.compile('(.*) \(International: English title').findall(value) if m: parsed_value = m[0] else: m = re.compile('(.*) \(USA').findall(value) if m: parsed_value = m[0] parsed_value = parsed_value.split('
    ')[-1].split('(')[0] director = self.getCredits().get('director', None) if director: director = director[0] parsed_value = parsed_value.replace(director, '') if parsed_value.startswith("'s"): parsed_value = parsed_value[2:].strip() parsed_value = decodeHtml(parsed_value.strip()) else: print value parsed_value = value return parsed_value def parseYear(self): return getMovieYear(self.imdb) def parse(self): data = self.getPage() IMDbDict ={} info = getMovieInfo(self.imdb) #Poster IMDbDict['poster'] = getMoviePoster(self.imdb) if not IMDbDict['poster']: IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif' #Title, Year IMDbDict['year'] = self.parseYear() IMDbDict['title'] = getMovieTitle(self.imdb) #Rating m = re.compile('(.*?)/10', re.IGNORECASE).search(data) if m: IMDbDict['rating'] = int(float(m.group(1)) * 1000) else: IMDbDict['rating'] = -1 #Votes IMDbDict['votes'] = info['votes'] data = data.replace('\n',' ') #some values keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as') for key in keys: IMDbDict[key] = '' IMDbDict['runtime'] = 0 soup = BeautifulSoup(data) for info in soup('div', {'class': 'info'}): key = unicode(info).split('
    ')[0].split('
    ') if len(key) > 1: raw_value = unicode(info).split('
    ')[1] key = key[1][:-1].lower().replace(' ', '_') if key in keys: IMDbDict[key] = self.parse_raw_value(key, raw_value) IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title']) #is episode IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '') IMDbDict['episodes'] = getMovieEpisodes(self.imdb) if IMDbDict['episodes']: IMDbDict['tvshow'] = True else: IMDbDict['tvshow'] = False IMDbDict['credits'] = self.getCredits() IMDbDict['plot'] = getMoviePlot(self.imdb) IMDbDict['keywords'] = getMovieKeywords(self.imdb) IMDbDict['trivia'] = getMovieTrivia(self.imdb) IMDbDict['connections'] = getMovieConnections(self.imdb) IMDbDict['locations'] = getMovieLocations(self.imdb) IMDbDict['release_date'] = getMovieReleaseDate(self.imdb) IMDbDict['business'] = getMovieBusinessSum(self.imdb) IMDbDict['reviews'] = getMovieExternalReviews(self.imdb) IMDbDict['stills'] = getMovieStills(self.imdb) #IMDbDict['trailer'] = getMovieTrailer(self.imdb) self.IMDbDict = IMDbDict if IMDbDict['episode_of']: episode_of = getMovieInfo(IMDbDict['episode_of']) for key in ('country', 'language'): if not IMDbDict[key]: IMDbDict[key] = episode_of[key] return self.IMDbDict def getCredits(self): raw_credits = getMovieCredits(self.imdb) credits = {} def getNames(creditList): return [stripTags(decodeHtml(c[0])) for c in creditList] credits['director'] = getNames(raw_credits.get('directors', '')) credits['writer'] = getNames(raw_credits.get('writers', '')) credits['producer'] = getNames(raw_credits.get('producers', '')) credits['cinematographer'] = getNames(raw_credits.get('cinematographers', '')) credits['editor'] = getNames(raw_credits.get('editors', '')) credits['cast'] = [(stripTags(decodeHtml(c[0])),stripTags(decodeHtml(c[1]))) for c in raw_credits.get('cast', [])] self.credits = credits return self.credits def guess(title, director=''): #FIXME: proper file -> title title = title.split('-')[0] title = title.split('(')[0] title = title.split('.')[0] title = title.strip() imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8')) return_url = '' #lest first try google #i.e. site:imdb.com Michael Stevens Sin if director: search = 'site:imdb.com %s "%s"' % (director, title) else: search = 'site:imdb.com "%s"' % title for (name, url, desc) in google.find(search, 2): if url.startswith('http://www.imdb.com/title/tt'): return normalizeImdbId(int(oxlib.intValue(url))) try: req = urllib2.Request(imdb_url, None, oxlib.net.DEFAULT_HEADERS) u = urllib2.urlopen(req) data = u.read() return_url = u.url u.close() except: return None if return_url.startswith('http://www.imdb.com/title/tt'): return return_url[28:35] if data: imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?
    1. .*?(.*?)') filmo = data.split(u'

      Additional Details

      ')[0] movies = {} for part in filmo.split(u'
      (.*?):') section = decodeHtml(section) movies[section] = re.compile(u'href="/title/tt(\d{7})/"').findall(part) info['movies'] = movies return info if __name__ == '__main__': import sys #print parse(sys.argv[1]) print "imdb:", guess(sys.argv[1])