python-ox/ox/web/imdb.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import urllib2
import urllib
import re
import os
import time

import ox
from ox import findRe, stripTags
from ox.normalize import normalizeTitle, normalizeImdbId
import ox.cache
from ox.cache import readUrl

from siteparser import SiteParser
import google

def readUrl(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None):
    '''
    Fetch url through ox.cache.readUrl using a private copy of headers.

    url     - address to load
    data    - passed on unchanged to ox.cache.readUrl
    headers - mapping of request headers; it is copied first so the
              object handed in (by default ox.cache.DEFAULT_HEADERS)
              is not the one given to ox.cache.readUrl
    timeout - passed on unchanged to ox.cache.readUrl
    valid   - accepted for signature compatibility, currently not
              forwarded to ox.cache.readUrl
    '''
    request_headers = headers.copy()
    return ox.cache.readUrl(url, data, request_headers, timeout)

def readUrlUnicode(url, timeout=ox.cache.cache_timeout):
    '''
    Fetch url via ox.cache.readUrlUnicode, telling it to use this
    module's readUrl (passed as _readUrl) for the actual request.
    '''
    return ox.cache.readUrlUnicode(url, timeout=timeout, _readUrl=readUrl)

class Imdb(SiteParser):
    '''
    Metadata of a single IMDb title, read from akas.imdb.com.

    The instance is used as a mapping (self[key], self.get, "in", del).
    ``regex`` describes, per key, which sub-page of the title the value
    comes from ('page'), how it is extracted ('re': a single pattern or a
    list of patterns and callables) and what kind of value is expected
    ('type').  The table is presumably evaluated by SiteParser.__init__;
    __init__ below then post-processes the extracted values.

    >>> Imdb('0068646')['title']
    u'The Godfather'

    >>> Imdb('0133093')['title']
    u'The Matrix'
    '''
    regex =  {
        'alternative_titles': {
            'page': 'releaseinfo',
            're': [
                'name="akas".*?<table.*?>(.*?)</table>',
                "td>(.*?)</td>.*?<td>(.*?)</td>"
            ],
            'type': 'list'
        },
        'cast': {
            'page': 'combined',
            're': [
                '<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
                lambda ll: [stripTags(l) for l in ll]
            ],
            'type': 'list'
        },
        'cinematographers': {
            'page': 'combined',
            're': [
                # drop everything from the 'Series Crew' section on
                lambda data: data.split('Series Crew')[0],
                'Cinematography by</a>(.*?)</table>',
                '<a href="/name/.*?/">(.*?)</a>'
            ],
            'type': 'list'
        },
        'connections': {
            'page': 'movieconnections',
            're': '<h5>(.*?)</h5>(.*?)\n\n',
            'type': 'list'
        },
        'countries': {
            'page': 'combined',
            're': [
                '<div class="info"><h5>Country:</h5>.*?<div class="info">',
                #'<a href="/country/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
                '<a.*?>(.*?)</a>',
            ],
            'type': 'list'
        },
        'creators': {
            'page': 'combined',
            're': [
                '<h5>Creators:</h5>.*?<div class="info-content">(.*?)</div>',
                '<a href="/name/.*?>(.*?)</a>'
            ],
            'type': 'list'
        },
        'directors': {
            'page': 'combined',
            're': [
                lambda data: data.split('Series Crew')[0],
                'Directed by</a>(.*?)</table>',
                '<a href="/name/.*?>(.*?)</a>'
            ],
            'type': 'list'
        },
        'editors': {
            'page': 'combined',
            're': [
                lambda data: data.split('Series Crew')[0],
                'Film Editing by</a>(.*?)</table>',
                '<a href="/name/.*?>(.*?)</a>'
            ],
            'type': 'list'
        },
        'episode_title': {
            'page': 'combined',
            're': '<div id="tn15title">.*?<em>(.*?)</em>',
            'type': 'string'
        },
        'filming_locations': {
            'page': 'locations',
            're': '<a href="/search/title\?locations=.*?">(.*?)</a>',
            'type': 'list'
        },
        'genres': {
            'page': 'combined',
            're': '<a href="/Sections/Genres/.*?/">(.*?)</a>',
            'type': 'list'
        },
        'keywords': {
            'page': 'keywords',
            're': '<a href="/keyword/.*?/">(.*?)</a>',
            'type': 'list'
        },
        'languages': {
            'page': 'combined',
            're': [
                '<div class="info"><h5>Language:</h5>.*?<div class="info">',
                #'<a href="/language/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
                '<a.*?>(.*?)</a>',
            ],
            'type': 'list'
        },
        'plot': {
            'page': 'plotsummary',
            're': '</div>.*?<p class="plotpar">(.*?)<i>',
            'type': 'string'
        },
        'poster_id': {
            'page': 'combined',
            're': '/primary-photo/media/rm(.*?)/tt',
            'type': 'string'
        },
        'poster_ids': {
            'page': 'posters',
            're': '/unknown-thumbnail/media/rm(.*?)/tt',
            'type': 'list'
        },
        'producers': {
            'page': 'combined',
            're': [
                lambda data: data.split('Series Crew')[0],
                'Produced by</a>(.*?)</table>',
                '<a href="/name/.*?/">(.*?)</a>'
            ],
            'type': 'list'
        },
        'rating': {
            'page': 'combined',
            're': '<div class="starbar-meta">.*?<b>([\d,.]+?)/10</b>',
            'type': 'float'
        },
        # NOTE: unlike the other keys this one contains a space
        'release date': {
            'page': 'releaseinfo',
            're': '<a href="/date/(\d{2})-(\d{2})/">.*?</a> <a href="/year/(\d{4})/">',
            'type': 'date'
        },
        'reviews': {
            'page': 'externalreviews',
            're': [
                '<ol>(.*?)</ol>',
                '<li><a href="(http.*?)".*?>(.*?)</a></li>'
            ],
            'type': 'list'
        },
        'runtime': {
            'page': 'combined',
            're': '<h5>Runtime:</h5><div class="info-content">.*?([0-9]+ sec|[0-9]+ min).*?</div>',
            'type': 'string'
        },
        'season': {
            'page': 'combined',
            're': [
                # only take season/episode from the original air date
                '<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
                '\(Season (\d+), Episode \d+\)',
            ],
            'type': 'int'
        },
        'episode': {
            'page': 'combined',
            're': [
                '<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
                '\(Season \d+, Episode (\d+)\)',
            ],
            'type': 'int'
        },
        'series': {
            'page': 'combined',
            're': '<h5>TV Series:</h5>.*?<a href="/title/tt(\d{7})',
            'type': 'string'
        },
        'original_title': {
            'page': 'combined',
            're': '<h1>(.*?) <span>',
            'type': 'string'
        },
        'trivia': {
            'page': 'trivia',
            're': '<div class="sodatext">(.*?)<br>',
            'type': 'list',
        },
        'votes': {
            'page': 'combined',
            're': '<a href="ratings" class="tn15more">([\d,]*?) votes</a>',
            'type': 'string'
        },
        'writers': {
            'page': 'combined',
            're': [
                lambda data: data.split('Series Crew')[0],
                'Writing credits</a>(.*?)</table>',
                '<a href="/name/.*?/">(.*?)</a>'
            ],
            'type': 'list'
        },
        'year': {
            'page': 'combined',
            're': '="og:title" content=".*?\((\d{4})\).*?"',
            'type': 'int'
        }
    }

    def readUrlUnicode(self, url, timeout):
        '''Read url with the module level readUrlUnicode.'''
        return readUrlUnicode(url, timeout)

    def __init__(self, id, timeout=-1):
        '''
        Load the title with the given id and normalize the parsed values.

        id      - IMDb id, inserted after "tt" into the title url
        timeout - handed to SiteParser.__init__ and to the lookup of the
                  parent series, so both use the same setting

        A title for which neither an english nor an original title was
        extracted fails on the self['original_title'] lookup.
        '''
        #use akas.imdb.com to always get original title:
        #http://www.imdb.com/help/show_leaf?titlelanguagedisplay
        self.baseUrl = "http://akas.imdb.com/title/tt%s/" % id
        super(Imdb, self).__init__(timeout)

        def is_international_title(t):
            # t[1] is the description column of an alternative title row
            description = t[1].lower()
            if 'working title' in description: return False
            if 'complete title' in description: return False
            if description == 'usa': return True
            if 'international' in description: return True
            return False
        ititle = filter(is_international_title, self.get('alternative_titles', []))
        if ititle:
            self['english_title'] = ititle[0][0]

        # Only fall back to 'original_title' when there is no english
        # title, so a missing original title does not break titles that
        # do have an english one.
        if 'english_title' in self:
            self['title'] = self['english_title']
        else:
            self['title'] = self['original_title']

        # strip the double quotes wrapped around some titles
        if 'title' in self and self['title'].startswith('"') and self['title'].endswith('"'):
            self['title'] = self['title'][1:-1]
        if 'runtime' in self and self['runtime']:
            # runtime is parsed as "<n> min" or "<n> sec"; store seconds
            if 'min' in self['runtime']: base=60
            else: base=1
            self['runtime'] = int(findRe(self['runtime'], '([0-9]+)')) * base
        if 'runtime' in self and not self['runtime']:
            del self['runtime']
        if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
        if 'connections' in self:
            cc={}
            # a lone (relation, html) pair is wrapped so the loop below
            # always iterates over pairs
            if len(self['connections']) == 2 and isinstance(self['connections'][0], basestring):
                self['connections'] = [self['connections']]
            for rel, data in self['connections']:
                # map each relation name to the title ids linked below it
                cc[unicode(rel)] = re.compile('<a href="/title/tt(\d{7})/">').findall(data)
            self['connections'] = cc

        for key in ('countries', 'genres'):
            if key in self:
                # the link patterns also pick up a 'Home' link; drop it
                self[key] = filter(lambda x: x.lower() != 'home', self[key])

        # series creators are reported as directors
        if 'creators' in self:
            self['directors'] = self['creators']
            del self['creators']
        if 'series' in self:
            if 'episode_title' in self:
                self['series_title'] = self['title']
                self['title'] = "%s: %s" % (self['series_title'], self['episode_title'])
            if 'episode_title' in self and 'season' in self and 'episode' in self:
                self['title'] = "%s (S%02dE%02d) %s" % (
                        self['series_title'], self['season'], self['episode'], self['episode_title'])
            # keep the episode's own values before they are replaced by
            # those of the series
            for key in ('directors', 'year'):
                if key in self:
                    self['episode_%s'%key] = self[key]
            # use the caller's timeout for the series lookup as well
            series = Imdb(self['series'], timeout=timeout)
            for key in ['directors', 'year']:
                if key in series:
                    self[key] =series[key]
        else:
            # episode related keys only make sense for titles in a series
            for key in ('series_title', 'episode_title', 'season', 'episode'):
                if key in self:
                    del self[key]

class ImdbCombined(Imdb):
    '''
    Imdb variant restricted to the regex entries whose 'page' is
    'combined' (presumably so that a single page request is enough for
    the title itself).
    '''
    def __init__(self, id, timeout=-1):
        # shadow the class level table with the filtered one on this instance
        combined_only = dict(
            (name, spec) for name, spec in self.regex.items()
            if spec['page'] == 'combined'
        )
        self.regex = combined_only
        super(ImdbCombined, self).__init__(id, timeout)

def getMovieId(title, director='', year='', timeout=-1):
    '''
    Search akas.imdb.com for a title and return the id of the best match.

    The query is the title alone, the quoted title followed by the
    director, or - when a year is given - the quoted "title (year)"
    followed by the director.  Returns the 7 digit id of the page the
    search led to, else of the first listed result, else ''.

    >>> getMovieId('The Matrix')
    u'0133093'

    >>> getMovieId('2 or 3 Things I Know About Her', 'Jean-Luc Godard')
    u'0060304'

    >>> getMovieId('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967')
    u'0060304'

    >>> getMovieId(u"Histoire(s) du cinema: Le controle de l'univers", 'Jean-Luc Godard')
    u'0179214'

    >>> getMovieId(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard')
    u'0179214'
    '''
    query = title
    if director:
        query = u'"%s" %s' % (title, director)
    if year:
        query = u'"%s (%s)" %s' % (title, year, director)
    url = "http://akas.imdb.com/find?" + urllib.urlencode({'s': 'tt', 'q': query.encode('utf-8')})

    page = readUrlUnicode(url, timeout=timeout)
    patterns = (
        #if search results in redirect, get id of current page
        '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />',
        #otherwise get first result
        '<td valign="top">.*?<a href="/title/tt(\d{7})/"',
    )
    for pattern in patterns:
        found = re.findall(pattern, page)
        if found:
            return found[0]
    #or nothing
    return ''

def getMoviePoster(imdbId):
    '''
    Return the poster image url for the title with the given IMDb id.

    When the title has a poster_id, the primary photo page is read and
    the src of its "primary-img" image is extracted with findRe.  A title
    without poster_id that belongs to a series uses the poster of that
    series; otherwise '' is returned.

    >>> getMoviePoster('0133093')
    'http://ia.media-imdb.com/images/M/MV5BMjEzNjg1NTg2NV5BMl5BanBnXkFtZTYwNjY3MzQ5._V1._SX338_SY475_.jpg'

    >>> getMoviePoster('0994352')
    'http://ia.media-imdb.com/images/M/MV5BMjA3NzMyMzU1MV5BMl5BanBnXkFtZTcwNjc1ODUwMg@@._V1._SX594_SY755_.jpg'
    '''
    info = ImdbCombined(imdbId)
    if 'poster_id' not in info:
        if 'series' in info:
            # no poster of its own: use the one of the series
            return getMoviePoster(info['series'])
        return ''
    url = "http://www.imdb.com/rg/action-box-title/primary-photo/media/rm%s/tt%s" % (info['poster_id'], imdbId)
    return findRe(readUrl(url), 'img id="primary-img".*?src="(.*?)"')

def guess(title, director='', timeout=-1):
    '''Return getMovieId for title and director, without a year.'''
    return getMovieId(title, director=director, timeout=timeout)

if __name__ == "__main__":
    # manual check: dump everything parsed for one title as JSON
    import json
    info = Imdb('0306414')
    print(json.dumps(info, indent=2))
    #print json.dumps(Imdb('0133093'), indent=2)
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`# -- coding: utf-8 --`
			`# vi:si:et:sw=4:sts=4:ts=4`
			`import urllib2`
use imdb for search 2010-12-31 07:23:28 +00:00			`import urllib`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`import re`
			`import os`
			`import time`

			`import ox`
more imdb refinement 2010-07-10 08:24:56 +00:00			`from ox import findRe, stripTags`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`from ox.normalize import normalizeTitle, normalizeImdbId`
use cookie to get us titles from imdb 2010-10-08 16:07:39 +00:00			`import ox.cache`
poster urls 2010-07-19 10:05:01 +00:00			`from ox.cache import readUrl`
add ox.web to this repos 2010-07-07 23:25:57 +00:00
			`from siteparser import SiteParser`
			`import google`

use cookie to get us titles from imdb 2010-10-08 16:07:39 +00:00			`def readUrl(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None):`
			`headers = headers.copy()`
			`return ox.cache.readUrl(url, data, headers, timeout)`

			`def readUrlUnicode(url, timeout=ox.cache.cache_timeout):`
			`return ox.cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)`
more imdb refinement 2010-07-10 08:24:56 +00:00
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`class Imdb(SiteParser):`
use akas.imdb.com 2010-12-09 03:37:28 +00:00			`'''`
			`>>> Imdb('0068646')['title']`
			`u'The Godfather'`

			`>>> Imdb('0133093')['title']`
			`u'The Matrix'`
			`'''`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`regex = {`
more imdb cleanup, add alternative_titles 2010-07-09 08:54:06 +00:00			`'alternative_titles': {`
			`'page': 'releaseinfo',`
			`'re': [`
			`'name="akas".?<table.?>(.*?)</table>',`
use aka titles 2010-10-08 15:43:25 +00:00			`"td>(.?)</td>.?<td>(.*?)</td>"`
more imdb cleanup, add alternative_titles 2010-07-09 08:54:06 +00:00			`],`
			`'type': 'list'`

			`},`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'cast': {`
dates, reduce number of imdb pages loaded 2010-07-10 11:54:33 +00:00			`'page': 'combined',`
more imdb refinement 2010-07-10 08:24:56 +00:00			`'re': [`
			`'<td class="nm">.?>(.?)</a>.?<td class="char">(.?)</td>',`
			`lambda ll: [stripTags(l) for l in ll]`
			`],`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'type': 'list'`
			`},`
			`'cinematographers': {`
dates, reduce number of imdb pages loaded 2010-07-10 11:54:33 +00:00			`'page': 'combined',`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'re': [`
more imdb refinement 2010-07-10 08:24:56 +00:00			`lambda data: data.split('Series Crew')[0],`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'Cinematography by</a>(.*?)</table>',`
			`'<a href="/name/.?/">(.?)</a>'`
			`],`
			`'type': 'list'`
			`},`
			`'connections': {`
			`'page': 'movieconnections',`
			`'re': '<h5>(.?)</h5>(.?)\n\n',`
			`'type': 'list'`
			`},`
			`'countries': {`
			`'page': 'combined',`
country links 2010-12-23 06:00:53 +00:00			`'re': [`
			`'<div class="info"><h5>Country:</h5>.*?<div class="info">',`
			`#'<a href="/country/.?">(.?)</a>', #links changed to work with existing caches, just take all links`
			`'<a.?>(.?)</a>',`
			`],`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'type': 'list'`
			`},`
series creators as directors 2010-11-28 15:53:47 +00:00			`'creators': {`
			`'page': 'combined',`
			`'re': [`
			`'<h5>Creators:</h5>.?<div class="info-content">(.?)</div>',`
			`'<a href="/name/.?>(.?)</a>'`
			`],`
			`'type': 'list'`
			`},`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'directors': {`
dates, reduce number of imdb pages loaded 2010-07-10 11:54:33 +00:00			`'page': 'combined',`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'re': [`
more imdb refinement 2010-07-10 08:24:56 +00:00			`lambda data: data.split('Series Crew')[0],`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'Directed by</a>(.*?)</table>',`
series creators as directors 2010-11-28 15:53:47 +00:00			`'<a href="/name/.?>(.?)</a>'`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`],`
			`'type': 'list'`
			`},`
			`'editors': {`
dates, reduce number of imdb pages loaded 2010-07-10 11:54:33 +00:00			`'page': 'combined',`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'re': [`
more imdb refinement 2010-07-10 08:24:56 +00:00			`lambda data: data.split('Series Crew')[0],`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'Film Editing by</a>(.*?)</table>',`
series creators as directors 2010-11-28 15:53:47 +00:00			`'<a href="/name/.?>(.?)</a>'`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`],`
			`'type': 'list'`
			`},`
seasons 2010-07-12 08:52:26 +00:00			`'episode_title': {`
			`'page': 'combined',`
			`'re': '<div id="tn15title">.?<em>(.?)</em>',`
			`'type': 'string'`
			`},`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'filming_locations': {`
			`'page': 'locations',`
			`'re': '<a href="/search/title\?locations=.?">(.?)</a>',`
			`'type': 'list'`
			`},`
			`'genres': {`
			`'page': 'combined',`
			`'re': '<a href="/Sections/Genres/.?/">(.?)</a>',`
			`'type': 'list'`
			`},`
			`'keywords': {`
			`'page': 'keywords',`
			`'re': '<a href="/keyword/.?/">(.?)</a>',`
			`'type': 'list'`
			`},`
			`'languages': {`
			`'page': 'combined',`
language too 2010-12-23 06:05:06 +00:00			`'re': [`
			`'<div class="info"><h5>Language:</h5>.*?<div class="info">',`
			`#'<a href="/language/.?">(.?)</a>', #links changed to work with existing caches, just take all links`
			`'<a.?>(.?)</a>',`
			`],`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'type': 'list'`
			`},`
			`'plot': {`
			`'page': 'plotsummary',`
more imdb cleanup, add alternative_titles 2010-07-09 08:54:06 +00:00			`'re': '</div>.?<p class="plotpar">(.?)<i>',`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'type': 'string'`
			`},`
			`'poster_id': {`
			`'page': 'combined',`
			`'re': '/primary-photo/media/rm(.*?)/tt',`
poster urls 2010-07-19 10:05:01 +00:00			`'type': 'string'`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`},`
			`'poster_ids': {`
			`'page': 'posters',`
			`'re': '/unknown-thumbnail/media/rm(.*?)/tt',`
			`'type': 'list'`
			`},`
			`'producers': {`
			`'page': 'combined',`
			`'re': [`
producers need series crew cropping too 2010-07-10 18:20:48 +00:00			`lambda data: data.split('Series Crew')[0],`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'Produced by</a>(.*?)</table>',`
			`'<a href="/name/.?/">(.?)</a>'`
			`],`
			`'type': 'list'`
			`},`
			`'rating': {`
			`'page': 'combined',`
fix rating 2011-01-06 10:49:58 +00:00			`'re': '<div class="starbar-meta">.*?<b>([\d,.]+?)/10</b>',`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'type': 'float'`
			`},`
seasons 2010-07-12 08:52:26 +00:00			`'release date': {`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'page': 'releaseinfo',`
			`'re': '<a href="/date/(\d{2})-(\d{2})/">.*?</a> <a href="/year/(\d{4})/">',`
			`'type': 'date'`
			`},`
add reviews 2010-07-08 08:59:15 +00:00			`'reviews': {`
			`'page': 'externalreviews',`
			`'re': [`
			`'<ol>(.*?)</ol>',`
			`'<li><a href="(http.?)".?>(.*?)</a></li>'`
			`],`
			`'type': 'list'`
			`},`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'runtime': {`
			`'page': 'combined',`
			`'re': '<h5>Runtime:</h5><div class="info-content">.?([0-9]+ sec\|[0-9]+ min).?</div>',`
			`'type': 'string'`
			`},`
seasons 2010-07-12 08:52:26 +00:00			`'season': {`
			`'page': 'combined',`
only take season/episode from original air date 2010-07-13 09:28:55 +00:00			`'re': [`
			`'<h5>Original Air Date:</h5>.?<div class="info-content">(.?)</div>',`
			`'\(Season (\d+), Episode \d+\)',`
			`],`
seasons 2010-07-12 08:52:26 +00:00			`'type': 'int'`
			`},`
			`'episode': {`
			`'page': 'combined',`
only take season/episode from original air date 2010-07-13 09:28:55 +00:00			`'re': [`
			`'<h5>Original Air Date:</h5>.?<div class="info-content">(.?)</div>',`
			`'\(Season \d+, Episode (\d+)\)',`
			`],`
seasons 2010-07-12 08:52:26 +00:00			`'type': 'int'`
			`},`
			`'series': {`
			`'page': 'combined',`
			`'re': '<h5>TV Series:</h5>.*?<a href="/title/tt(\d{7})',`
			`'type': 'string'`
			`},`
use akas.imdb.com 2010-12-09 03:37:28 +00:00			`'original_title': {`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'page': 'combined',`
			`'re': '<h1>(.*?) <span>',`
			`'type': 'string'`
			`},`
			`'trivia': {`
			`'page': 'trivia',`
			`'re': '<div class="sodatext">(.*?)<br>',`
			`'type': 'list',`
			`},`
			`'votes': {`
			`'page': 'combined',`
more imdb refinement 2010-07-10 08:24:56 +00:00			`'re': '<a href="ratings" class="tn15more">([\d,]*?) votes</a>',`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'type': 'string'`
			`},`
			`'writers': {`
dates, reduce number of imdb pages loaded 2010-07-10 11:54:33 +00:00			`'page': 'combined',`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'re': [`
more imdb refinement 2010-07-10 08:24:56 +00:00			`lambda data: data.split('Series Crew')[0],`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'Writing credits</a>(.*?)</table>',`
			`'<a href="/name/.?/">(.?)</a>'`
			`],`
			`'type': 'list'`
			`},`
			`'year': {`
			`'page': 'combined',`
name->property 2010-12-07 18:29:53 +00:00			`'re': '="og:title" content=".?\((\d{4})\).?"',`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'type': 'int'`
			`}`
			`}`

use cookie to get us titles from imdb 2010-10-08 16:07:39 +00:00			`def readUrlUnicode(self, url, timeout):`
			`return readUrlUnicode(url, timeout)`

seasons 2010-07-12 08:52:26 +00:00			`def __init__(self, id, timeout=-1):`
use akas.imdb.com 2010-12-09 03:37:28 +00:00			`#use akas.imdb.com to always get original title:`
			`#http://www.imdb.com/help/show_leaf?titlelanguagedisplay`
			`self.baseUrl = "http://akas.imdb.com/title/tt%s/" % id`
seasons 2010-07-12 08:52:26 +00:00			`super(Imdb, self).__init__(timeout)`
add ox.web to this repos 2010-07-07 23:25:57 +00:00
use akas.imdb.com 2010-12-09 03:37:28 +00:00			`def is_international_title(t):`
no working title 2010-12-25 11:22:50 +00:00			`if 'working title' in t[1].lower(): return False`
ignore complete title 2010-12-28 17:00:45 +00:00			`if 'complete title' in t[1].lower(): return False`
promotional abbreviation 2011-02-21 10:13:40 +00:00			`if t[1].lower() =='usa': return True`
use akas.imdb.com 2010-12-09 03:37:28 +00:00			`if 'international' in t[1].lower(): return True`
			`return False`
alternative title 2010-12-23 09:09:22 +00:00			`ititle = filter(is_international_title, self.get('alternative_titles', []))`
use akas.imdb.com 2010-12-09 03:37:28 +00:00			`if ititle:`
			`self['english_title'] = ititle[0][0]`

			`self['title'] = self.get('english_title', self['original_title'])`

seasons 2010-07-12 08:52:26 +00:00			`if 'title' in self and self['title'].startswith('"') and self['title'].endswith('"'):`
			`self['title'] = self['title'][1:-1]`
imdb parser fixes 2010-07-08 08:03:57 +00:00			`if 'runtime' in self and self['runtime']:`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`if 'min' in self['runtime']: base=60`
			`else: base=1`
			`self['runtime'] = int(findRe(self['runtime'], '([0-9]+)')) * base`
more imdb refinement 2010-07-10 08:24:56 +00:00			`if 'runtime' in self and not self['runtime']:`
			`del self['runtime']`
more imdb cleanup, add alternative_titles 2010-07-09 08:54:06 +00:00			`if 'votes' in self: self['votes'] = self['votes'].replace(',', '')`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`if 'connections' in self:`
			`cc={}`
imdb parser fixes 2010-07-08 08:03:57 +00:00			`if len(self['connections']) == 2 and isinstance(self['connections'][0], basestring):`
			`self['connections'] = [self['connections']]`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`for rel, data in self['connections']:`
			`cc[unicode(rel)] = re.compile('<a href="/title/tt(\d{7})/">').findall(data)`
			`self['connections'] = cc`

more imdb cleanup, add alternative_titles 2010-07-09 08:54:06 +00:00			`for key in ('countries', 'genres'):`
more imdb refinement 2010-07-10 08:24:56 +00:00			`if key in self:`
			`self[key] = filter(lambda x: x.lower() != 'home', self[key])`
more imdb cleanup, add alternative_titles 2010-07-09 08:54:06 +00:00
series creators as directors 2010-11-28 15:53:47 +00:00			`if 'creators' in self:`
			`self['directors'] = self['creators']`
			`del self['creators']`
seasons 2010-07-12 08:52:26 +00:00			`if 'series' in self:`
			`if 'episode_title' in self:`
			`self['series_title'] = self['title']`
			`self['title'] = "%s: %s" % (self['series_title'], self['episode_title'])`
			`if 'episode_title' in self and 'season' in self and 'episode' in self:`
			`self['title'] = "%s (S%02dE%02d) %s" % (`
			`self['series_title'], self['season'], self['episode'], self['episode_title'])`
series creators as directors 2010-11-28 15:53:47 +00:00			`for key in ('directors', 'year'):`
name->property 2010-12-07 18:29:53 +00:00			`if key in self:`
			`self['episode_%s'%key] = self[key]`
series creators as directors 2010-11-28 15:53:47 +00:00			`series = Imdb(self['series'])`
			`for key in ['directors', 'year']:`
name->property 2010-12-07 18:29:53 +00:00			`if key in series:`
			`self[key] =series[key]`
seasons 2010-07-12 08:52:26 +00:00			`else:`
			`for key in ('series_title', 'episode_title', 'season', 'episode'):`
			`if key in self:`
			`del self[key]`
more imdb cleanup, add alternative_titles 2010-07-09 08:54:06 +00:00
somtimes its better to just make one imdb request 2010-07-18 18:24:36 +00:00			`class ImdbCombined(Imdb):`
			`def __init__(self, id, timeout=-1):`
			`_regex = {}`
			`for key in self.regex:`
use cookie to get us titles from imdb 2010-10-08 16:07:39 +00:00			`if self.regex[key]['page'] == 'combined':`
somtimes its better to just make one imdb request 2010-07-18 18:24:36 +00:00			`_regex[key] = self.regex[key]`
			`self.regex = _regex`
			`super(ImdbCombined, self).__init__(id, timeout)`

use imdb for search 2010-12-31 07:23:28 +00:00			`def getMovieId(title, director='', year='', timeout=-1):`
fix criterion 2010-07-18 18:57:22 +00:00			`'''`
			`>>> getMovieId('The Matrix')`
make sure tests work again, fix to32 2010-09-03 21:19:19 +00:00			`u'0133093'`

			`>>> getMovieId('2 or 3 Things I Know About Her', 'Jean-Luc Godard')`
			`u'0060304'`

			`>>> getMovieId('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967')`
			`u'0060304'`
use imdb for search 2010-12-31 07:23:28 +00:00
update tests 2011-02-08 07:13:23 +00:00			`>>> getMovieId(u"Histoire(s) du cinema: Le controle de l'univers", 'Jean-Luc Godard')`
use imdb for search 2010-12-31 07:23:28 +00:00			`u'0179214'`

			`>>> getMovieId(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard')`
			`u'0179214'`
fix criterion 2010-07-18 18:57:22 +00:00			`'''`
use imdb for search 2010-12-31 07:23:28 +00:00			`params = {'s':'tt','q': title}`
fix criterion 2010-07-18 18:57:22 +00:00			`if director:`
unicode 2011-02-08 07:20:57 +00:00			`params['q'] = u'"%s" %s' % (title, director)`
make sure tests work again, fix to32 2010-09-03 21:19:19 +00:00			`if year:`
unicode 2011-02-08 07:20:57 +00:00			`params['q'] = u'"%s (%s)" %s' % (title, year, director)`
			`params['q'] = params['q'].encode('utf-8')`
use imdb for search 2010-12-31 07:23:28 +00:00			`params = urllib.urlencode(params)`
			`url = "http://akas.imdb.com/find?" + params`
			`#print url`

			`data = readUrlUnicode(url, timeout=timeout)`
			`#if search results in redirect, get id of current page`
			`r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />'`
			`results = re.compile(r).findall(data)`
			`if results:`
			`return results[0]`
			`#otherwise get first result`
			`r = '<td valign="top">.*?<a href="/title/tt(\d{7})/"'`
			`results = re.compile(r).findall(data)`
			`if results:`
			`return results[0]`
			`#or nothing`
fix criterion 2010-07-18 18:57:22 +00:00			`return ''`

poster urls 2010-07-19 10:05:01 +00:00			`def getMoviePoster(imdbId):`
get series poster for episodes 2010-09-17 08:46:37 +00:00			`'''`
			`>>> getMoviePoster('0133093')`
			`'http://ia.media-imdb.com/images/M/MV5BMjEzNjg1NTg2NV5BMl5BanBnXkFtZTYwNjY3MzQ5._V1._SX338_SY475_.jpg'`

			`>>> getMoviePoster('0994352')`
			`'http://ia.media-imdb.com/images/M/MV5BMjA3NzMyMzU1MV5BMl5BanBnXkFtZTcwNjc1ODUwMg@@._V1._SX594_SY755_.jpg'`
			`'''`
poster urls 2010-07-19 10:05:01 +00:00			`info = ImdbCombined(imdbId)`
			`if 'poster_id' in info:`
			`url = "http://www.imdb.com/rg/action-box-title/primary-photo/media/rm%s/tt%s" % (info['poster_id'], imdbId)`
			`data = readUrl(url)`
			`poster = findRe(data, 'img id="primary-img".?src="(.?)"')`
			`return poster`
get series poster for episodes 2010-09-17 08:46:37 +00:00			`elif 'series' in info:`
			`return getMoviePoster(info['series'])`
poster urls 2010-07-19 10:05:01 +00:00			`return ''`

use imdb for search 2010-12-31 07:23:28 +00:00			`def guess(title, director='', timeout=-1):`
			`return getMovieId(title, director, timeout=timeout)`
add ox.web to this repos 2010-07-07 23:25:57 +00:00
			`if __name__ == "__main__":`
			`import json`
			`print json.dumps(Imdb('0306414'), indent=2)`
			`#print json.dumps(Imdb('0133093'), indent=2)`