python-ox/ox/web/imdb.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import urllib2
from urllib import quote, unquote
import re
import os
import time

import ox
from ox import findRe, stripTags
from ox.normalize import normalizeTitle, normalizeImdbId
from ox.cache import readUrl

from siteparser import SiteParser
import google


class Imdb(SiteParser):
    '''
    Parser for the pages of a single IMDb title.

    ``regex`` maps every result key to a rule made of
        page  name of the IMDb sub-page the rule is applied to
        re    a regular expression, or a list of steps (regular expressions
              and callables) -- presumably applied in order by SiteParser,
              whose implementation is not part of this file
        type  requested result type ('string', 'list', 'int', 'float', 'date')

    After SiteParser has filled in the values, __init__ normalizes some of
    them (see there).
    '''
    regex =  {
        'alternative_titles': {
            'page': 'releaseinfo',
            're': [
                'name="akas".*?<table.*?>(.*?)</table>',
                "td>(.*?)</td>\n\n<td>(.*?)</td>"
            ],
            'type': 'list'

        },
        'cast': {
            'page': 'combined',
            're': [
                '<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
                lambda ll: [stripTags(l) for l in ll]
             ],
            'type': 'list'
        },
        'cinematographers': {
            'page': 'combined',
            're': [
                # only look at the part of the page before 'Series Crew'
                lambda data: data.split('Series Crew')[0],
                'Cinematography by</a>(.*?)</table>',
                '<a href="/name/.*?/">(.*?)</a>'
            ],
            'type': 'list'
        },
        'connections': {
            'page': 'movieconnections',
            're': '<h5>(.*?)</h5>(.*?)\n\n',
            'type': 'list'
        },
        'countries': {
            'page': 'combined',
            're': '<a href="/Sections/Countries/.*?/">(.*?)</a>',
            'type': 'list'
        },
        'directors': {
            'page': 'combined',
            're': [
                lambda data: data.split('Series Crew')[0],
                'Directed by</a>(.*?)</table>',
                '<a href="/name/.*?/">(.*?)</a>'
            ],
            'type': 'list'
        },
        'editors': {
            'page': 'combined',
            're': [
                lambda data: data.split('Series Crew')[0],
                'Film Editing by</a>(.*?)</table>',
                '<a href="/name/.*?/">(.*?)</a>'
            ],
            'type': 'list'
        },
        'episode_title': {
            'page': 'combined',
            're': '<div id="tn15title">.*?<em>(.*?)</em>',
            'type': 'string'
        },
        'filming_locations': {
            'page': 'locations',
            're': r'<a href="/search/title\?locations=.*?">(.*?)</a>',
            'type': 'list'
        },
        'genres': {
            'page': 'combined',
            're': '<a href="/Sections/Genres/.*?/">(.*?)</a>',
            'type': 'list'
        },
        'keywords': {
            'page': 'keywords',
            're': '<a href="/keyword/.*?/">(.*?)</a>',
            'type': 'list'
        },
        'languages': {
            'page': 'combined',
            're': '<a href="/Sections/Languages/.*?/">(.*?)</a>',
            'type': 'list'
        },
        'original_title': {
            'page': 'combined',
            # the parentheses around "original title" are literal text and
            # have to be escaped, otherwise they form a second capture group
            # and the literal "(original title)" marker never matches
            're': r'<span class="title-extra">(.*?) <i>\(original title\)</i></span>',
            'type': 'string'
        },
        'plot': {
            'page': 'plotsummary',
            're': '</div>.*?<p class="plotpar">(.*?)<i>',
            'type': 'string'
        },
        'poster_id': {
            'page': 'combined',
            're': '/primary-photo/media/rm(.*?)/tt',
            'type': 'string'
        },
        'poster_ids': {
            'page': 'posters',
            're': '/unknown-thumbnail/media/rm(.*?)/tt',
            'type': 'list'
        },
        'producers': {
            'page': 'combined',
            're': [
                lambda data: data.split('Series Crew')[0],
                'Produced by</a>(.*?)</table>',
                '<a href="/name/.*?/">(.*?)</a>'
            ],
            'type': 'list'
        },
        'rating': {
            'page': 'combined',
            # "+?" so that multi character values like 8.7 are captured;
            # a plain "?" would only accept a single character before "/10"
            're': r'<div class="starbar-meta">.*?<b>([\d,.]+?)/10</b>',
            'type': 'float'
        },
        # NOTE: unlike the other keys this one contains a space; it is kept
        # as is because callers may already look it up under this name
        'release date': {
            'page': 'releaseinfo',
            're': r'<a href="/date/(\d{2})-(\d{2})/">.*?</a> <a href="/year/(\d{4})/">',
            'type': 'date'
        },
        'reviews': {
            'page': 'externalreviews',
            're': [
                '<ol>(.*?)</ol>',
                '<li><a href="(http.*?)".*?>(.*?)</a></li>'
            ],
            'type': 'list'
        },
        'runtime': {
            'page': 'combined',
            're': '<h5>Runtime:</h5><div class="info-content">.*?([0-9]+ sec|[0-9]+ min).*?</div>',
            'type': 'string'
        },
        'season': {
            'page': 'combined',
            're': [
                '<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
                r'\(Season (\d+), Episode \d+\)',
             ],
            'type': 'int'
        },
        'episode': {
            'page': 'combined',
            're': [
                '<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
                r'\(Season \d+, Episode (\d+)\)',
             ],
            'type': 'int'
        },
        'series': {
            'page': 'combined',
            're': r'<h5>TV Series:</h5>.*?<a href="/title/tt(\d{7})',
            'type': 'string'
        },
        'title': {
            'page': 'combined',
            're': '<h1>(.*?) <span>',
            'type': 'string'
        },
        'trivia': {
            'page': 'trivia',
            're': '<div class="sodatext">(.*?)<br>',
            'type': 'list',
        },
        'votes': {
            'page': 'combined',
            're': r'<a href="ratings" class="tn15more">([\d,]*?) votes</a>',
            'type': 'string'
        },
        'writers': {
            'page': 'combined',
            're': [
                lambda data: data.split('Series Crew')[0],
                'Writing credits</a>(.*?)</table>',
                '<a href="/name/.*?/">(.*?)</a>'
            ],
            'type': 'list'
        },
        'year': {
            'page': 'combined',
            're': r'<meta name="og:title" content=".*?\((\d{4})\).*?"',
            'type': 'int'
        }
    }

    def __init__(self, id, timeout=-1):
        '''
        id       IMDb id without the leading "tt", i.e. '0133093'
        timeout  handed to SiteParser.__init__ unchanged

        Values are collected by SiteParser.__init__ and then cleaned up:
        enclosing quotes are removed from the title, runtime is converted
        to seconds, votes lose their thousands separator, connections
        become a dict of relation -> list of ids, 'Home' is dropped from
        countries and genres, and episode titles are merged into the title.
        '''
        self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
        super(Imdb, self).__init__(timeout)

        # strip a pair of enclosing double quotes from the title
        if 'title' in self and self['title'].startswith('"') and self['title'].endswith('"'):
            self['title'] = self['title'][1:-1]
        if 'runtime' in self and self['runtime']:
            # the regex only lets "N min" or "N sec" through; store seconds
            if 'min' in self['runtime']:
                base = 60
            else:
                base = 1
            self['runtime'] = int(findRe(self['runtime'], '([0-9]+)')) * base
        # an empty runtime (also a runtime of 0) is removed altogether
        if 'runtime' in self and not self['runtime']:
            del self['runtime']
        if 'votes' in self:
            self['votes'] = self['votes'].replace(',', '')
        if 'connections' in self:
            cc = {}
            # a single (relation, html) result may arrive as one flat pair
            # of strings; wrap it so the loop always sees a list of pairs
            if len(self['connections']) == 2 and isinstance(self['connections'][0], basestring):
                self['connections'] = [self['connections']]
            title_link = re.compile(r'<a href="/title/tt(\d{7})/">')
            for rel, data in self['connections']:
                cc[unicode(rel)] = title_link.findall(data)
            self['connections'] = cc

        # the Sections links also match an entry called 'Home', drop it
        for key in ('countries', 'genres'):
            if key in self:
                self[key] = filter(lambda x: x.lower() != 'home', self[key])

        if 'series' in self:
            if 'episode_title' in self:
                # on episode pages 'title' holds the title of the series
                self['series_title'] = self['title']
                self['title'] = "%s: %s" % (self['series_title'], self['episode_title'])
            if 'episode_title' in self and 'season' in self and 'episode' in self:
                self['title'] = "%s (S%02dE%02d) %s" % (
                        self['series_title'], self['season'], self['episode'], self['episode_title'])
        else:
            # not part of a series, episode related keys make no sense
            for key in ('series_title', 'episode_title', 'season', 'episode'):
                if key in self:
                    del self[key]

class ImdbCombined(Imdb):
    '''
    Imdb limited to the rules that are evaluated on the 'combined' page.
    '''
    def __init__(self, id, timeout=-1):
        # shadow the class level rules with the 'combined' subset on this
        # instance before the parent constructor runs
        self.regex = dict(
            (name, rule) for name, rule in self.regex.items()
            if rule['page'] == 'combined'
        )
        super(ImdbCombined, self).__init__(id, timeout)

def getMovieId(title, director='', year=''):
    '''
    Search google (restricted to imdb.com) for title, optionally narrowed
    down by director and year, and return the id of the first result that
    is an IMDb title url, or '' if none of the results is one.

    >>> getMovieId('The Matrix')
    '0133093'
    '''
    title_url = 'http://www.imdb.com/title/tt'
    if year:
        title = "%s (%s)" % (title, year)
    query = ('site:imdb.com %s "%s"' % (director, title)) if director \
        else ('site:imdb.com "%s"' % title)
    for (_name, link, _desc) in google.find(query, 5, timeout=-1):
        if not link.startswith(title_url):
            continue
        # the 7 characters following the prefix are the id
        return link[len(title_url):len(title_url) + 7]
    return ''

def getMoviePoster(imdbId):
    '''
    Return the src of the primary poster image for imdbId,
    or '' if the combined page has no poster_id.
    '''
    info = ImdbCombined(imdbId)
    if 'poster_id' not in info:
        return ''
    # the poster itself is only referenced on the photo page
    photo_page = "http://www.imdb.com/rg/action-box-title/primary-photo/media/rm%s/tt%s" % (info['poster_id'], imdbId)
    return findRe(readUrl(photo_page), 'img id="primary-img".*?src="(.*?)"')

def _fetchSearchPage(url):
    '''
    Request url with ox.net.DEFAULT_HEADERS and return (data, url), where
    url is the one reported by the response (it differs from the requested
    one if the request got redirected). The response is always closed.
    '''
    req = urllib2.Request(url, None, ox.net.DEFAULT_HEADERS)
    u = urllib2.urlopen(req)
    try:
        return u.read(), u.url
    finally:
        u.close()


def guess(title, director='', timeout=google.DEFAULT_TIMEOUT):
    '''
    Guess the IMDb id for title (optionally with director).

    Tries, in this order: a google search restricted to imdb.com, the IMDb
    search (direct redirect to a title or first entry of "Popular Results")
    and the IMDb aka search. Returns the id as string or None if nothing
    was found or one of the IMDb requests failed.

    timeout is only used for the google search.
    '''
    title_url = 'http://www.imdb.com/title/tt'
    #FIXME: proper file -> title
    title = title.split('-')[0]
    title = title.split('(')[0]
    title = title.split('.')[0]
    title = title.strip()
    query = quote(title.encode('utf-8'))

    #let's first try google
    #i.e. site:imdb.com Michael Stevens "Sin"
    if director:
        search = 'site:imdb.com %s "%s"' % (director, title)
    else:
        search = 'site:imdb.com "%s"' % title
    for (name, url, desc) in google.find(search, 2, timeout=timeout):
        if url.startswith(title_url):
            return normalizeImdbId(int(ox.intValue(url)))

    # best effort: a failing request means "no guess"; Exception (instead
    # of a bare except) keeps KeyboardInterrupt and SystemExit alive
    try:
        data, return_url = _fetchSearchPage('http://www.imdb.com/find?q=%s' % query)
    except Exception:
        return None
    # a unique hit redirects straight to the title page
    if return_url.startswith(title_url):
        return return_url[28:35]
    if data:
        imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
        if imdb_id:
            return imdb_id

    # last resort: search alternative titles, handled like the request above
    try:
        data, return_url = _fetchSearchPage('http://www.imdb.com/find?q=%s;s=tt;site=aka' % query)
    except Exception:
        return None
    if return_url.startswith(title_url):
        return return_url[28:35]

    return None


if __name__ == "__main__":
    # manual check: dump everything found for one title as json
    import json
    info = Imdb('0306414')
    print(json.dumps(info, indent=2))
    #print json.dumps(Imdb('0133093'), indent=2)
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`# -*- coding: utf-8 -*-`
			`# vi:si:et:sw=4:sts=4:ts=4`
			`import urllib2`
			`from urllib import quote, unquote`
			`import re`
			`import os`
			`import time`

			`import ox`
more imdb refinement 2010-07-10 08:24:56 +00:00			`from ox import findRe, stripTags`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`from ox.normalize import normalizeTitle, normalizeImdbId`
poster urls 2010-07-19 10:05:01 +00:00			`from ox.cache import readUrl`
add ox.web to this repos 2010-07-07 23:25:57 +00:00
			`from siteparser import SiteParser`
			`import google`

more imdb refinement 2010-07-10 08:24:56 +00:00
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`class Imdb(SiteParser):`
			`regex = {`
more imdb cleanup, add alternative_titles 2010-07-09 08:54:06 +00:00			`'alternative_titles': {`
			`'page': 'releaseinfo',`
			`'re': [`
			`'name="akas".*?<table.*?>(.*?)</table>',`
			`"td>(.*?)</td>\n\n<td>(.*?)</td>"`
			`],`
			`'type': 'list'`

			`},`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'cast': {`
dates, reduce number of imdb pages loaded 2010-07-10 11:54:33 +00:00			`'page': 'combined',`
more imdb refinement 2010-07-10 08:24:56 +00:00			`'re': [`
			`'<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',`
			`lambda ll: [stripTags(l) for l in ll]`
			`],`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'type': 'list'`
			`},`
			`'cinematographers': {`
dates, reduce number of imdb pages loaded 2010-07-10 11:54:33 +00:00			`'page': 'combined',`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'re': [`
more imdb refinement 2010-07-10 08:24:56 +00:00			`lambda data: data.split('Series Crew')[0],`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'Cinematography by</a>(.*?)</table>',`
			`'<a href="/name/.*?/">(.*?)</a>'`
			`],`
			`'type': 'list'`
			`},`
			`'connections': {`
			`'page': 'movieconnections',`
			`'re': '<h5>(.*?)</h5>(.*?)\n\n',`
			`'type': 'list'`
			`},`
			`'countries': {`
			`'page': 'combined',`
			`'re': '<a href="/Sections/Countries/.*?/">(.*?)</a>',`
			`'type': 'list'`
			`},`
			`'directors': {`
dates, reduce number of imdb pages loaded 2010-07-10 11:54:33 +00:00			`'page': 'combined',`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'re': [`
more imdb refinement 2010-07-10 08:24:56 +00:00			`lambda data: data.split('Series Crew')[0],`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'Directed by</a>(.*?)</table>',`
			`'<a href="/name/.*?/">(.*?)</a>'`
			`],`
			`'type': 'list'`
			`},`
			`'editors': {`
dates, reduce number of imdb pages loaded 2010-07-10 11:54:33 +00:00			`'page': 'combined',`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'re': [`
more imdb refinement 2010-07-10 08:24:56 +00:00			`lambda data: data.split('Series Crew')[0],`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'Film Editing by</a>(.*?)</table>',`
			`'<a href="/name/.*?/">(.*?)</a>'`
			`],`
			`'type': 'list'`
			`},`
seasons 2010-07-12 08:52:26 +00:00			`'episode_title': {`
			`'page': 'combined',`
			`'re': '<div id="tn15title">.*?<em>(.*?)</em>',`
			`'type': 'string'`
			`},`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'filming_locations': {`
			`'page': 'locations',`
			`'re': '<a href="/search/title\?locations=.*?">(.*?)</a>',`
			`'type': 'list'`
			`},`
			`'genres': {`
			`'page': 'combined',`
			`'re': '<a href="/Sections/Genres/.*?/">(.*?)</a>',`
			`'type': 'list'`
			`},`
			`'keywords': {`
			`'page': 'keywords',`
			`'re': '<a href="/keyword/.*?/">(.*?)</a>',`
			`'type': 'list'`
			`},`
			`'languages': {`
			`'page': 'combined',`
			`'re': '<a href="/Sections/Languages/.*?/">(.*?)</a>',`
			`'type': 'list'`
			`},`
add original_title 2010-07-12 09:04:34 +00:00			`'original_title': {`
			`'page': 'combined',`
			`'re': '<span class="title-extra">(.*?) <i>(original title)</i></span>',`
			`'type': 'string'`
			`},`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'plot': {`
			`'page': 'plotsummary',`
more imdb cleanup, add alternative_titles 2010-07-09 08:54:06 +00:00			`'re': '</div>.*?<p class="plotpar">(.*?)<i>',`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'type': 'string'`
			`},`
			`'poster_id': {`
			`'page': 'combined',`
			`'re': '/primary-photo/media/rm(.*?)/tt',`
poster urls 2010-07-19 10:05:01 +00:00			`'type': 'string'`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`},`
			`'poster_ids': {`
			`'page': 'posters',`
			`'re': '/unknown-thumbnail/media/rm(.*?)/tt',`
			`'type': 'list'`
			`},`
			`'producers': {`
			`'page': 'combined',`
			`'re': [`
producers need series crew cropping too 2010-07-10 18:20:48 +00:00			`lambda data: data.split('Series Crew')[0],`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'Produced by</a>(.*?)</table>',`
			`'<a href="/name/.*?/">(.*?)</a>'`
			`],`
			`'type': 'list'`
			`},`
			`'rating': {`
			`'page': 'combined',`
more imdb refinement 2010-07-10 08:24:56 +00:00			`'re': '<div class="starbar-meta">.*?<b>([\d,.]?)/10</b>',`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'type': 'float'`
			`},`
seasons 2010-07-12 08:52:26 +00:00			`'release date': {`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'page': 'releaseinfo',`
			`'re': '<a href="/date/(\d{2})-(\d{2})/">.*?</a> <a href="/year/(\d{4})/">',`
			`'type': 'date'`
			`},`
add reviews 2010-07-08 08:59:15 +00:00			`'reviews': {`
			`'page': 'externalreviews',`
			`'re': [`
			`'<ol>(.*?)</ol>',`
			`'<li><a href="(http.*?)".*?>(.*?)</a></li>'`
			`],`
			`'type': 'list'`
			`},`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'runtime': {`
			`'page': 'combined',`
			`'re': '<h5>Runtime:</h5><div class="info-content">.*?([0-9]+ sec|[0-9]+ min).*?</div>',`
			`'type': 'string'`
			`},`
seasons 2010-07-12 08:52:26 +00:00			`'season': {`
			`'page': 'combined',`
only take season/episode from original air date 2010-07-13 09:28:55 +00:00			`'re': [`
			`'<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',`
			`'\(Season (\d+), Episode \d+\)',`
			`],`
seasons 2010-07-12 08:52:26 +00:00			`'type': 'int'`
			`},`
			`'episode': {`
			`'page': 'combined',`
only take season/episode from original air date 2010-07-13 09:28:55 +00:00			`'re': [`
			`'<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',`
			`'\(Season \d+, Episode (\d+)\)',`
			`],`
seasons 2010-07-12 08:52:26 +00:00			`'type': 'int'`
			`},`
			`'series': {`
			`'page': 'combined',`
			`'re': '<h5>TV Series:</h5>.*?<a href="/title/tt(\d{7})',`
			`'type': 'string'`
			`},`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'title': {`
			`'page': 'combined',`
			`'re': '<h1>(.*?) <span>',`
			`'type': 'string'`
			`},`
			`'trivia': {`
			`'page': 'trivia',`
			`'re': '<div class="sodatext">(.*?)<br>',`
			`'type': 'list',`
			`},`
			`'votes': {`
			`'page': 'combined',`
more imdb refinement 2010-07-10 08:24:56 +00:00			`'re': '<a href="ratings" class="tn15more">([\d,]*?) votes</a>',`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'type': 'string'`
			`},`
			`'writers': {`
dates, reduce number of imdb pages loaded 2010-07-10 11:54:33 +00:00			`'page': 'combined',`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'re': [`
more imdb refinement 2010-07-10 08:24:56 +00:00			`lambda data: data.split('Series Crew')[0],`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'Writing credits</a>(.*?)</table>',`
			`'<a href="/name/.*?/">(.*?)</a>'`
			`],`
			`'type': 'list'`
			`},`
			`'year': {`
			`'page': 'combined',`
more imdb cleanup, add alternative_titles 2010-07-09 08:54:06 +00:00			`'re': '<meta name="og:title" content=".*?\((\d{4})\).*?"',`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`'type': 'int'`
			`}`
			`}`

seasons 2010-07-12 08:52:26 +00:00			`def __init__(self, id, timeout=-1):`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`self.baseUrl = "http://www.imdb.com/title/tt%s/" % id`
seasons 2010-07-12 08:52:26 +00:00			`super(Imdb, self).__init__(timeout)`
add ox.web to this repos 2010-07-07 23:25:57 +00:00
seasons 2010-07-12 08:52:26 +00:00			`if 'title' in self and self['title'].startswith('"') and self['title'].endswith('"'):`
			`self['title'] = self['title'][1:-1]`
imdb parser fixes 2010-07-08 08:03:57 +00:00			`if 'runtime' in self and self['runtime']:`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`if 'min' in self['runtime']: base=60`
			`else: base=1`
			`self['runtime'] = int(findRe(self['runtime'], '([0-9]+)')) * base`
more imdb refinement 2010-07-10 08:24:56 +00:00			`if 'runtime' in self and not self['runtime']:`
			`del self['runtime']`
more imdb cleanup, add alternative_titles 2010-07-09 08:54:06 +00:00			`if 'votes' in self: self['votes'] = self['votes'].replace(',', '')`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`if 'connections' in self:`
			`cc={}`
imdb parser fixes 2010-07-08 08:03:57 +00:00			`if len(self['connections']) == 2 and isinstance(self['connections'][0], basestring):`
			`self['connections'] = [self['connections']]`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`for rel, data in self['connections']:`
			`cc[unicode(rel)] = re.compile('<a href="/title/tt(\d{7})/">').findall(data)`
			`self['connections'] = cc`

more imdb cleanup, add alternative_titles 2010-07-09 08:54:06 +00:00			`for key in ('countries', 'genres'):`
more imdb refinement 2010-07-10 08:24:56 +00:00			`if key in self:`
			`self[key] = filter(lambda x: x.lower() != 'home', self[key])`
more imdb cleanup, add alternative_titles 2010-07-09 08:54:06 +00:00
seasons 2010-07-12 08:52:26 +00:00			`if 'series' in self:`
			`if 'episode_title' in self:`
			`self['series_title'] = self['title']`
			`self['title'] = "%s: %s" % (self['series_title'], self['episode_title'])`
			`if 'episode_title' in self and 'season' in self and 'episode' in self:`
			`self['title'] = "%s (S%02dE%02d) %s" % (`
			`self['series_title'], self['season'], self['episode'], self['episode_title'])`
			`else:`
			`for key in ('series_title', 'episode_title', 'season', 'episode'):`
			`if key in self:`
			`del self[key]`
more imdb cleanup, add alternative_titles 2010-07-09 08:54:06 +00:00
somtimes its better to just make one imdb request 2010-07-18 18:24:36 +00:00			`class ImdbCombined(Imdb):`
			`def __init__(self, id, timeout=-1):`
			`_regex = {}`
			`for key in self.regex:`
			`if self.regex[key]['page'] == 'combined':`
			`_regex[key] = self.regex[key]`
			`self.regex = _regex`
			`super(ImdbCombined, self).__init__(id, timeout)`

fix criterion 2010-07-18 18:57:22 +00:00			`def getMovieId(title, director='', year=''):`
			`'''`
			`>>> getMovieId('The Matrix')`
			`'0133093'`
			`'''`
			`if year:`
			`title = "%s (%s)" % (title, year)`
			`if director:`
			`query = 'site:imdb.com %s "%s"' % (director, title)`
			`else:`
			`query = 'site:imdb.com "%s"' % title`
			`for (name, url, desc) in google.find(query, 5, timeout=-1):`
			`if url.startswith('http://www.imdb.com/title/tt'):`
			`return url[28:35]`
			`return ''`

poster urls 2010-07-19 10:05:01 +00:00			`def getMoviePoster(imdbId):`
			`info = ImdbCombined(imdbId)`
			`if 'poster_id' in info:`
			`url = "http://www.imdb.com/rg/action-box-title/primary-photo/media/rm%s/tt%s" % (info['poster_id'], imdbId)`
			`data = readUrl(url)`
			`poster = findRe(data, 'img id="primary-img".*?src="(.*?)"')`
			`return poster`
			`return ''`

add ox.web to this repos 2010-07-07 23:25:57 +00:00			`def guess(title, director='', timeout=google.DEFAULT_TIMEOUT):`
			`#FIXME: proper file -> title`
			`title = title.split('-')[0]`
			`title = title.split('(')[0]`
			`title = title.split('.')[0]`
			`title = title.strip()`
			`imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))`
			`return_url = ''`

			`#lest first try google`
more imdb refinement 2010-07-10 08:24:56 +00:00			`#i.e. site:imdb.com Michael Stevens "Sin"`
add ox.web to this repos 2010-07-07 23:25:57 +00:00			`if director:`
			`search = 'site:imdb.com %s "%s"' % (director, title)`
			`else:`
			`search = 'site:imdb.com "%s"' % title`
			`for (name, url, desc) in google.find(search, 2, timeout=timeout):`
			`if url.startswith('http://www.imdb.com/title/tt'):`
			`return normalizeImdbId(int(ox.intValue(url)))`

			`try:`
			`req = urllib2.Request(imdb_url, None, ox.net.DEFAULT_HEADERS)`
			`u = urllib2.urlopen(req)`
			`data = u.read()`
			`return_url = u.url`
			`u.close()`
			`except:`
			`return None`
			`if return_url.startswith('http://www.imdb.com/title/tt'):`
			`return return_url[28:35]`
			`if data:`
			`imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')`
			`if imdb_id:`
			`return imdb_id`

			`imdb_url = 'http://www.imdb.com/find?q=%s;s=tt;site=aka' % quote(title.encode('utf-8'))`
			`req = urllib2.Request(imdb_url, None, ox.net.DEFAULT_HEADERS)`
			`u = urllib2.urlopen(req)`
			`data = u.read()`
			`return_url = u.url`
			`u.close()`
			`if return_url.startswith('http://www.imdb.com/title/tt'):`
			`return return_url[28:35]`

			`return None`


			`if __name__ == "__main__":`
			`import json`
			`print json.dumps(Imdb('0306414'), indent=2)`
			`#print json.dumps(Imdb('0133093'), indent=2)`