# -*- coding: utf-8 -*- # vi:si:et:sw=4:sts=4:ts=4 import urllib2 from urllib import quote, unquote import re import os import time import ox from ox import findRe from ox.normalize import normalizeTitle, normalizeImdbId from siteparser import SiteParser import google class Imdb(SiteParser): regex = { 'cast': { 'page': 'combined', 're': '.*?>(.*?).*?(.*?)', 'type': 'list' }, 'cinematographers': { 'page': 'combined', 're': [ 'Cinematography by(.*?)', '(.*?)' ], 'type': 'list' }, 'connections': { 'page': 'movieconnections', 're': '
(.*?)
(.*?)\n\n', 'type': 'list' }, 'countries': { 'page': 'combined', 're': '(.*?)', 'type': 'list' }, 'directors': { 'page': 'combined', 're': [ 'Directed by(.*?)', '(.*?)' ], 'type': 'list' }, 'editors': { 'page': 'combined', 're': [ 'Film Editing by(.*?)', '(.*?)' ], 'type': 'list' }, 'filming_locations': { 'page': 'locations', 're': '(.*?)', 'type': 'list' }, 'genres': { 'page': 'combined', 're': '(.*?)', 'type': 'list' }, 'keywords': { 'page': 'keywords', 're': '(.*?)', 'type': 'list' }, 'languages': { 'page': 'combined', 're': '(.*?)', 'type': 'list' }, 'plot': { 'page': 'plotsummary', 're': '

(.*?)', 'type': 'string' }, 'poster_id': { 'page': 'combined', 're': '/primary-photo/media/rm(.*?)/tt', 'type': 'list' }, 'poster_ids': { 'page': 'posters', 're': '/unknown-thumbnail/media/rm(.*?)/tt', 'type': 'list' }, 'producers': { 'page': 'combined', 're': [ 'Produced by(.*?)', '(.*?)' ], 'type': 'list' }, 'rating': { 'page': 'combined', 're': '

.*?(.*?)/10', 'type': 'float' }, 'release_date': { 'page': 'releaseinfo', 're': '.*? ', 'type': 'date' }, 'reviews': { 'page': 'externalreviews', 're': [ '
    (.*?)
', '
  • (.*?)
  • ' ], 'type': 'list' }, 'runtime': { 'page': 'combined', 're': '
    Runtime:
    .*?([0-9]+ sec|[0-9]+ min).*?
    ', 'type': 'string' }, 'title': { 'page': 'combined', 're': '

    (.*?) ', 'type': 'string' }, 'trivia': { 'page': 'trivia', 're': '
    (.*?)
    ', 'type': 'list', }, 'votes': { 'page': 'combined', 're': '(.*?) votes', 'type': 'string' }, 'writers': { 'page': 'combined', 're': [ 'Writing credits(.*?)', '(.*?)' ], 'type': 'list' }, 'year': { 'page': 'combined', 're': '').findall(data) self['connections'] = cc def guess(title, director='', timeout=google.DEFAULT_TIMEOUT): #FIXME: proper file -> title title = title.split('-')[0] title = title.split('(')[0] title = title.split('.')[0] title = title.strip() imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8')) return_url = '' #lest first try google #i.e. site:imdb.com Michael Stevens Sin if director: search = 'site:imdb.com %s "%s"' % (director, title) else: search = 'site:imdb.com "%s"' % title for (name, url, desc) in google.find(search, 2, timeout=timeout): if url.startswith('http://www.imdb.com/title/tt'): return normalizeImdbId(int(ox.intValue(url))) try: req = urllib2.Request(imdb_url, None, ox.net.DEFAULT_HEADERS) u = urllib2.urlopen(req) data = u.read() return_url = u.url u.close() except: return None if return_url.startswith('http://www.imdb.com/title/tt'): return return_url[28:35] if data: imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?
    1. .*?