# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import urllib2
from urllib import quote, unquote
import re
import os
import time
import ox
from ox import findRe, stripTags
from ox.normalize import normalizeTitle, normalizeImdbId
import ox.cache
from ox.cache import readUrl
from siteparser import SiteParser
import google
def readUrl(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None):
    '''
    Fetch url through ox.cache.readUrl with a fixed Cookie header added.

    url     - url to load
    data    - passed through unchanged to ox.cache.readUrl
    headers - request headers; a copy is made before the Cookie header is
              set, so the caller's dict (and the shared default) is not
              modified
    timeout - passed through unchanged to ox.cache.readUrl
    valid   - forwarded to ox.cache.readUrl as keyword ``valid`` when it is
              not None

    Returns whatever ox.cache.readUrl returns.
    '''
    headers = headers.copy()
    # NOTE(review): hard-coded session cookie; presumably bound to a session
    # that can expire -- confirm it is still required and still accepted.
    headers["Cookie"] = 'session-id=061-6553581-0286357; uu=bl8Nra2zsmTjesDEOxamlwVkXrKj8h6ygOFd1LDhKNGwxHjk4LQopMCxSNGTU3nl88Ro5FCSHrKjUi2RoREt4SEhDZGA8Q4SILFsUfUFbhLFgr6EZTD4RYTFSEWWwr4UV+K+l/eivpfX51v2Y1JrhvCg/ZEg4QxRsLEcUYDivmGwwW3hINGNodNSvhGz0h6ypaRIUuPyHvWQ8paioNENkaDRndHw4r4RsKEt4SDRndHzwr4Rs9IesqPUWCLg4h6yoMGNISDRndHD4r4Rs9IesqPyHvLjom6Co=; cs=pReiGikHkbKk4Fhkk8Meyw5/E6t6mVT9+v+ACx7KZ/rpfwPtXklU/c7BdHWNegduvco3rq7p9+7eSVT9yK4Uvd5JVMtpSdz9/kliy+7BVP392hR17RoHzq1ad36dSlRdWF+Srs7fYurOSVS9XlkU3f5pVP3+SVS9vhkkzf; session-id-time=1286639981'
    if valid is None:
        # Default case: exactly the same call as before, so callers that never
        # pass ``valid`` see no change.
        return ox.cache.readUrl(url, data, headers, timeout)
    # A caller-supplied validator must not be dropped silently.
    # NOTE(review): assumes ox.cache.readUrl accepts a ``valid`` keyword,
    # mirroring this wrapper's signature -- confirm against the installed ox.
    return ox.cache.readUrl(url, data, headers, timeout, valid=valid)
def readUrlUnicode(url, timeout=ox.cache.cache_timeout):
    '''
    Call ox.cache.readUrlUnicode for url, passing this module's readUrl as
    the ``_readUrl`` argument and forwarding timeout.

    Returns whatever ox.cache.readUrlUnicode returns.
    '''
    fetch_unicode = ox.cache.readUrlUnicode
    return fetch_unicode(url, _readUrl=readUrl, timeout=timeout)
class Imdb(SiteParser):
# Each entry of `regex` below maps a field name to a dict with three keys:
# 'page' (a page name such as 'combined' or 'releaseinfo'), 're' (a pattern
# string, or a list mixing pattern strings and callables) and 'type'
# ('list', 'string', 'int', 'float' or 'date').
# NOTE(review): how SiteParser consumes these entries is not visible here;
# presumably 're' steps are applied in order to the fetched page -- confirm.
# NOTE(review): in this copy the pattern literals appear to have lost their
# HTML markup (several are split across lines), and everything between the
# 'year' entry and the connections post-processing near the end of the class
# is missing. Restore the block from version control before changing code.
regex = {
'alternative_titles': {
'page': 'releaseinfo',
're': [
'name="akas".*?
(.*?)',
"td>(.*?).*?(.*?) | "
],
'type': 'list'
},
'cast': {
'page': 'combined',
're': [
'.*?>(.*?).*? | (.*?) | ',
lambda ll: [stripTags(l) for l in ll]
],
'type': 'list'
},
'cinematographers': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Cinematography by(.*?)',
'(.*?)'
],
'type': 'list'
},
'connections': {
'page': 'movieconnections',
're': '(.*?)
(.*?)\n\n',
'type': 'list'
},
'countries': {
'page': 'combined',
're': '(.*?)',
'type': 'list'
},
'directors': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Directed by(.*?)',
'(.*?)'
],
'type': 'list'
},
'editors': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Film Editing by(.*?)',
'(.*?)'
],
'type': 'list'
},
'episode_title': {
'page': 'combined',
're': '.*?
(.*?)',
'type': 'string'
},
'filming_locations': {
'page': 'locations',
're': '
(.*?)',
'type': 'list'
},
'genres': {
'page': 'combined',
're': '
(.*?)',
'type': 'list'
},
'keywords': {
'page': 'keywords',
're': '
(.*?)',
'type': 'list'
},
'languages': {
'page': 'combined',
're': '
(.*?)',
'type': 'list'
},
'original_title': {
'page': 'combined',
're': '',
'type': 'string'
},
'plot': {
'page': 'plotsummary',
're': '
.*?(.*?)',
'type': 'string'
},
'poster_id': {
'page': 'combined',
're': '/primary-photo/media/rm(.*?)/tt',
'type': 'string'
},
'poster_ids': {
'page': 'posters',
're': '/unknown-thumbnail/media/rm(.*?)/tt',
'type': 'list'
},
'producers': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Produced by(.*?)',
'(.*?)'
],
'type': 'list'
},
'rating': {
'page': 'combined',
're': '.*?
([\d,.]?)/10',
'type': 'float'
},
'release date': {
'page': 'releaseinfo',
're': '
.*? ',
'type': 'date'
},
'reviews': {
'page': 'externalreviews',
're': [
'(.*?)
',
'(.*?)'
],
'type': 'list'
},
'runtime': {
'page': 'combined',
're': 'Runtime:
.*?([0-9]+ sec|[0-9]+ min).*?
',
'type': 'string'
},
'season': {
'page': 'combined',
're': [
'Original Air Date:
.*?(.*?)
',
'\(Season (\d+), Episode \d+\)',
],
'type': 'int'
},
'episode': {
'page': 'combined',
're': [
'Original Air Date:
.*?(.*?)
',
'\(Season \d+, Episode (\d+)\)',
],
'type': 'int'
},
'series': {
'page': 'combined',
're': 'TV Series:
.*?(.*?)
',
'type': 'list',
},
'votes': {
'page': 'combined',
're': '([\d,]*?) votes',
'type': 'string'
},
'writers': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Writing credits(.*?)',
'
(.*?)'
],
'type': 'list'
},
'year': {
'page': 'combined',
're': '
').findall(data)
self['connections'] = cc
# Remove entries equal to 'home' (compared case-insensitively) from the
# countries and genres values.
for key in ('countries', 'genres'):
if key in self:
self[key] = filter(lambda x: x.lower() != 'home', self[key])
# Entries with a 'series' value: keep the current title as 'series_title' and
# rebuild 'title' as "<series title>: <episode title>".
if 'series' in self:
if 'episode_title' in self:
self['series_title'] = self['title']
self['title'] = "%s: %s" % (self['series_title'], self['episode_title'])
# With season and episode numbers also present, 'title' is set to the
# "<series title> (SxxEyy) <episode title>" form instead.
if 'episode_title' in self and 'season' in self and 'episode' in self:
self['title'] = "%s (S%02dE%02d) %s" % (
self['series_title'], self['season'], self['episode'], self['episode_title'])
# NOTE(review): indentation is lost in this view; presumably this else pairs
# with "if 'series' in self" and strips episode-only keys from entries that
# are not part of a series -- confirm against the original file.
else:
for key in ('series_title', 'episode_title', 'season', 'episode'):
if key in self:
del self[key]
class ImdbCombined(Imdb):
    '''
    Imdb variant that keeps only the regex entries whose 'page' is
    'combined'.
    '''

    def __init__(self, id, timeout=-1):
        # Build the reduced table from whatever self.regex currently resolves
        # to, assign it to self.regex, then hand over to the parent
        # initializer.
        combined_only = dict(
            (field, spec)
            for field, spec in self.regex.items()
            if spec['page'] == 'combined'
        )
        self.regex = combined_only
        super(ImdbCombined, self).__init__(id, timeout)
def getMovieId(title, director='', year=''):
    '''
    Search google (restricted to site:imdb.com) for title, optionally
    narrowed by director and year, and return the 7 characters following
    'http://www.imdb.com/title/tt' in the first matching result url.
    Returns '' when none of the results is an imdb title url.

    >>> getMovieId('The Matrix')
    u'0133093'
    >>> getMovieId('2 or 3 Things I Know About Her', 'Jean-Luc Godard')
    u'0060304'
    >>> getMovieId('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967')
    u'0060304'
    '''
    # Start from the title-only query and switch to the director form when a
    # director was given.
    query = 'site:imdb.com "%s" ' % title
    if director:
        query = 'site:imdb.com %s "%s" ' % (director, title)
    if year:
        query = query + year

    title_prefix = 'http://www.imdb.com/title/tt'
    id_start = len(title_prefix)
    for (_name, link, _desc) in google.find(query, 5, timeout=-1):
        if link.startswith(title_prefix):
            # The id is the 7 characters directly after the prefix.
            return link[id_start:id_start + 7]
    return ''
def getMoviePoster(imdbId):
    '''
    Return the poster image url for imdbId.

    Loads the title through ImdbCombined. With a 'poster_id' the primary
    photo page is fetched and the src of the img with id "primary-img" is
    returned. Without one, a title that has a 'series' value delegates to
    that value; otherwise '' is returned.

    >>> getMoviePoster('0133093')
    'http://ia.media-imdb.com/images/M/MV5BMjEzNjg1NTg2NV5BMl5BanBnXkFtZTYwNjY3MzQ5._V1._SX338_SY475_.jpg'
    >>> getMoviePoster('0994352')
    'http://ia.media-imdb.com/images/M/MV5BMjA3NzMyMzU1MV5BMl5BanBnXkFtZTcwNjc1ODUwMg@@._V1._SX594_SY755_.jpg'
    '''
    info = ImdbCombined(imdbId)
    if 'poster_id' not in info:
        # No poster of its own: fall back to the series value, if any.
        if 'series' in info:
            return getMoviePoster(info['series'])
        return ''
    photo_page = "http://www.imdb.com/rg/action-box-title/primary-photo/media/rm%s/tt%s" % (info['poster_id'], imdbId)
    html = readUrl(photo_page)
    return findRe(html, 'img id="primary-img".*?src="(.*?)"')
def guess(title, director='', timeout=google.DEFAULT_TIMEOUT):
#FIXME: proper file -> title
title = title.split('-')[0]
title = title.split('(')[0]
title = title.split('.')[0]
title = title.strip()
imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))
return_url = ''
#lest first try google
#i.e. site:imdb.com Michael Stevens "Sin"
if director:
search = 'site:imdb.com %s "%s"' % (director, title)
else:
search = 'site:imdb.com "%s"' % title
for (name, url, desc) in google.find(search, 2, timeout=timeout):
if url.startswith('http://www.imdb.com/title/tt'):
return normalizeImdbId(int(ox.intValue(url)))
try:
req = urllib2.Request(imdb_url, None, ox.net.DEFAULT_HEADERS)
u = urllib2.urlopen(req)
data = u.read()
return_url = u.url
u.close()
except:
return None
if return_url.startswith('http://www.imdb.com/title/tt'):
return return_url[28:35]
if data:
imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?
- .*?