use cookie to get US titles from imdb

Author: j
Date:   2010-10-08 18:07:39 +02:00
parent a106f0cfd7
commit 2da500d26e
2 changed files with 16 additions and 7 deletions

Changed file 1 of 2 — the imdb parser module (readUrl/readUrlUnicode wrappers, Imdb, ImdbCombined):

@@ -9,11 +9,19 @@ import time
 import ox
 from ox import findRe, stripTags
 from ox.normalize import normalizeTitle, normalizeImdbId
+import ox.cache
 from ox.cache import readUrl
 from siteparser import SiteParser
 import google
 
+def readUrl(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None):
+    headers = headers.copy()
+    headers["Cookie"] = 'session-id=061-6553581-0286357; uu=bl8Nra2zsmTjesDEOxamlwVkXrKj8h6ygOFd1LDhKNGwxHjk4LQopMCxSNGTU3nl88Ro5FCSHrKjUi2RoREt4SEhDZGA8Q4SILFsUfUFbhLFgr6EZTD4RYTFSEWWwr4UV+K+l/eivpfX51v2Y1JrhvCg/ZEg4QxRsLEcUYDivmGwwW3hINGNodNSvhGz0h6ypaRIUuPyHvWQ8paioNENkaDRndHw4r4RsKEt4SDRndHzwr4Rs9IesqPUWCLg4h6yoMGNISDRndHD4r4Rs9IesqPyHvLjom6Co=; cs=pReiGikHkbKk4Fhkk8Meyw5/E6t6mVT9+v+ACx7KZ/rpfwPtXklU/c7BdHWNegduvco3rq7p9+7eSVT9yK4Uvd5JVMtpSdz9/kliy+7BVP392hR17RoHzq1ad36dSlRdWF+Srs7fYurOSVS9XlkU3f5pVP3+SVS9vhkkzf; session-id-time=1286639981'
+    return ox.cache.readUrl(url, data, headers, timeout)
+
+def readUrlUnicode(url, timeout=ox.cache.cache_timeout):
+    return ox.cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)
 
 class Imdb(SiteParser):
     regex = {
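The new module-level readUrl copies the default headers before injecting the IMDb session cookie, so ox.cache.DEFAULT_HEADERS itself is never mutated, and readUrlUnicode reuses ox.cache's cached, decoded fetch by passing the cookie-aware wrapper in through the _readUrl keyword. A minimal self-contained sketch of that pattern follows; the names and the stub fetcher are illustrative stand-ins, not the real ox.cache internals.

# Illustrative stand-ins for ox.cache (Python 2, matching the codebase era).
DEFAULT_HEADERS = {'User-Agent': 'Mozilla/5.0'}

def cache_read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=-1):
    # stand-in for ox.cache.readUrl: fetch raw bytes with the given headers
    # (timeout is accepted to mirror the signature but unused in this sketch)
    import urllib2
    return urllib2.urlopen(urllib2.Request(url, data, headers)).read()

def cache_read_url_unicode(url, timeout=-1, _readUrl=cache_read_url):
    # stand-in for ox.cache.readUrlUnicode: decoding stays in one place,
    # only the fetch step is swappable via _readUrl
    return _readUrl(url, timeout=timeout).decode('utf-8', 'replace')

def read_url_with_cookie(url, data=None, headers=DEFAULT_HEADERS, timeout=-1):
    headers = headers.copy()               # copy first: keep the shared defaults clean
    headers['Cookie'] = 'session-id=...'   # the IMDb session cookie shown in the diff
    return cache_read_url(url, data, headers, timeout)

# reuse the shared decode/caching path, swap only the fetcher:
# cache_read_url_unicode(url, _readUrl=read_url_with_cookie)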
@@ -200,15 +208,13 @@ class Imdb(SiteParser):
         }
     }
 
+    def readUrlUnicode(self, url, timeout):
+        return readUrlUnicode(url, timeout)
 
     def __init__(self, id, timeout=-1):
         self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
         super(Imdb, self).__init__(timeout)
-        if 'alternative_titles' in self:
-            for t in self['alternative_titles']:
-                if len(t)>1 and 'imdb display title' in t[1]:
-                    self['title'] = t[0]
         if 'title' in self and self['title'].startswith('"') and self['title'].endswith('"'):
             self['title'] = self['title'][1:-1]
         if 'runtime' in self and self['runtime']:
@@ -246,7 +252,7 @@ class ImdbCombined(Imdb):
     def __init__(self, id, timeout=-1):
         _regex = {}
         for key in self.regex:
-            if self.regex[key]['page'] == 'combined' or key == 'alternative_titles':
+            if self.regex[key]['page'] == 'combined':
                 _regex[key] = self.regex[key]
         self.regex = _regex
         super(ImdbCombined, self).__init__(id, timeout)
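With the session cookie sent on every request, IMDb already returns the US display title, so the __init__ post-processing that copied the 'imdb display title' entry out of alternative_titles, and the special-casing of that key in ImdbCombined, are no longer needed. A hypothetical usage sketch (the import name is assumed):

# Hypothetical usage; importing the module as 'imdb' is an assumption.
from imdb import Imdb

info = Imdb('0133093')   # tt0133093 -- the parsed 'title' is now the
print info['title']      # US display title straight from IMDb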

Changed file 2 of 2 — the siteparser module (SiteParser):

@@ -30,10 +30,13 @@ class SiteParser(dict):
     def getUrl(self, page):
         return "%s%s" % (self.baseUrl, page)
 
+    def readUrlUnicode(self, url, timeout):
+        return readUrlUnicode(url, timeout=timeout)
+
     def __init__(self, timeout=-1):
         for key in self.regex:
             url = self.getUrl(self.regex[key]['page'])
-            data = readUrlUnicode(url)
+            data = self.readUrlUnicode(url, timeout)
             if isinstance(self.regex[key]['re'], basestring):
                 data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data)
                 data = cleanup(key, data, self.regex[key]['type'])
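Routing the fetch through self.readUrlUnicode turns SiteParser into a small template: a subclass changes how every page is retrieved by overriding that one method, which is exactly how Imdb above pushes all of its page loads through the cookie-aware reader. A self-contained illustration of the hook, with made-up names, pages and cookie value:

# Self-contained sketch of the override hook; everything here is illustrative.
import urllib2

class TinyParser(dict):
    pages = ['combined', 'releaseinfo']
    baseUrl = 'http://www.example.com/title/'

    def readUrlUnicode(self, url, timeout):
        # default fetcher; subclasses override just this one method
        return urllib2.urlopen(url).read().decode('utf-8', 'replace')

    def __init__(self, timeout=-1):
        for page in self.pages:
            self[page] = self.readUrlUnicode(self.baseUrl + page, timeout)

class CookieParser(TinyParser):
    def readUrlUnicode(self, url, timeout):
        # every page load now carries the session cookie
        req = urllib2.Request(url, headers={'Cookie': 'session-id=...'})
        return urllib2.urlopen(req).read().decode('utf-8', 'replace')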