parse locations

This commit is contained in:
j 2007-07-17 10:37:14 +00:00
parent d23720b1ff
commit c10645f114

View file

@ -19,31 +19,11 @@ import chardet
cache_base = "/var/cache/scrapeit/cache/" cache_base = "/var/cache/scrapeit/cache/"
def read_url_utf8(url): def read_url_utf8(url):
path = os.path.join(cache_base, url.replace('http://','')) data = read_url(url)
if path.endswith('/'): encoding = chardet.detect(data)['encoding']
path = "%sindex.html" % path if not encoding: encoding = 'latin-1'
if os.path.isdir(path): data = unicode(data, encoding)
path = "%s/index.html" % path return data
if os.path.exists(path):
f = open(path)
data = f.read()
encoding = chardet.detect(data)['encoding']
if not encoding: encoding = 'latin-1'
f.close()
data = unicode(data, encoding)
return data
else:
data = utils.read_url(url)
folder = os.path.dirname(path)
if not os.path.exists(folder):
os.makedirs(folder)
f = open(path, 'w')
f.write(data)
f.close()
encoding = chardet.detect(data)['encoding']
if not encoding: encoding = 'latin-1'
data = unicode(data, encoding)
return data
def read_url(url): def read_url(url):
path = os.path.join(cache_base, url.replace('http://','')) path = os.path.join(cache_base, url.replace('http://',''))
@ -124,6 +104,8 @@ class IMDb:
self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
self.triviaSource = None self.triviaSource = None
self.triviaUrl = "%strivia" % self.pageUrl self.triviaUrl = "%strivia" % self.pageUrl
self.locationSource = None
self.locationUrl = "%slocations" % self.pageUrl
def getPage(self, forcereload = False): def getPage(self, forcereload = False):
if forcereload or not self.pageSource: if forcereload or not self.pageSource:
@ -264,6 +246,7 @@ class IMDb:
IMDbDict['trivia'] = self.parseTrivia() IMDbDict['trivia'] = self.parseTrivia()
IMDbDict['connections'] = self.parseConnections() IMDbDict['connections'] = self.parseConnections()
IMDbDict['locations'] = self.parseLocations()
IMDbDict['release_date'] = self.parseReleaseinfo() IMDbDict['release_date'] = self.parseReleaseinfo()
IMDbDict['business'] = self.parseBusiness() IMDbDict['business'] = self.parseBusiness()
self.IMDbDict = IMDbDict self.IMDbDict = IMDbDict
@ -341,6 +324,19 @@ class IMDb:
self.episodes = episodes self.episodes = episodes
return self.episodes return self.episodes
def getLocations(self, forcereload = False):
    """Return the source of this title's locations page.

    The page at ``self.locationUrl`` is fetched with ``read_url_utf8`` the
    first time it is needed, or again whenever ``forcereload`` is true, and
    kept in ``self.locationSource`` for later calls.
    """
    # Cache under locationSource (not keywordSource) so that the guard above
    # actually sees the cached page and the keyword cache is left untouched.
    if forcereload or not self.locationSource:
        self.locationSource = read_url_utf8(self.locationUrl)
    return self.locationSource
def parseLocations(self):
    """Extract the location names from the locations page.

    Parses the page returned by ``getLocations()`` with BeautifulSoup,
    selects the ``<a>`` elements whose ``href`` matches ``^/List`` and runs
    each element's ``.string`` through ``htmldecode``. The resulting list is
    stored on ``self.locations`` and returned.
    """
    page = BeautifulSoup(self.getLocations())
    anchors = page('a', {'href': re.compile('^/List')})
    self.locations = [htmldecode(anchor.string) for anchor in anchors]
    return self.locations
def getKeywords(self, forcereload = False): def getKeywords(self, forcereload = False):
if forcereload or not self.keywordSource: if forcereload or not self.keywordSource:
self.keywordSource = read_url_utf8(self.keywordUrl) self.keywordSource = read_url_utf8(self.keywordUrl)