parse locations
parent d23720b1ff
commit c10645f114
1 changed file with 21 additions and 25 deletions
@@ -19,31 +19,11 @@ import chardet
 cache_base = "/var/cache/scrapeit/cache/"

 def read_url_utf8(url):
-    path = os.path.join(cache_base, url.replace('http://',''))
-    if path.endswith('/'):
-        path = "%sindex.html" % path
-    if os.path.isdir(path):
-        path = "%s/index.html" % path
-    if os.path.exists(path):
-        f = open(path)
-        data = f.read()
-        encoding = chardet.detect(data)['encoding']
-        if not encoding: encoding = 'latin-1'
-        f.close()
-        data = unicode(data, encoding)
-        return data
-    else:
-        data = utils.read_url(url)
-        folder = os.path.dirname(path)
-        if not os.path.exists(folder):
-            os.makedirs(folder)
-        f = open(path, 'w')
-        f.write(data)
-        f.close()
-        encoding = chardet.detect(data)['encoding']
-        if not encoding: encoding = 'latin-1'
-        data = unicode(data, encoding)
-        return data
+    data = read_url(url)
+    encoding = chardet.detect(data)['encoding']
+    if not encoding: encoding = 'latin-1'
+    data = unicode(data, encoding)
+    return data

 def read_url(url):
     path = os.path.join(cache_base, url.replace('http://',''))
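The rewritten read_url_utf8 now delegates fetching and caching to read_url below and only handles decoding. The code is Python 2 (note the unicode() call); purely as an illustration, the same detect-then-fall-back decode step in Python 3 might look like this sketch (decode_with_fallback is a hypothetical name, not part of this repo):

    import chardet

    def decode_with_fallback(data):
        # chardet.detect() returns a dict whose 'encoding' entry may be
        # None when detection fails; latin-1 is a safe fallback because
        # it maps every possible byte to a code point.
        encoding = chardet.detect(data)['encoding'] or 'latin-1'
        return data.decode(encoding)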
@@ -124,6 +104,8 @@ class IMDb:
         self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
         self.triviaSource = None
         self.triviaUrl = "%strivia" % self.pageUrl
+        self.locationSource = None
+        self.locationUrl = "%slocations" % self.pageUrl

     def getPage(self, forcereload = False):
         if forcereload or not self.pageSource:
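pageUrl itself is not visible in this hunk; judging from the sibling attributes (releaseinfo, trivia), it appears to be the title page URL with a trailing slash, so the new locationUrl resolves to the /locations subpage. An illustrative composition, with a made-up title id:

    pageUrl = "http://www.imdb.com/title/tt0000000/"  # hypothetical value; set elsewhere in the class
    locationUrl = "%slocations" % pageUrl
    # -> "http://www.imdb.com/title/tt0000000/locations"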
@@ -264,6 +246,7 @@ class IMDb:

         IMDbDict['trivia'] = self.parseTrivia()
         IMDbDict['connections'] = self.parseConnections()
+        IMDbDict['locations'] = self.parseLocations()
         IMDbDict['release_date'] = self.parseReleaseinfo()
         IMDbDict['business'] = self.parseBusiness()
         self.IMDbDict = IMDbDict
@@ -341,6 +324,19 @@ class IMDb:
         self.episodes = episodes
         return self.episodes

+    def getLocations(self, forcereload = False):
+        if forcereload or not self.locationSource:
+            self.keywordSource = read_url_utf8(self.locationUrl)
+        return self.keywordSource
+
+    def parseLocations(self):
+        soup = BeautifulSoup(self.getLocations())
+        locations = []
+        for key in soup('a', {'href': re.compile('^/List')}):
+            locations.append(htmldecode(key.string))
+        self.locations = locations
+        return self.locations
+
     def getKeywords(self, forcereload = False):
         if forcereload or not self.keywordSource:
             self.keywordSource = read_url_utf8(self.keywordUrl)
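One thing worth flagging in the new code: getLocations caches the fetched page in self.keywordSource rather than in the self.locationSource attribute introduced above, so the locations page is never cached under its own name and can collide with getKeywords, which uses the same attribute. parseLocations itself just collects the text of every anchor whose href starts with /List into a plain list of strings. A minimal corrected sketch of getLocations, keeping the class's own conventions:

    def getLocations(self, forcereload = False):
        # Cache under locationSource, mirroring getKeywords/keywordSource,
        # so the two pages cannot overwrite each other's cache.
        if forcereload or not self.locationSource:
            self.locationSource = read_url_utf8(self.locationUrl)
        return self.locationSource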