cache read_url_utf8 too

This commit is contained in:
j 2007-07-14 11:25:50 +00:00
parent 23e27d1b36
commit c434187130

View file

@ -11,13 +11,35 @@ from elementtree.ElementTree import parse, tostring
from BeautifulSoup import BeautifulSoup from BeautifulSoup import BeautifulSoup
from google import google from google import google
from utils import stripTags, read_url_utf8, htmldecode from utils import stripTags, htmldecode
import utils import utils
# Root directory of the on-disk URL cache; pages are stored beneath it at the
# URL's path with 'http://' removed.
cache_base = "/var/cache/scrapeit/cache/"
def read_url_utf8(url):
    """Return the page at `url`, using the on-disk cache when possible.

    The cache file is `cache_base` joined with the URL after every
    'http://' occurrence has been removed. A path that ends in '/' or that
    names an existing directory is mapped to an 'index.html' inside it.

    On a cache hit the file's contents are returned. On a miss the page is
    fetched with utils.read_url_utf8, written to the cache file (creating
    the parent directories if they do not exist) and returned.

    NOTE(review): a hit returns what open(path).read() yields, a miss returns
    the value from utils.read_url_utf8 unchanged -- confirm both are the same
    string type for callers.
    """
    path = os.path.join(cache_base, url.replace('http://', ''))
    if path.endswith('/'):
        path = "%sindex.html" % path
    if os.path.isdir(path):
        path = "%s/index.html" % path

    if os.path.exists(path):
        f = open(path)
        # try/finally so the handle is released even if the read fails.
        try:
            return f.read()
        finally:
            f.close()

    data = utils.read_url_utf8(url)
    folder = os.path.dirname(path)
    if not os.path.exists(folder):
        os.makedirs(folder)
    f = open(path, 'w')
    # Same guarantee for the write: never leave the cache file handle open.
    try:
        f.write(data)
    finally:
        f.close()
    return data
def read_url(url): def read_url(url):
base = "/var/cache/scrapeit/cache/" path = os.path.join(cache_base, url.replace('http://',''))
path = os.path.join(base, url.replace('http://',''))
if path.endswith('/'): if path.endswith('/'):
path = "%sindex.html" % path path = "%sindex.html" % path
if os.path.isdir(path): if os.path.isdir(path):