cache read_url_utf8 too
This commit is contained in:
parent
23e27d1b36
commit
c434187130
1 changed file with 25 additions and 3 deletions
|
@ -11,13 +11,35 @@ from elementtree.ElementTree import parse, tostring
|
|||
from BeautifulSoup import BeautifulSoup
|
||||
|
||||
from google import google
|
||||
from utils import stripTags, read_url_utf8, htmldecode
|
||||
from utils import stripTags, htmldecode
|
||||
|
||||
import utils
|
||||
|
||||
# Root directory of the on-disk page cache; URL paths are mirrored below it.
cache_base = "/var/cache/scrapeit/cache/"
|
||||
|
||||
def read_url_utf8(url):
    """Return the body of *url*, caching it on disk under ``cache_base``.

    The URL (minus the leading ``http://``) is mapped to a file path below
    ``cache_base``; directory-like URLs are stored as ``.../index.html``.
    On a cache hit the stored file content is returned; on a miss the page
    is fetched via ``utils.read_url_utf8``, written to the cache, and
    returned.

    NOTE(review): despite the name, the cache file is opened with no
    explicit encoding -- presumably ``utils.read_url_utf8`` returns data
    that plain ``open(..., 'w')`` can write; confirm against utils before
    changing this.
    """
    path = os.path.join(cache_base, url.replace('http://', ''))
    # Directory-like URLs get an index.html filename so they map to a file.
    if path.endswith('/'):
        path = "%sindex.html" % path
    if os.path.isdir(path):
        path = "%s/index.html" % path
    if os.path.exists(path):
        # Cache hit: serve the stored copy.  ``with`` guarantees the handle
        # is closed even if read() raises (the original leaked it on error).
        with open(path) as f:
            return f.read()
    # Cache miss: fetch the page, then store it for next time.
    data = utils.read_url_utf8(url)
    folder = os.path.dirname(path)
    if not os.path.exists(folder):
        os.makedirs(folder)
    with open(path, 'w') as f:
        f.write(data)
    return data
|
||||
|
||||
def read_url(url):
|
||||
base = "/var/cache/scrapeit/cache/"
|
||||
path = os.path.join(base, url.replace('http://',''))
|
||||
path = os.path.join(cache_base, url.replace('http://',''))
|
||||
if path.endswith('/'):
|
||||
path = "%sindex.html" % path
|
||||
if os.path.isdir(path):
|
||||
|
|
Loading…
Reference in a new issue