cache read_url_utf8 too
This commit is contained in:
parent
23e27d1b36
commit
c434187130
1 changed files with 25 additions and 3 deletions
|
@ -11,13 +11,35 @@ from elementtree.ElementTree import parse, tostring
|
||||||
from BeautifulSoup import BeautifulSoup
|
from BeautifulSoup import BeautifulSoup
|
||||||
|
|
||||||
from google import google
|
from google import google
|
||||||
from utils import stripTags, read_url_utf8, htmldecode
|
from utils import stripTags, htmldecode
|
||||||
|
|
||||||
import utils
|
import utils
|
||||||
|
|
||||||
|
# Root directory of the on-disk page cache; read_url_utf8 and read_url join
# the URL (minus its 'http://' prefix) onto this to get the cache file path.
cache_base = "/var/cache/scrapeit/cache/"
|
||||||
|
|
||||||
|
def read_url_utf8(url):
    """Return the content of *url*, served from the on-disk cache if present.

    The cache file path is ``cache_base`` joined with the URL after its
    ``http://`` prefix has been removed.  A path that ends in ``/`` or that
    names an existing directory is mapped to an ``index.html`` inside it.
    On a cache miss the page is fetched with ``utils.read_url_utf8``,
    written to the cache file, and then returned.

    Raises whatever ``utils.read_url_utf8`` raises on a failed fetch, and
    IOError/OSError if the cache file or its directory cannot be used.
    """
    # NOTE(review): the URL is turned into a file path unchecked, so '..'
    # segments or a leading '/' could resolve outside cache_base -- confirm
    # that callers only pass trusted URLs.
    path = os.path.join(cache_base, url.replace('http://',''))
    if path.endswith('/'):
        path = "%sindex.html" % path
    if os.path.isdir(path):
        path = "%s/index.html" % path

    if os.path.exists(path):
        f = open(path)
        try:
            # finally guarantees the handle is released even if read() fails
            return f.read()
        finally:
            f.close()

    data = utils.read_url_utf8(url)
    folder = os.path.dirname(path)
    try:
        os.makedirs(folder)
    except OSError:
        # The directory already existing (possibly created by a concurrent
        # caller between a check and the mkdir) is fine; anything else is a
        # genuine error and must propagate.
        if not os.path.isdir(folder):
            raise
    f = open(path, 'w')
    try:
        f.write(data)
    finally:
        f.close()
    return data
|
||||||
|
|
||||||
def read_url(url):
|
def read_url(url):
|
||||||
base = "/var/cache/scrapeit/cache/"
|
path = os.path.join(cache_base, url.replace('http://',''))
|
||||||
path = os.path.join(base, url.replace('http://',''))
|
|
||||||
if path.endswith('/'):
|
if path.endswith('/'):
|
||||||
path = "%sindex.html" % path
|
path = "%sindex.html" % path
|
||||||
if os.path.isdir(path):
|
if os.path.isdir(path):
|
||||||
|
|
Loading…
Reference in a new issue