diff --git a/scrapeit/imdb.py b/scrapeit/imdb.py
index f23282c..ef59fb1 100644
--- a/scrapeit/imdb.py
+++ b/scrapeit/imdb.py
@@ -11,13 +11,49 @@ from elementtree.ElementTree import parse, tostring
 from BeautifulSoup import BeautifulSoup
 from google import google
 
-from utils import stripTags, read_url_utf8, htmldecode
+from utils import stripTags, htmldecode
 
 import utils
 
+cache_base = "/var/cache/scrapeit/cache/"
+
+def read_url_utf8(url):
+    """Return the page for url, reusing the on-disk copy when one exists.
+
+    The cache file is cache_base joined with url after every 'http://' has
+    been removed; a path that ends in '/' or names an existing directory is
+    mapped to the index.html inside it. When no cache file exists the data
+    comes from utils.read_url_utf8(url) and is written to the cache file
+    before it is returned.
+    """
+    path = os.path.join(cache_base, url.replace('http://',''))
+    if path.endswith('/'):
+        path = "%sindex.html" % path
+    if os.path.isdir(path):
+        path = "%s/index.html" % path
+    if os.path.exists(path):
+        f = open(path)
+        try:
+            # try/finally: close the handle even when read() raises
+            data = f.read()
+        finally:
+            f.close()
+        return data
+    data = utils.read_url_utf8(url)
+    folder = os.path.dirname(path)
+    if not os.path.exists(folder):
+        os.makedirs(folder)
+    # NOTE(review): data is written unchanged; if utils.read_url_utf8 returns
+    # unicode rather than a byte string an explicit encode is needed - confirm
+    f = open(path, 'w')
+    try:
+        f.write(data)
+    finally:
+        f.close()
+    return data
+
 def read_url(url):
-    base = "/var/cache/scrapeit/cache/"
-    path = os.path.join(base, url.replace('http://',''))
+    path = os.path.join(cache_base, url.replace('http://',''))
     if path.endswith('/'):
         path = "%sindex.html" % path
     if os.path.isdir(path):