# -*- Mode: Python; -*- # -*- coding: utf-8 -*- # vi:si:et:sw=2:sts=2:ts=2
import urllib2
from urllib import quote
import re
import os
import time

import utils
import chardet

# Where cached copies of fetched pages live, and how long a copy stays fresh.
cache_base = "/var/cache/scrapeit/cache/"
cache_timeout = 30*24*60*60  # 30 days


def read_url(url):
  """Return the raw body of *url*, using a 30-day on-disk cache.

  The cache path mirrors the URL under cache_base ('http://' stripped).
  A cached copy younger than cache_timeout is returned directly; otherwise
  the page is fetched via utils.read_url() and written back to the cache.
  Returns the undecoded byte string.

  NOTE(review): the URL is spliced into a filesystem path unsanitized --
  a URL containing '..' segments could escape cache_base; confirm callers
  only pass trusted URLs.
  """
  cache_file = os.path.join(cache_base, url.replace('http://', ''))
  # Directory-like URLs map onto an index.html inside that directory.
  if cache_file.endswith('/'):
    cache_file = "%sindex.html" % cache_file
  if os.path.isdir(cache_file):
    cache_file = os.path.join(cache_file, "index.html")
  if os.path.exists(cache_file):
    ctime = os.stat(cache_file).st_ctime
    file_age = time.time() - ctime
    if file_age < cache_timeout:
      # Cache hit: binary mode so the bytes round-trip unchanged, and the
      # handle is closed even if read() raises.
      f = open(cache_file, 'rb')
      try:
        return f.read()
      finally:
        f.close()
  data = utils.read_url(url)
  folder = os.path.dirname(cache_file)
  if not os.path.exists(folder):
    try:
      os.makedirs(folder)
    except OSError:
      # Another process may have created the directory between the
      # existence check and the makedirs call; only re-raise if it is
      # genuinely still missing.
      if not os.path.isdir(folder):
        raise
  f = open(cache_file, 'wb')
  try:
    f.write(data)
  finally:
    f.close()
  return data


def read_url_utf8(url):
  """Return the body of *url* decoded to a unicode string.

  The encoding is guessed with chardet; when detection fails the data is
  decoded as latin-1, which accepts every byte value and so never raises.
  """
  data = read_url(url)
  encoding = chardet.detect(data)['encoding']
  if not encoding:
    encoding = 'latin-1'
  return unicode(data, encoding)