# scrapeit/scrapeit/cache.py
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import urllib2
from urllib import quote
import re, time
import os
import time
import utils
import chardet
# Root directory under which fetched pages are mirrored on disk.
cache_base = "/var/cache/scrapeit/cache/"
# Cached files older than this (seconds) are considered stale and re-fetched.
cache_timeout = 30*24*60*60 # 30 days
def read_url(url):
  """Fetch *url*, caching the response body on disk under ``cache_base``.

  The cache path mirrors the URL (with the ``http://`` prefix stripped);
  directory-style URLs map to an ``index.html`` file.  A cached copy
  younger than ``cache_timeout`` seconds is returned without hitting the
  network; otherwise the page is re-fetched via ``utils.read_url`` and
  the cache entry is (re)written.

  NOTE(review): the URL is used as a filesystem path verbatim — query
  strings or ``..`` components may produce odd paths; verify callers
  only pass plain http URLs.
  """
  cache_file = os.path.join(cache_base, url.replace('http://', ''))
  if cache_file.endswith('/'):
    cache_file = "%sindex.html" % cache_file
  if os.path.isdir(cache_file):
    cache_file = os.path.join(cache_file, "index.html")
  if os.path.exists(cache_file):
    ctime = os.stat(cache_file).st_ctime
    # time.time() is the epoch "now" directly (the original took the
    # long way round via mktime(localtime())).
    file_age = time.time() - ctime
    if file_age < cache_timeout:
      # Fresh enough: serve from cache.  Binary mode so non-text
      # payloads survive intact; `with` guarantees the handle is
      # closed even if read() raises.
      with open(cache_file, 'rb') as f:
        return f.read()
  # Cache miss or stale entry: fetch and store.
  data = utils.read_url(url)
  folder = os.path.dirname(cache_file)
  if not os.path.exists(folder):
    os.makedirs(folder)
  with open(cache_file, 'wb') as f:
    f.write(data)
  return data
def read_url_utf8(url):
  """Fetch *url* through the cache and return it decoded as unicode.

  The encoding is guessed with chardet; when detection fails the data
  is decoded as latin-1, which accepts any byte sequence.
  """
  raw = read_url(url)
  guessed = chardet.detect(raw)['encoding']
  return unicode(raw, guessed or 'latin-1')