# scrapeit/scrapeit/cache.py
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import urllib2
from urllib import quote
import re, time
import os
import time
import utils
import chardet
# Root directory under which fetched pages are mirrored on disk.
cache_base = "/var/cache/scrapeit/cache/"
# Cached files older than this (seconds) are considered stale and re-fetched.
cache_timeout = 30*24*60*60 # 30 days
def read_url(url):
  """Fetch *url*, caching the response body on disk under ``cache_base``.

  The cache path mirrors the URL (with the ``http://`` prefix stripped);
  directory-style URLs map to an ``index.html`` file.  A cached copy
  younger than ``cache_timeout`` seconds is returned without hitting the
  network; otherwise the page is re-fetched via ``utils.read_url`` and
  the cache entry is (re)written.

  NOTE(review): the URL is used as a filesystem path verbatim — query
  strings or ``..`` components may produce odd paths; verify callers
  only pass plain http URLs.
  """
  cache_file = os.path.join(cache_base, url.replace('http://', ''))
  if cache_file.endswith('/'):
    cache_file = "%sindex.html" % cache_file
  if os.path.isdir(cache_file):
    cache_file = os.path.join(cache_file, "index.html")
  if os.path.exists(cache_file):
    ctime = os.stat(cache_file).st_ctime
    # time.time() is the epoch "now" directly (the original took the
    # long way round via mktime(localtime())).
    file_age = time.time() - ctime
    if file_age < cache_timeout:
      # Fresh enough: serve from cache.  Binary mode so non-text
      # payloads survive intact; `with` guarantees the handle is
      # closed even if read() raises.
      with open(cache_file, 'rb') as f:
        return f.read()
  # Cache miss or stale entry: fetch and store.
  data = utils.read_url(url)
  folder = os.path.dirname(cache_file)
  if not os.path.exists(folder):
    os.makedirs(folder)
  with open(cache_file, 'wb') as f:
    f.write(data)
  return data
def read_url_utf8(url):
  """Fetch *url* through the cache and return it decoded as unicode.

  The encoding is guessed with chardet; when detection fails the data
  is decoded as latin-1, which accepts any byte sequence.
  """
  raw = read_url(url)
  guessed = chardet.detect(raw)['encoding']
  return unicode(raw, guessed or 'latin-1')