# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

# stdlib
import os
import re
import time
import urllib2
from urllib import quote

# third-party
import chardet

# local
import utils

# Directory under which fetched pages are cached on disk.
cache_base = "/var/cache/scrapeit/cache/"
# Maximum age of a cache entry, in seconds, before it is refetched.
cache_timeout = 30*24*60*60 # 30 days
def read_url(url):
  """Fetch *url*, caching the raw response under ``cache_base``.

  Returns the cached copy when it is younger than ``cache_timeout``
  seconds; otherwise fetches the page via ``utils.read_url`` and
  writes it to the cache before returning it.
  """
  # Map the URL onto a path inside the cache tree.
  cache_file = os.path.join(cache_base, url.replace('http://', ''))
  if cache_file.endswith('/'):
    cache_file = "%sindex.html" % cache_file
  if os.path.isdir(cache_file):
    cache_file = os.path.join(cache_file, "index.html")
  if os.path.exists(cache_file):
    ctime = os.stat(cache_file).st_ctime
    file_age = time.time() - ctime
    if file_age < cache_timeout:
      # Fresh enough: serve from cache. Binary mode returns the bytes
      # exactly as fetched (text mode would mangle them on Windows).
      with open(cache_file, 'rb') as f:
        return f.read()
  data = utils.read_url(url)
  folder = os.path.dirname(cache_file)
  # A concurrent fetch may create the directory between the existence
  # check and makedirs; tolerate "already exists", re-raise anything else.
  try:
    os.makedirs(folder)
  except OSError:
    if not os.path.isdir(folder):
      raise
  with open(cache_file, 'wb') as f:
    f.write(data)
  return data
def read_url_utf8(url):
  """Fetch *url* via read_url and decode the result to unicode.

  The character encoding is guessed with chardet; when detection
  fails, latin-1 is used as the fallback.
  """
  raw = read_url(url)
  guessed = chardet.detect(raw)['encoding'] or 'latin-1'
  return unicode(raw, guessed)