python-ox/ox/cache.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import gzip
import zlib
import hashlib
import os
import StringIO
import time
import urlparse
import urllib2
import sqlite3

import chardet
from ox.utils import json

import net
from net import DEFAULT_HEADERS, getEncoding

cache_timeout = 30*24*60*60 # default is 30 days
COMPRESS_TYPES = (
    'text/html',
    'text/plain',
    'text/xml',
    'application/xhtml+xml',
    'application/x-javascript',
    'application/javascript',
    'application/ecmascript',
    'application/rss+xml'
)

def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
    '''
    >>> status('http://google.com')
    200
    >>> status('http://google.com/mysearch')
    404
    '''
    # pass timeout through so callers can control cache freshness
    headers = getHeaders(url, data, headers, timeout)
    return int(headers['status'])

def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
    '''
    >>> exists('http://google.com')
    True
    >>> exists('http://google.com/mysearch')
    False
    '''
    s = status(url, data, headers, timeout)
    return 200 <= s < 400

def getHeaders(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
    url_headers = _readUrlCache(url, data, headers, timeout, "headers")
    if url_headers:
        url_headers = json.loads(url_headers)
    else:
        url_headers = net.getHeaders(url, data, headers)
        # data == -1 marks this as a headers-only cache entry
        _saveUrlCache(url, data, -1, url_headers)
    return url_headers

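# Usage note: getHeaders() returns the response headers as a dict that
# includes a 'status' entry (see status() above); results are served from
# the cache when a fresh enough headers-only row exists.
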
class InvalidResult(Exception):
    """Raised when a fetched result fails the caller-supplied validity check."""
    def __init__(self, result, headers):
        self.result = result
        self.headers = headers

def readUrl(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None):
    '''
    url     - url to load
    data    - optional post data
    headers - headers to send with the request
    timeout - get from cache if the cached entry is not older than the given
              number of seconds; -1 to always use the cache, 0 to bypass it
    valid   - function to check if the result is ok; it is passed the result
              and headers. If it returns False, InvalidResult is raised;
              deal with it in your code.
    '''
    #FIXME: send last-modified / etag from cache and only update if needed
    if isinstance(url, unicode):
        url = url.encode('utf-8')
    result = _readUrlCache(url, data, headers, timeout)
    if not result:
        try:
            url_headers, result = net.readUrl(url, data, headers, returnHeaders=True)
        except urllib2.HTTPError, e:
            # cache error responses too, with their status and body
            e.headers['Status'] = "%s" % e.code
            url_headers = dict(e.headers)
            result = e.read()
            if url_headers.get('content-encoding', None) == 'gzip':
                result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
        if not valid or valid(result, url_headers):
            _saveUrlCache(url, data, result, url_headers)
        else:
            raise InvalidResult(result, url_headers)
    return result

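# A minimal usage sketch (assumes network access; `is_html` is a
# hypothetical `valid` callback, not part of this module):
#
#     def is_html(result, headers):
#         # accept only responses the server labels as HTML
#         return headers.get('content-type', '').startswith('text/html')
#
#     try:
#         page = readUrl('http://example.com/', valid=is_html)
#     except InvalidResult, e:
#         print 'rejected:', e.headers.get('content-type')
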
def readUrlUnicode(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, _readUrl=readUrl, valid=None):
    data = _readUrl(url, data, headers, timeout, valid)
    encoding = getEncoding(data)
    if not encoding:
        encoding = 'latin-1'
    return unicode(data, encoding)

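# e.g. readUrlUnicode('http://example.com/') returns a unicode object,
# decoded with the encoding detected by getEncoding(), falling back to
# latin-1 (which never fails on byte strings) when detection comes up empty.
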
def saveUrl(url, filename, overwrite=False):
    if not os.path.exists(filename) or overwrite:
        dirname = os.path.dirname(filename)
        # guard against an empty dirname for bare filenames
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)
        data = readUrl(url)
        # write in binary mode, the data may not be text
        f = open(filename, 'wb')
        f.write(data)
        f.close()

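# e.g. saveUrl('http://example.com/logo.png', '/tmp/demo/logo.png')
# fetches through the cache and writes the raw bytes to disk, creating
# /tmp/demo first if needed (url and path are illustrative).
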
def _getCacheBase():
    'cache base is either ~/.ox/cache or can be set via the environment variable oxCACHE'
    return os.environ.get('oxCACHE', os.path.expanduser('~/.ox/cache'))

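# e.g. running a script as
#     oxCACHE=/var/tmp/oxcache python myscript.py
# keeps cache.sqlite under /var/tmp/oxcache instead of ~/.ox/cache
# (path is illustrative).
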
def _getCacheDB():
    path = _getCacheBase()
    if not os.path.exists(path):
        os.makedirs(path)
    return os.path.join(path, "cache.sqlite")

def _connectDb():
    conn = sqlite3.connect(_getCacheDB(), timeout=10)
    conn.text_factory = str
    return conn

def _getSetting(c, key, default=None):
    c.execute('SELECT value FROM setting WHERE key = ?', (key, ))
    for row in c:
        return row[0]
    return default

def _setSetting(c, key, value):
    c.execute(u'INSERT OR REPLACE INTO setting values (?, ?)', (key, str(value)))

def _createDb(conn, c):
    # Create table and indexes
    c.execute('''CREATE TABLE IF NOT EXISTS cache (url_hash varchar(42) unique, domain text, url text,
                 post_data text, headers text, created int, data blob, only_headers int)''')
    c.execute('''CREATE INDEX IF NOT EXISTS cache_domain ON cache (domain)''')
    c.execute('''CREATE INDEX IF NOT EXISTS cache_url ON cache (url)''')
    c.execute('''CREATE INDEX IF NOT EXISTS cache_url_hash ON cache (url_hash)''')
    c.execute('''CREATE TABLE IF NOT EXISTS setting (key varchar(1024) unique, value text)''')
    # schema version 1 adds the `compressed` column
    if int(_getSetting(c, 'version', 0)) < 1:
        _setSetting(c, 'version', 1)
        c.execute('''ALTER TABLE cache ADD compressed INT DEFAULT 0''')
        conn.commit()

def _readUrlCache(url, data, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
    r = None
    if timeout == 0:
        return r
    if data:
        url_hash = hashlib.sha1(url + '?' + data).hexdigest()
    else:
        url_hash = hashlib.sha1(url).hexdigest()

    conn = _connectDb()
    c = conn.cursor()
    _createDb(conn, c)

    sql = 'SELECT %s, compressed FROM cache WHERE url_hash=?' % value
    if timeout > 0:
        now = time.mktime(time.localtime())
        t = (url_hash, now-timeout)
        sql += ' AND created > ?'
    else:
        t = (url_hash, )
    if value != "headers":
        sql += ' AND only_headers != 1 '
    c.execute(sql, t)
    for row in c:
        r = row[0]
        if value == 'data':
            if row[1] == 1:
                r = zlib.decompress(r)
            else:
                r = str(r)
        break
    c.close()
    conn.close()
    return r
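
# timeout semantics of _readUrlCache, for reference:
#     timeout == 0 -> never read from the cache
#     timeout  > 0 -> return a cached row only if younger than timeout seconds
#     timeout  < 0 -> return any cached row, regardless of age
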
def _saveUrlCache(url, post_data, data, headers):
    if post_data:
        url_hash = hashlib.sha1(url + '?' + post_data).hexdigest()
    else:
        url_hash = hashlib.sha1(url).hexdigest()
    domain = ".".join(urlparse.urlparse(url)[1].split('.')[-2:])

    conn = _connectDb()
    c = conn.cursor()
    # Create table if it does not exist
    _createDb(conn, c)

    # Insert a row of data
    if not post_data: post_data = ""
    only_headers = 0
    if data == -1:
        only_headers = 1
        data = ""
    created = time.mktime(time.localtime())
    # compress only known text types, binary data is stored as-is
    content_type = headers.get('content-type', '').split(';')[0].strip()
    if content_type in COMPRESS_TYPES:
        compressed = 1
        data = zlib.compress(data)
    else:
        compressed = 0
    data = sqlite3.Binary(data)
    t = (url_hash, domain, url, post_data, json.dumps(headers), created, data, only_headers, compressed)
    c.execute(u"""INSERT OR REPLACE INTO cache values (?, ?, ?, ?, ?, ?, ?, ?, ?)""", t)

    # Save (commit) the changes and clean up
    conn.commit()
    c.close()
    conn.close()

def migrate_to_db():
    import re
    import os
    import sqlite3
    import glob

    conn = _connectDb()
    c = conn.cursor()
    _createDb(conn, c)

    files = glob.glob(_getCacheBase() + "/*/*/*/*/*")
    _files = filter(lambda x: not x.endswith(".headers"), files)

    for f in _files:
        info = re.compile("%s/(.*?)/../../../(.*)" % _getCacheBase()).findall(f)
        domain = url = info[0][0]
        url_hash = info[0][1]
        post_data = ""
        created = os.stat(f).st_ctime
        fd = open(f, "r")
        data = fd.read()
        fd.close()
        fd = open(f + ".headers", "r")
        headers = fd.read()
        fd.close()
        # legacy entries are stored uncompressed (only_headers = 0, compressed = 0)
        t = (url_hash, domain, url, post_data, headers, created, sqlite3.Binary(data), 0, 0)
        c.execute(u"""INSERT OR REPLACE INTO cache values (?, ?, ?, ?, ?, ?, ?, ?, ?)""", t)

    conn.commit()
    c.close()
    conn.close()

def compress_db():
    conn = _connectDb()
    c = conn.cursor()
    _createDb(conn, c)
    c.execute(u"""SELECT url_hash FROM cache WHERE compressed = 0""")
    ids = [row[0] for row in c]
    for url_hash in ids:
        c.execute(u"""SELECT headers, data FROM cache WHERE url_hash = ?""", (url_hash, ))
        headers = {}
        for row in c:
            headers = json.loads(row[0])
            data = row[1]
        content_type = headers.get('content-type', '').split(';')[0].strip()
        if content_type in COMPRESS_TYPES:
            data = zlib.compress(data)
            t = (sqlite3.Binary(data), url_hash)
            print url_hash, 'update'
            c.execute('UPDATE cache SET compressed = 1, data = ? WHERE url_hash = ?', t)
    conn.commit()
    print "optimizing database"
    c.execute('VACUUM')
    conn.commit()
    c.close()
    conn.close()
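
# A minimal smoke test, as a sketch (hits the network; url illustrative):
#
#     if __name__ == '__main__':
#         print status('http://example.com/')
#         print len(readUrl('http://example.com/'))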