python-ox/ox/cache.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2011
from __future__ import with_statement, print_function

import gzip
import zlib
import hashlib
import os
from six import BytesIO
import time
from six.moves import urllib
from six import PY2
import sqlite3

from .utils import json
from .file import makedirs

from . import net
from .net import DEFAULT_HEADERS, detect_encoding

# cached entries count as fresh for 30 days by default (value is in seconds)
cache_timeout = 60 * 60 * 24 * 30

# responses with one of these content types are zlib-compressed before
# they are written to the cache
COMPRESS_TYPES = (
    'text/html', 'text/plain', 'text/xml',
    'application/json',
    'application/xhtml+xml',
    'application/x-javascript', 'application/javascript',
    'application/ecmascript',
    'application/rss+xml',
)

def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
    '''
      Return the HTTP status code of url as an int.

      url     - url to check
      data    - possible post data
      headers - headers to send with request
      timeout - accept cached headers not older than given seconds

      The headers are obtained via get_headers(), so they may come from
      the cache.

      >>> status('http://google.com')
      200
      >>> status('http://google.com/mysearch')
      404
    '''
    # forward timeout; it used to be accepted here but silently dropped, so
    # get_headers() always fell back to its own default
    url_headers = get_headers(url, data, headers, timeout)
    return int(url_headers['status'])

def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
    '''
      Return True if the status code of url is in the 2xx/3xx range,
      False otherwise.

      >>> exists('http://google.com')
      True
      >>> exists('http://google.com/mysearch')
      False
    '''
    code = status(url, data, headers, timeout)
    return 200 <= code < 400

def get_headers(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
    '''
      Return the response headers for url.

      Cached headers are used when the store has some; otherwise they are
      requested via net.get_headers() and stored as a headers-only entry
      (data is passed to the store as -1).
    '''
    cached = store.get(url, data, headers, timeout, "headers")
    if cached:
        return cached
    fetched = net.get_headers(url, data, headers)
    store.set(url, data, -1, fetched)
    return fetched

class InvalidResult(Exception):
    """Raised by read_url when the ``valid`` callback rejects a response.

    The rejected body is available as ``result`` and the response headers
    as ``headers``.
    """
    def __init__(self, result, headers):
        self.result, self.headers = result, headers

def _fix_unicode_url(url):
    if not isinstance(url, bytes):
        url = url.encode('utf-8')
    return url

def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None, unicode=False):
    '''
        Return the body of url, from the cache if possible.

        url     - url to load
        data    - possible post data
        headers - headers to send with request
        timeout - get from cache if cache not older than given seconds, -1 to get from cache
        valid   - function to check if result is ok, its passed result and headers
                  if this function fails, InvalidResult will be raised deal with it in your code
        unicode - if True, decode the body before returning it; the charset
                  from the content-type header is used when present,
                  otherwise detect_encoding(), otherwise latin-1

        HTTP error responses are not raised: their body and headers are
        treated like a regular response (and cached if valid accepts them).
    '''
    if net.DEBUG:
        print('ox.cache.read_url', url)
    #FIXME: send last-modified / etag from cache and only update if needed
    #url = _fix_unicode_url(url)
    result = store.get(url, data, headers, timeout)
    url_headers = {}
    # note: an empty cached body is falsy and therefore treated as a miss
    if not result:
        try:
            url_headers, result = net.read_url(url, data, headers, return_headers=True)
        except urllib.error.HTTPError as e:
            e.headers['Status'] = "%s" % e.code
            for key in e.headers:
                url_headers[key.lower()] = e.headers[key]
            result = e.read()
            # error pages can be gzipped too
            if url_headers.get('content-encoding', None) == 'gzip':
                result = gzip.GzipFile(fileobj=BytesIO(result)).read()
        if not valid or valid(result, url_headers):
            store.set(url, post_data=data, data=result, headers=url_headers)
        else:
            raise InvalidResult(result, url_headers)
    elif unicode:
        # cache hit: load the stored headers as well, otherwise the charset
        # announced by the server would be ignored when decoding below
        url_headers = store.get(url, data, headers, -1, "headers") or {}
    if unicode:
        ctype = url_headers.get('content-type', '').lower()
        encoding = None
        if 'charset=' in ctype:
            # keep only the charset value: drop trailing parameters,
            # surrounding whitespace and optional quotes
            encoding = ctype.split('charset=')[-1].split(';')[0].strip().strip('"\'')
        if not encoding:
            encoding = detect_encoding(result)
        if not encoding:
            encoding = 'latin-1'
        result = result.decode(encoding)
    return result

def save_url(url, filename, overwrite=False):
    '''
        Download url (via read_url, so the cache is used) and write it to
        filename.

        url       - url to load
        filename  - target file, missing parent directories are created
        overwrite - if False (default) an existing file is left untouched
                    and nothing is downloaded
    '''
    if not os.path.exists(filename) or overwrite:
        dirname = os.path.dirname(filename)
        # dirname is '' for a bare filename; os.makedirs('') would raise
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)
        data = read_url(url)
        with open(filename, 'wb') as f:
            f.write(data)

def cache_path():
    '''
        Return the cache location: the value of the oxCACHE environment
        variable if it is set, ~/.ox/cache (expanded) otherwise.
    '''
    fallback = os.path.expanduser('~/.ox/cache')
    return os.environ.get('oxCACHE', fallback)

class Cache:
    '''
        Cache backend that stores nothing: get() and set() are no-ops
        that return None.
    '''
    def __init__(self):
        return None

    def get(self, url, data, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
        '''
            if value == 'data' return data of url if its in the cache else None
            if value == 'headers' return headers for url
        '''
        return None

    def set(self, url, post_data, data, headers):
        '''
            store data and headers for url (and optional post_data)
        '''
        return None

class SQLiteCache(Cache):
    '''
        Cache backend that keeps all entries in one SQLite database,
        cache.sqlite inside the directory returned by cache_path().

        Every operation opens its own connection and closes it again
        when it is done.
    '''

    # columns of the cache table that get() may be asked for
    _COLUMNS = ('url_hash', 'domain', 'url', 'post_data', 'headers',
                'created', 'data', 'only_headers', 'compressed')

    def __init__(self):
        path = cache_path()
        if not os.path.exists(path):
            os.makedirs(path)
        self.db = os.path.join(path, "cache.sqlite")
        self.create()

    def connect(self):
        '''
            open a new connection to the database, remember it as
            self.conn and return it
        '''
        self.conn = sqlite3.connect(self.db, timeout=10)
        return self.conn

    @staticmethod
    def _url_hash(url, post_data):
        '''
            return the sha1 hex digest identifying url (plus post data, if any)
        '''
        key = url + '?' + post_data if post_data else url
        return hashlib.sha1(key.encode('utf-8')).hexdigest()

    def create(self):
        '''
            create tables and indexes if they are missing and migrate the
            schema to version 1 (adds the compressed column)
        '''
        conn = self.connect()
        try:
            c = conn.cursor()
            # Create table and indexes
            c.execute('''CREATE TABLE IF NOT EXISTS cache (url_hash varchar(42) unique, domain text, url text,
                          post_data text, headers text, created int, data blob, only_headers int)''')
            c.execute('''CREATE INDEX IF NOT EXISTS cache_domain ON cache (domain)''')
            c.execute('''CREATE INDEX IF NOT EXISTS cache_url ON cache (url)''')
            c.execute('''CREATE INDEX IF NOT EXISTS cache_url_hash ON cache (url_hash)''')

            c.execute('''CREATE TABLE IF NOT EXISTS setting (key varchar(1024) unique, value text)''')
            if int(self.get_setting(c, 'version', 0)) < 1:
                self.set_setting(c, 'version', 1)
                c.execute('''ALTER TABLE cache ADD compressed INT DEFAULT 0''')
            conn.commit()
        finally:
            # the connection used to be left open here
            conn.close()

    def get_setting(self, c, key, default=None):
        '''
            return the value stored for key in the setting table, using
            cursor c, or default if there is no such key
        '''
        c.execute('SELECT value FROM setting WHERE key = ?', (key, ))
        row = c.fetchone()
        if row is None:
            return default
        return row[0]

    def set_setting(self, c, key, value):
        '''
            store str(value) for key in the setting table using cursor c;
            committing is left to the caller
        '''
        c.execute(u'INSERT OR REPLACE INTO setting values (?, ?)', (key, str(value)))

    def get(self, url, data=None, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
        '''
            url     - url to look up
            data    - possible post data, part of the cache key
            headers - not used by this backend
            timeout - 0: never use the cache, > 0: only entries created
                      within the last timeout seconds, < 0: any age
            value   - 'data' returns the (decompressed) body,
                      'headers' returns the headers as parsed json

            returns None if there is no matching entry; raises ValueError
            if value is not a column of the cache table
        '''
        if timeout == 0:
            return None
        # value is interpolated into the statement below, so only accept
        # known column names
        if value not in self._COLUMNS:
            raise ValueError('unknown cache column: %r' % (value, ))

        sql = 'SELECT %s, compressed FROM cache WHERE url_hash=?' % value
        params = [self._url_hash(url, data)]
        if timeout > 0:
            sql += ' AND created > ?'
            params.append(time.mktime(time.localtime()) - timeout)
        if value != "headers":
            # entries written for headers only carry no usable body
            sql += ' AND only_headers != 1 '

        r = None
        conn = self.connect()
        try:
            c = conn.cursor()
            c.execute(sql, tuple(params))
            row = c.fetchone()
            if row is not None:
                r = row[0]
                if value == 'headers':
                    r = json.loads(r)
                elif value == 'data':
                    if row[1] == 1:
                        r = zlib.decompress(r)
                    elif PY2:
                        r = str(r)
            c.close()
        finally:
            # close the connection even if the query or decoding failed
            conn.close()
        return r

    def set(self, url, post_data, data, headers):
        '''
            url       - url the entry belongs to
            post_data - possible post data, part of the cache key
            data      - response body, or -1 to store a headers-only entry
            headers   - response headers, stored as json

            an existing entry with the same key is replaced
        '''
        url_hash = self._url_hash(url, post_data)
        domain = ".".join(urllib.parse.urlparse(url)[1].split('.')[-2:])

        if not post_data:
            post_data = ""
        only_headers = 0
        if data == -1:
            only_headers = 1
            # must be bytes: zlib.compress and sqlite3.Binary reject str on
            # python 3
            data = b""
        elif not isinstance(data, bytes):
            data = data.encode('utf-8')
        created = time.mktime(time.localtime())
        content_type = headers.get('content-type', '').split(';')[0].strip()
        if content_type in COMPRESS_TYPES:
            compressed = 1
            data = zlib.compress(data)
        else:
            compressed = 0
        data = sqlite3.Binary(data)

        try:
            _headers = json.dumps(headers)
        except (TypeError, ValueError):
            # broken headers: decode byte string values to text and retry;
            # values that are text already are left alone and the headers
            # passed in by the caller are no longer modified in place
            headers = dict(
                (k, v.decode(detect_encoding(v) or 'latin-1')
                    if isinstance(v, bytes) else v)
                for k, v in headers.items()
            )
            _headers = json.dumps(headers)
        t = (url_hash, domain, url, post_data, _headers, created,
             data, only_headers, compressed)

        conn = self.connect()
        try:
            c = conn.cursor()
            # Insert a row of data
            c.execute(u"""INSERT OR REPLACE INTO cache values (?, ?, ?, ?, ?, ?, ?, ?, ?)""", t)
            # Save (commit) the changes and clean up
            conn.commit()
            c.close()
        finally:
            conn.close()

class FileCache(Cache):
    '''
        Cache backend that stores each entry as two files below a root
        folder: <hash>.json with the metadata (headers, creation time,
        flags) and <hash>.dat with the body.

        The root folder is whatever follows the first ':' in cache_path(),
        e.g. fs:/path/to/cache
    '''
    def __init__(self):
        # split on the first ':' only, the root itself may contain colons
        scheme, sep, root = cache_path().partition(':')
        if not sep:
            raise ValueError('cache path must look like fs:<root>')
        self.root = root

    @staticmethod
    def _url_hash(url, post_data):
        '''
            return the sha1 hex digest identifying url (plus post data, if any)
        '''
        key = url + '?' + post_data if post_data else url
        return hashlib.sha1(key.encode('utf-8')).hexdigest()

    def files(self, domain, h):
        '''
            return (folder, info file, data file) for the entry with hash h
            of the given domain; the first 8 characters of the hash are
            used as four nested two-character folders
        '''
        prefix = os.path.join(self.root, domain, h[:2], h[2:4], h[4:6], h[6:8])
        i = os.path.join(prefix, '%s.json'%h)
        f = os.path.join(prefix, '%s.dat'%h)
        return prefix, i, f

    def get(self, url, data=None, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
        '''
            url     - url to look up
            data    - possible post data, part of the cache key
            headers - not used by this backend
            timeout - 0: never use the cache, > 0: only entries created
                      within the last timeout seconds, < 0: any age
            value   - 'headers' returns the stored headers, anything else
                      the (decompressed) body

            returns None if there is no usable entry
        '''
        if timeout == 0:
            return None

        url_hash = self._url_hash(url, data)
        domain = ".".join(urllib.parse.urlparse(url)[1].split('.')[-2:])
        prefix, i, f = self.files(domain, url_hash)
        if not os.path.exists(i):
            return None
        with open(i) as _i:
            try:
                info = json.load(_i)
            except (ValueError, IOError):
                # ignore the entry if the info file can not be loaded
                return None

        if value != 'headers' and info['only_headers']:
            return None
        if timeout > 0:
            expired = time.mktime(time.localtime()) - timeout
            if info['created'] <= expired:
                return None
        if value == 'headers':
            return info['headers']
        with open(f, 'rb') as _f:
            r = _f.read()
        if info['compressed']:
            r = zlib.decompress(r)
        return r

    def set(self, url, post_data, data, headers):
        '''
            url       - url the entry belongs to
            post_data - possible post data, part of the cache key
            data      - response body, or -1 to store a headers-only entry
            headers   - response headers, stored in the info file

            the data file is only written if data is not -1, the info
            file is always (re)written
        '''
        url_hash = self._url_hash(url, post_data)
        domain = ".".join(urllib.parse.urlparse(url)[1].split('.')[-2:])
        prefix, i, f = self.files(domain, url_hash)
        makedirs(prefix)

        created = time.mktime(time.localtime())
        content_type = headers.get('content-type', '').split(';')[0].strip()

        info = {
            'compressed': content_type in COMPRESS_TYPES,
            'only_headers': data == -1,
            'created': created,
            'headers': headers,
            'url': url,
        }
        if post_data:
            info['post_data'] = post_data
        if not info['only_headers']:
            # the data file is binary: encode text first; bytes must be
            # left alone (they have no encode() on python 3)
            if not isinstance(data, bytes):
                data = data.encode('utf-8')
            if info['compressed']:
                data = zlib.compress(data)
            with open(f, 'wb') as _f:
                _f.write(data)
        # json.dump writes text, which a file opened as 'wb' rejects on
        # python 3
        with open(i, 'w') as _i:
            json.dump(info, _i)

# module wide cache backend: a cache path starting with 'fs:' selects the
# file based cache, everything else the SQLite cache
store = FileCache() if cache_path().startswith('fs:') else SQLiteCache()
add some functions 2008-04-27 16:54:37 +00:00			`# -*- coding: utf-8 -*-`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`# vi:si:et:sw=4:sts=4:ts=4`
add file cache 2011-11-01 12:55:49 +00:00			`# GPL 2011`
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`from __future__ import with_statement, print_function`
add file cache 2011-11-01 12:55:49 +00:00
allow custom getUrl to be passed to getUrlUnicode, error pages can be gziped too 2008-05-04 14:08:43 +00:00			`import gzip`
compress cache for html,text,css,js 2011-03-28 18:27:41 +00:00			`import zlib`
use hashlib instead of sha 2009-03-16 17:15:14 +00:00			`import hashlib`
add some functions 2008-04-27 16:54:37 +00:00			`import os`
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`from six import BytesIO`
add some functions 2008-04-27 16:54:37 +00:00			`import time`
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`from six.moves import urllib`
more python3 cleanups 2014-10-02 08:28:22 +00:00			`from six import PY2`
use sqlite for cache 2009-08-20 17:27:11 +00:00			`import sqlite3`
add some functions 2008-04-27 16:54:37 +00:00
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`from .utils import json`
add file cache 2011-11-01 12:55:49 +00:00			`from .file import makedirs`
add htmldecode, trimString, import missing chardet 2008-04-28 09:50:34 +00:00
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`from . import net`
			`from .net import DEFAULT_HEADERS, detect_encoding`
add some functions 2008-04-27 16:54:37 +00:00
_cache_timeout is public 2008-07-06 15:21:27 +00:00			`cache_timeout = 30*24*60*60 # default is 30 days`
add some functions 2008-04-27 16:54:37 +00:00
compress cache for html,text,css,js 2011-03-28 18:27:41 +00:00			`COMPRESS_TYPES = (`
			`'text/html',`
			`'text/plain',`
			`'text/xml',`
more python3 cleanups 2014-10-02 08:28:22 +00:00			`'application/json',`
compress cache for html,text,css,js 2011-03-28 18:27:41 +00:00			`'application/xhtml+xml',`
			`'application/x-javascript',`
			`'application/javascript',`
			`'application/ecmascript',`
			`'application/rss+xml'`
			`)`
use sqlite for cache 2009-08-20 17:27:11 +00:00
_cache_timeout is public 2008-07-06 15:21:27 +00:00			`def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`'''`
			`>>> status('http://google.com')`
			`200`
			`>>> status('http://google.com/mysearch')`
			`404`
			`'''`
net/cache readUrl->read_url / Unicode -> unicode=True format replace all CammelCase with under_score 2012-08-14 13:58:05 +00:00			`headers = get_headers(url, data, headers)`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`return int(headers['status'])`
add status, exists and getHeaders to net and cache, fix timeout bug in cache, now timeouts other then default actually work 2008-04-30 12:43:14 +00:00
_cache_timeout is public 2008-07-06 15:21:27 +00:00			`def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`'''`
			`>>> exists('http://google.com')`
			`True`
			`>>> exists('http://google.com/mysearch')`
			`False`
			`'''`
			`s = status(url, data, headers, timeout)`
			`if s >= 200 and s < 400:`
			`return True`
			`return False`
add status, exists and getHeaders to net and cache, fix timeout bug in cache, now timeouts other then default actually work 2008-04-30 12:43:14 +00:00
net/cache readUrl->read_url / Unicode -> unicode=True format replace all CammelCase with under_score 2012-08-14 13:58:05 +00:00			`def get_headers(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):`
add file cache 2011-11-01 12:55:49 +00:00			`url_headers = store.get(url, data, headers, timeout, "headers")`
			`if not url_headers:`
net/cache readUrl->read_url / Unicode -> unicode=True format replace all CammelCase with under_score 2012-08-14 13:58:05 +00:00			`url_headers = net.get_headers(url, data, headers)`
add file cache 2011-11-01 12:55:49 +00:00			`store.set(url, data, -1, url_headers)`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`return url_headers`
add some functions 2008-04-27 16:54:37 +00:00
add possible validation option to getUrl 2009-06-01 13:11:40 +00:00			`class InvalidResult(Exception):`
			`"""Base class for exceptions in this module."""`
			`def __init__(self, result, headers):`
			`self.result = result`
			`self.headers = headers`

fix ox.cache.read_url 2012-08-17 20:20:35 +00:00			`def _fix_unicode_url(url):`
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`if not isinstance(url, bytes):`
fix ox.cache.read_url 2012-08-17 20:20:35 +00:00			`url = url.encode('utf-8')`
			`return url`

net/cache readUrl->read_url / Unicode -> unicode=True format replace all CammelCase with under_score 2012-08-14 13:58:05 +00:00			`def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None, unicode=False):`
add possible validation option to getUrl 2009-06-01 13:11:40 +00:00			`'''`
			`url - url to load`
			`data - possible post data`
			`headers - headers to send with request`
			`timeout - get from cache if cache not older than given seconds, -1 to get from cache`
			`valid - function to check if result is ok, its passed result and headers`
			`if this function fails, InvalidResult will be raised deal with it in your code`
			`'''`
add read_url debug output 2012-08-21 06:41:25 +00:00			`if net.DEBUG:`
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`print('ox.cache.read_url', url)`
add possible validation option to getUrl 2009-06-01 13:11:40 +00:00			`#FIXME: send last-modified / etag from cache and only update if needed`
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`#url = _fix_unicode_url(url)`
fix ox.cache.read_url 2012-08-17 20:20:35 +00:00			`result = store.get(url, data, headers, timeout)`
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`url_headers = {}`
fix ox.cache.read_url 2012-08-17 20:20:35 +00:00			`if not result:`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`try:`
fix ox.cache.read_url 2012-08-17 20:20:35 +00:00			`url_headers, result = net.read_url(url, data, headers, return_headers=True)`
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`except urllib.error.HTTPError as e:`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`e.headers['Status'] = "%s" % e.code`
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`for key in e.headers:`
			`url_headers[key.lower()] = e.headers[key]`
fix ox.cache.read_url 2012-08-17 20:20:35 +00:00			`result = e.read()`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`if url_headers.get('content-encoding', None) == 'gzip':`
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`result = gzip.GzipFile(fileobj=BytesIO(result)).read()`
fix ox.cache.read_url 2012-08-17 20:20:35 +00:00			`if not valid or valid(result, url_headers):`
			`store.set(url, post_data=data, data=result, headers=url_headers)`
add possible validation option to getUrl 2009-06-01 13:11:40 +00:00			`else:`
fix ox.cache.read_url 2012-08-17 20:20:35 +00:00			`raise InvalidResult(result, url_headers)`
net/cache readUrl->read_url / Unicode -> unicode=True format replace all CammelCase with under_score 2012-08-14 13:58:05 +00:00			`if unicode:`
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`ctype = url_headers.get('content-type', '').lower()`
			`if 'charset' in ctype:`
			`encoding = ctype.split('charset=')[-1]`
			`else:`
			`encoding = detect_encoding(result)`
net/cache readUrl->read_url / Unicode -> unicode=True format replace all CammelCase with under_score 2012-08-14 13:58:05 +00:00			`if not encoding:`
			`encoding = 'latin-1'`
fix ox.cache.read_url 2012-08-17 20:20:35 +00:00			`result = result.decode(encoding)`
			`return result`
net/cache readUrl->read_url / Unicode -> unicode=True format replace all CammelCase with under_score 2012-08-14 13:58:05 +00:00
			`def save_url(url, filename, overwrite=False):`
also include saveUrl in cache 2009-11-16 22:43:17 +00:00			`if not os.path.exists(filename) or overwrite:`
			`dirname = os.path.dirname(filename)`
			`if not os.path.exists(dirname):`
			`os.makedirs(dirname)`
convert some left over readUrls to read_url 2012-09-06 11:25:57 +00:00			`data = read_url(url)`
more file open py2/3 cleanups 2014-10-02 08:34:04 +00:00			`with open(filename, 'wb') as f:`
			`f.write(data)`
also include saveUrl in cache 2009-11-16 22:43:17 +00:00
add file cache 2011-11-01 12:55:49 +00:00			`def cache_path():`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`return os.environ.get('oxCACHE', os.path.expanduser('~/.ox/cache'))`
add some functions 2008-04-27 16:54:37 +00:00
add file cache 2011-11-01 12:55:49 +00:00			`class Cache:`
			`def __init__(self):`
			`pass`

			`def get(self, url, data, headers=DEFAULT_HEADERS, timeout=-1, value="data"):`
			`'''`
			`if value == 'data' return data of url if its in the cache else None`
			`if value == 'headers' return headers for url`
			`'''`
			`pass`

			`def set(self, url, post_data, data, headers):`
			`pass`

			`class SQLiteCache(Cache):`
			`def __init__(self):`
			`path = cache_path()`
			`if not os.path.exists(path):`
			`os.makedirs(path)`
			`self.db = os.path.join(path, "cache.sqlite")`
work around thread issues with ox.cache 2014-05-17 09:25:19 +00:00			`self.create()`
add file cache 2011-11-01 12:55:49 +00:00
			`def connect(self):`
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`self.conn = sqlite3.connect(self.db, timeout=10)`
			`return self.conn`
add file cache 2011-11-01 12:55:49 +00:00
			`def create(self):`
work around thread issues with ox.cache 2014-05-17 09:25:19 +00:00			`conn = self.connect()`
			`c = conn.cursor()`
add file cache 2011-11-01 12:55:49 +00:00			`# Create table and indexes`
			`c.execute('''CREATE TABLE IF NOT EXISTS cache (url_hash varchar(42) unique, domain text, url text,`
			`post_data text, headers text, created int, data blob, only_headers int)''')`
			`c.execute('''CREATE INDEX IF NOT EXISTS cache_domain ON cache (domain)''')`
			`c.execute('''CREATE INDEX IF NOT EXISTS cache_url ON cache (url)''')`
			`c.execute('''CREATE INDEX IF NOT EXISTS cache_url_hash ON cache (url_hash)''')`

			`c.execute('''CREATE TABLE IF NOT EXISTS setting (key varchar(1024) unique, value text)''')`
			`if int(self.get_setting(c, 'version', 0)) < 1:`
			`self.set_setting(c, 'version', 1)`
			`c.execute('''ALTER TABLE cache ADD compressed INT DEFAULT 0''')`
work around thread issues with ox.cache 2014-05-17 09:25:19 +00:00			`conn.commit()`
add file cache 2011-11-01 12:55:49 +00:00
			`def get_setting(self, c, key, default=None):`
			`c.execute('SELECT value FROM setting WHERE key = ?', (key, ))`
			`for row in c:`
			`return row[0]`
			`return default`
use sqlite for cache 2009-08-20 17:27:11 +00:00
add file cache 2011-11-01 12:55:49 +00:00			`def set_setting(self, c, key, value):`
			`c.execute(u'INSERT OR REPLACE INTO setting values (?, ?)', (key, str(value)))`

			`def get(self, url, data={}, headers=DEFAULT_HEADERS, timeout=-1, value="data"):`
			`r = None`
			`if timeout == 0:`
			`return r`
			`if data:`
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`url_hash = hashlib.sha1((url + '?' + data).encode('utf-8')).hexdigest()`
add file cache 2011-11-01 12:55:49 +00:00			`else:`
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()`
add file cache 2011-11-01 12:55:49 +00:00
work around thread issues with ox.cache 2014-05-17 09:25:19 +00:00			`conn = self.connect()`
			`c = conn.cursor()`
add file cache 2011-11-01 12:55:49 +00:00			`sql = 'SELECT %s, compressed FROM cache WHERE url_hash=?' % value`
			`if timeout > 0:`
			`now = time.mktime(time.localtime())`
			`t = (url_hash, now-timeout)`
			`sql += ' AND created > ?'`
			`else:`
			`t = (url_hash, )`
			`if value != "headers":`
			`sql += ' AND only_headers != 1 '`
			`c.execute(sql, t)`
compress cache for html,text,css,js 2011-03-28 18:27:41 +00:00			`for row in c:`
add file cache 2011-11-01 12:55:49 +00:00			`r = row[0]`
			`if value == 'headers':`
			`r = json.loads(r)`
			`elif value == 'data':`
			`if row[1] == 1:`
			`r = zlib.decompress(r)`
more python3 cleanups 2014-10-02 08:28:22 +00:00			`elif PY2:`
add file cache 2011-11-01 12:55:49 +00:00			`r = str(r)`
			`break`

			`c.close()`
work around thread issues with ox.cache 2014-05-17 09:25:19 +00:00			`conn.close()`
add file cache 2011-11-01 12:55:49 +00:00			`return r`

			`def set(self, url, post_data, data, headers):`
			`if post_data:`
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`url_hash = hashlib.sha1((url + '?' + post_data).encode('utf-8')).hexdigest()`
add file cache 2011-11-01 12:55:49 +00:00			`else:`
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()`
add file cache 2011-11-01 12:55:49 +00:00
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`domain = ".".join(urllib.parse.urlparse(url)[1].split('.')[-2:])`
add file cache 2011-11-01 12:55:49 +00:00
work around thread issues with ox.cache 2014-05-17 09:25:19 +00:00			`conn = self.connect()`
			`c = conn.cursor()`
add file cache 2011-11-01 12:55:49 +00:00
			`# Insert a row of data`
			`if not post_data: post_data=""`
			`only_headers = 0`
			`if data == -1:`
			`only_headers = 1`
			`data = ""`
			`created = time.mktime(time.localtime())`
compress cache for html,text,css,js 2011-03-28 18:27:41 +00:00			`content_type = headers.get('content-type', '').split(';')[0].strip()`
			`if content_type in COMPRESS_TYPES:`
add file cache 2011-11-01 12:55:49 +00:00			`compressed = 1`
compress cache for html,text,css,js 2011-03-28 18:27:41 +00:00			`data = zlib.compress(data)`
add file cache 2011-11-01 12:55:49 +00:00			`else:`
			`compressed = 0`
			`data = sqlite3.Binary(data)`
handle broken headers 2014-04-23 13:38:38 +00:00
			`#fixme: this looks wrong`
			`try:`
			`_headers = json.dumps(headers)`
			`except:`
			`for h in headers:`
			`headers[h] = headers[h].decode(detect_encoding(headers[h]))`
			`_headers = json.dumps(headers)`
			`t = (url_hash, domain, url, post_data, _headers, created,`
add file cache 2011-11-01 12:55:49 +00:00			`data, only_headers, compressed)`
			`c.execute(u"""INSERT OR REPLACE INTO cache values (?, ?, ?, ?, ?, ?, ?, ?, ?)""", t)`

			`# Save (commit) the changes and clean up`
work around thread issues with ox.cache 2014-05-17 09:25:19 +00:00			`conn.commit()`
add file cache 2011-11-01 12:55:49 +00:00			`c.close()`
work around thread issues with ox.cache 2014-05-17 09:25:19 +00:00			`conn.close()`
add file cache 2011-11-01 12:55:49 +00:00
			`class FileCache(Cache):`
			`def __init__(self):`
			`f, self.root = cache_path().split(':')`

			`def files(self, domain, h):`
			`prefix = os.path.join(self.root, domain, h[:2], h[2:4], h[4:6], h[6:8])`
			`i = os.path.join(prefix, '%s.json'%h)`
			`f = os.path.join(prefix, '%s.dat'%h)`
			`return prefix, i, f`

			`def get(self, url, data={}, headers=DEFAULT_HEADERS, timeout=-1, value="data"):`
			`r = None`
			`if timeout == 0:`
			`return r`

			`if data:`
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`url_hash = hashlib.sha1((url + '?' + data).encode('utf-8')).hexdigest()`
add file cache 2011-11-01 12:55:49 +00:00			`else:`
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()`
add file cache 2011-11-01 12:55:49 +00:00
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`domain = ".".join(urllib.parse.urlparse(url)[1].split('.')[-2:])`
add file cache 2011-11-01 12:55:49 +00:00			`prefix, i, f = self.files(domain, url_hash)`
			`if os.path.exists(i):`
			`with open(i) as _i:`
ignore cache if not able to load json file 2013-01-31 14:59:21 +00:00			`try:`
			`info = json.load(_i)`
			`except:`
			`return r`
add file cache 2011-11-01 12:55:49 +00:00			`now = time.mktime(time.localtime())`
			`expired = now-timeout`

			`if value != 'headers' and info['only_headers']:`
			`return None`
			`if timeout < 0 or info['created'] > expired:`
			`if value == 'headers':`
			`r = info['headers']`
			`else:`
more file open py2/3 cleanups 2014-10-02 08:34:04 +00:00			`with open(f, 'rb') as data:`
add file cache 2011-11-01 12:55:49 +00:00			`r = data.read()`
			`if info['compressed']:`
			`r = zlib.decompress(r)`
			`return r`

			`def set(self, url, post_data, data, headers):`
			`if post_data:`
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`url_hash = hashlib.sha1((url + '?' + post_data).encode('utf-8')).hexdigest()`
add file cache 2011-11-01 12:55:49 +00:00			`else:`
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()`
add file cache 2011-11-01 12:55:49 +00:00
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`domain = ".".join(urllib.parse.urlparse(url)[1].split('.')[-2:])`
add file cache 2011-11-01 12:55:49 +00:00			`prefix, i, f = self.files(domain, url_hash)`
			`makedirs(prefix)`

			`created = time.mktime(time.localtime())`
			`content_type = headers.get('content-type', '').split(';')[0].strip()`

			`info = {`
			`'compressed': content_type in COMPRESS_TYPES,`
			`'only_headers': data == -1,`
			`'created': created,`
			`'headers': headers,`
avoid variable collision, save url 2011-11-01 21:53:36 +00:00			`'url': url,`
add file cache 2011-11-01 12:55:49 +00:00			`}`
			`if post_data:`
			`info['post_data'] = post_data`
			`if not info['only_headers']:`
			`if info['compressed']:`
			`data = zlib.compress(data)`
more file open py2/3 cleanups 2014-10-02 08:34:04 +00:00			`elif not isinstance(data, str):`
			`data = data.encode('utf-8')`
			`with open(f, 'wb') as _f:`
add file cache 2011-11-01 12:55:49 +00:00			`_f.write(data)`
more file open py2/3 cleanups 2014-10-02 08:34:04 +00:00			`with open(i, 'wb') as _i:`
add file cache 2011-11-01 12:55:49 +00:00			`json.dump(info, _i)`

			`if cache_path().startswith('fs:'):`
			`store = FileCache()`
			`else:`
			`store = SQLiteCache()`