avoid dict as default value

parent 170af83272
commit 4e7898ae57

2 changed files with 64 additions and 60 deletions
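The pattern applied throughout ox/cache.py and ox/net.py below: a mutable object such as the DEFAULT_HEADERS dict used as a default argument is created once and shared by every call, so mutating it leaks state between callers. The commit replaces these defaults with a None sentinel that is resolved inside the function (see open_url). A minimal illustrative sketch of the difference follows; it is not part of the commit, and the function names are hypothetical.

# Illustrative only -- not from the library. Shows why a shared dict default
# is risky and the None-sentinel pattern this commit switches to.
DEFAULT_HEADERS = {'User-Agent': 'ox', 'Accept-Encoding': 'gzip'}

def open_url_risky(url, headers=DEFAULT_HEADERS):
    # The default dict is the module-level object itself, so this
    # mutation is visible to every later call that uses the default.
    headers['X-Extra'] = 'leaks'
    return url, headers

def open_url_safe(url, headers=None):
    # None sentinel: each call gets its own copy of the defaults.
    if headers is None:
        headers = DEFAULT_HEADERS.copy()
    headers['X-Extra'] = 'stays local'
    return url, headers

open_url_risky('http://example.com')
print('X-Extra' in DEFAULT_HEADERS)   # True  -- shared default was polluted
del DEFAULT_HEADERS['X-Extra']
open_url_safe('http://example.com')
print('X-Extra' in DEFAULT_HEADERS)   # False -- default left untouched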
ox/cache.py (43 changed lines)
@@ -4,14 +4,15 @@
 from __future__ import with_statement, print_function
 
 import gzip
-import zlib
 import hashlib
 import os
-from six import BytesIO
+import sqlite3
 import time
+import zlib
+
+from six import BytesIO
 from six.moves import urllib
 from six import PY2
-import sqlite3
 
 from .utils import json
 from .file import makedirs
@@ -19,7 +20,8 @@ from .file import makedirs
 from . import net
 from .net import DEFAULT_HEADERS, detect_encoding
 
-cache_timeout = 30*24*60*60 # default is 30 days
+
+cache_timeout = 30*24*60*60 # default is 30 days
 
 COMPRESS_TYPES = (
     'text/html',
@@ -33,7 +35,7 @@ COMPRESS_TYPES = (
     'application/rss+xml'
 )
 
-def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
+def status(url, data=None, headers=None, timeout=cache_timeout):
     '''
     >>> status('http://google.com')
     200
@@ -43,7 +45,7 @@ def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
     headers = get_headers(url, data, headers)
     return int(headers['status'])
 
-def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
+def exists(url, data=None, headers=None, timeout=cache_timeout):
     '''
     >>> exists('http://google.com')
     True
@@ -55,14 +57,14 @@ def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
         return True
     return False
 
-def get_headers(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
+def get_headers(url, data=None, headers=None, timeout=cache_timeout):
     url_headers = store.get(url, data, headers, timeout, "headers")
     if not url_headers:
         url_headers = net.get_headers(url, data, headers)
         store.set(url, data, -1, url_headers)
     return url_headers
 
-def get_json(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
+def get_json(url, data=None, headers=None, timeout=cache_timeout):
     return json.loads(read_url(url, data, headers, timeout).decode('utf-8'))
 
 class InvalidResult(Exception):
@@ -76,7 +78,7 @@ def _fix_unicode_url(url):
         url = url.encode('utf-8')
     return url
 
-def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None, unicode=False):
+def read_url(url, data=None, headers=None, timeout=cache_timeout, valid=None, unicode=False):
     '''
         url - url to load
         data - possible post data
@@ -87,8 +89,8 @@ def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, val
     '''
     if net.DEBUG:
         print('ox.cache.read_url', url)
-    #FIXME: send last-modified / etag from cache and only update if needed
-    #url = _fix_unicode_url(url)
+    # FIXME: send last-modified / etag from cache and only update if needed
+    # url = _fix_unicode_url(url)
     result = store.get(url, data, headers, timeout)
     url_headers = {}
     if not result:
@@ -116,7 +118,7 @@ def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, val
         result = result.decode(encoding)
     return result
 
-get_url=read_url
+get_url = read_url
 
 def save_url(url, filename, overwrite=False):
     if not os.path.exists(filename) or overwrite:
@@ -134,7 +136,7 @@ class Cache:
     def __init__(self):
        pass
 
-    def get(self, url, data, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
+    def get(self, url, data, headers=None, timeout=-1, value="data"):
        '''
            if value == 'data' return data of url if its in the cache else None
            if value == 'headers' return headers for url
@@ -192,7 +194,7 @@ class SQLiteCache(Cache):
     def set_setting(self, c, key, value):
         c.execute(u'INSERT OR REPLACE INTO setting values (?, ?)', (key, str(value)))
 
-    def get(self, url, data={}, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
+    def get(self, url, data={}, headers=None, timeout=-1, value="data"):
         r = None
         if timeout == 0:
             return r
@@ -225,7 +227,7 @@ class SQLiteCache(Cache):
         conn.close()
         return r
 
-    def delete(self, url, data=None, headers=DEFAULT_HEADERS):
+    def delete(self, url, data=None, headers=None):
         url_hash = self.get_url_hash(url, data)
         conn = self.connect()
         c = conn.cursor()
@@ -244,7 +246,8 @@ class SQLiteCache(Cache):
         c = conn.cursor()
 
         # Insert a row of data
-        if not post_data: post_data=""
+        if not post_data:
+            post_data = ""
         only_headers = 0
         if data == -1:
             only_headers = 1
@@ -284,7 +287,7 @@ class FileCache(Cache):
         f = os.path.join(prefix, '%s.dat'%h)
         return prefix, i, f
 
-    def get(self, url, data={}, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
+    def get(self, url, data={}, headers=None, timeout=-1, value="data"):
         r = None
         if timeout == 0:
             return r
@@ -308,13 +311,13 @@ class FileCache(Cache):
                 if value == 'headers':
                     r = info['headers']
                 else:
-                    with open(f, 'rb') as data:
-                        r = data.read()
+                    with open(f, 'rb') as fd:
+                        r = fd.read()
                     if info['compressed']:
                         r = zlib.decompress(r)
         return r
 
-    def delete(self, url, data=None, headers=DEFAULT_HEADERS):
+    def delete(self, url, data=None, headers=None):
         url_hash = self.get_url_hash(url, data)
         domain = self.get_domain(url)
 
ox/net.py (81 changed lines)
@@ -23,7 +23,7 @@ DEFAULT_HEADERS = {
     'Accept-Encoding': 'gzip'
 }
 
-def status(url, data=None, headers=DEFAULT_HEADERS):
+def status(url, data=None, headers=None):
     try:
         f = open_url(url, data, headers)
         s = f.code
@@ -31,13 +31,13 @@ def status(url, data=None, headers=DEFAULT_HEADERS):
         s = e.code
     return s
 
-def exists(url, data=None, headers=DEFAULT_HEADERS):
+def exists(url, data=None, headers=None):
     s = status(url, data, headers)
     if s >= 200 and s < 400:
         return True
     return False
 
-def get_headers(url, data=None, headers=DEFAULT_HEADERS):
+def get_headers(url, data=None, headers=None):
     try:
         f = open_url(url, data, headers)
         f.headers['Status'] = "%s" % f.code
@@ -48,10 +48,12 @@ def get_headers(url, data=None, headers=DEFAULT_HEADERS):
         headers = e.headers
     return dict(headers)
 
-def get_json(url, data=None, headers=DEFAULT_HEADERS):
-    return json.loads(read_url(url, data, headers).decode('utf-8'))
+def get_json(url, data=None, headers=None):
+    return json.loads(read_url(url, data, headers).decode('utf-8')) # pylint: disable=no-member
 
-def open_url(url, data=None, headers=DEFAULT_HEADERS):
+def open_url(url, data=None, headers=None):
+    if headers is None:
+        headers = DEFAULT_HEADERS.copy()
     if PY2:
         if not isinstance(url, bytes):
             url = url.encode('utf-8')
@@ -64,7 +66,7 @@ def open_url(url, data=None, headers=DEFAULT_HEADERS):
     req = urllib.request.Request(url, data, headers)
     return urllib.request.urlopen(req)
 
-def read_url(url, data=None, headers=DEFAULT_HEADERS, return_headers=False, unicode=False):
+def read_url(url, data=None, headers=None, return_headers=False, unicode=False):
     if DEBUG:
         print('ox.net.read_url', url)
     f = open_url(url, data, headers)
@@ -108,7 +110,7 @@ def detect_encoding(data):
     detector.close()
     return detector.result['encoding']
 
-get_url=read_url
+get_url = read_url
 
 def save_url(url, filename, overwrite=False):
     if not os.path.exists(filename) or overwrite:
@@ -119,51 +121,50 @@ def save_url(url, filename, overwrite=False):
         with open(filename, 'wb') as f:
             f.write(data)
 
+def _get_size(url):
+    req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy())
+    req.get_method = lambda: 'HEAD'
+    u = urllib.request.urlopen(req)
+    if u.code != 200 or 'Content-Length' not in u.headers:
+        raise IOError
+    return int(u.headers['Content-Length'])
+
+def _get_range(url, start, end):
+    headers = DEFAULT_HEADERS.copy()
+    headers['Range'] = 'bytes=%s-%s' % (start, end)
+    req = urllib.request.Request(url, headers=headers)
+    u = urllib.request.urlopen(req)
+    return u.read()
+
 def oshash(url):
-    def get_size(url):
-        req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy())
-        req.get_method = lambda : 'HEAD'
-        u = urllib.request.urlopen(req)
-        if u.code != 200 or not 'Content-Length' in u.headers:
-            raise IOError
-        return int(u.headers['Content-Length'])
-
-    def get_range(url, start, end):
-        headers = DEFAULT_HEADERS.copy()
-        headers['Range'] = 'bytes=%s-%s' % (start, end)
-        req = urllib.request.Request(url, headers=headers)
-        u = urllib.request.urlopen(req)
-        return u.read()
-
     try:
         longlongformat = 'q' # long long
         bytesize = struct.calcsize(longlongformat)
 
-        filesize = get_size(url)
-        hash = filesize
-        head = get_range(url, 0, min(filesize, 65536))
+        filesize = _get_size(url)
+        hash_ = filesize
+        head = _get_range(url, 0, min(filesize, 65536))
         if filesize > 65536:
-            tail = get_range(url, filesize-65536, filesize)
+            tail = _get_range(url, filesize-65536, filesize)
         if filesize < 65536:
             f = BytesIO(head)
-            for x in range(int(filesize/bytesize)):
+            for _ in range(int(filesize/bytesize)):
                 buffer = f.read(bytesize)
-                (l_value,)= struct.unpack(longlongformat, buffer)
-                hash += l_value
-                hash = hash & 0xFFFFFFFFFFFFFFFF #cut off 64bit overflow
+                (l_value,) = struct.unpack(longlongformat, buffer)
+                hash_ += l_value
+                hash_ = hash_ & 0xFFFFFFFFFFFFFFFF # cut off 64bit overflow
         else:
             for offset in range(0, 65536, bytesize):
                 buffer = head[offset:offset+bytesize]
-                (l_value,)= struct.unpack(longlongformat, buffer)
-                hash += l_value
-                hash = hash & 0xFFFFFFFFFFFFFFFF #cut of 64bit overflow
+                (l_value,) = struct.unpack(longlongformat, buffer)
+                hash_ += l_value
+                hash_ = hash_ & 0xFFFFFFFFFFFFFFFF # cut of 64bit overflow
             for offset in range(0, 65536, bytesize):
                 buffer = tail[offset:offset+bytesize]
-                (l_value,)= struct.unpack(longlongformat, buffer)
-                hash += l_value
-                hash = hash & 0xFFFFFFFFFFFFFFFF
-        returnedhash = "%016x" % hash
+                (l_value,) = struct.unpack(longlongformat, buffer)
+                hash_ += l_value
+                hash_ = hash_ & 0xFFFFFFFFFFFFFFFF
+        returnedhash = "%016x" % hash_
         return returnedhash
-    except(IOError):
+    except IOError:
         return "IOError"