avoid dict as default value
parent 170af83272
commit 4e7898ae57
2 changed files with 64 additions and 60 deletions
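Note on the rationale: in Python a default argument is evaluated once, at function-definition time, so using the module-level DEFAULT_HEADERS dict as a default means every call shares (and can accidentally mutate) the same object. The sketch below is a minimal illustration of that pitfall and of the None-sentinel-plus-copy idiom the diff adopts; the function names are hypothetical and not part of this repository.

DEFAULT_HEADERS = {'User-Agent': 'ox', 'Accept-Encoding': 'gzip'}

def fetch_shared_default(url, headers=DEFAULT_HEADERS):
    # The default dict is created once and reused for every call,
    # so this mutation leaks into the module-level DEFAULT_HEADERS.
    headers['Referer'] = url
    return headers

def fetch_none_sentinel(url, headers=None):
    # None as the sentinel plus an explicit copy keeps each call isolated.
    if headers is None:
        headers = DEFAULT_HEADERS.copy()
    headers['Referer'] = url
    return headers

fetch_shared_default('http://a.example')
print('Referer' in DEFAULT_HEADERS)   # True: the shared default was mutated

del DEFAULT_HEADERS['Referer']
fetch_none_sentinel('http://b.example')
print('Referer' in DEFAULT_HEADERS)   # False: the default stays untouched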
ox/cache.py (35 changed lines)
@@ -4,14 +4,15 @@
 from __future__ import with_statement, print_function
 
 import gzip
-import zlib
 import hashlib
 import os
-from six import BytesIO
-import sqlite3
 import time
+import zlib
+
+from six import BytesIO
 from six.moves import urllib
 from six import PY2
+import sqlite3
 
 from .utils import json
 from .file import makedirs
@@ -19,6 +20,7 @@ from .file import makedirs
 from . import net
 from .net import DEFAULT_HEADERS, detect_encoding
 
+
 cache_timeout = 30*24*60*60 # default is 30 days
 
 COMPRESS_TYPES = (
@@ -33,7 +35,7 @@ COMPRESS_TYPES = (
     'application/rss+xml'
 )
 
-def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
+def status(url, data=None, headers=None, timeout=cache_timeout):
     '''
     >>> status('http://google.com')
     200
@@ -43,7 +45,7 @@ def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
     headers = get_headers(url, data, headers)
     return int(headers['status'])
 
-def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
+def exists(url, data=None, headers=None, timeout=cache_timeout):
     '''
     >>> exists('http://google.com')
     True
@@ -55,14 +57,14 @@ def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
         return True
     return False
 
-def get_headers(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
+def get_headers(url, data=None, headers=None, timeout=cache_timeout):
     url_headers = store.get(url, data, headers, timeout, "headers")
     if not url_headers:
         url_headers = net.get_headers(url, data, headers)
         store.set(url, data, -1, url_headers)
     return url_headers
 
-def get_json(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
+def get_json(url, data=None, headers=None, timeout=cache_timeout):
     return json.loads(read_url(url, data, headers, timeout).decode('utf-8'))
 
 class InvalidResult(Exception):
@@ -76,7 +78,7 @@ def _fix_unicode_url(url):
         url = url.encode('utf-8')
     return url
 
-def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None, unicode=False):
+def read_url(url, data=None, headers=None, timeout=cache_timeout, valid=None, unicode=False):
     '''
         url - url to load
         data - possible post data
@@ -134,7 +136,7 @@ class Cache:
     def __init__(self):
         pass
 
-    def get(self, url, data, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
+    def get(self, url, data, headers=None, timeout=-1, value="data"):
         '''
             if value == 'data' return data of url if its in the cache else None
             if value == 'headers' return headers for url
@@ -192,7 +194,7 @@ class SQLiteCache(Cache):
    def set_setting(self, c, key, value):
        c.execute(u'INSERT OR REPLACE INTO setting values (?, ?)', (key, str(value)))
 
-    def get(self, url, data={}, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
+    def get(self, url, data={}, headers=None, timeout=-1, value="data"):
        r = None
        if timeout == 0:
            return r
@@ -225,7 +227,7 @@ class SQLiteCache(Cache):
        conn.close()
        return r
 
-    def delete(self, url, data=None, headers=DEFAULT_HEADERS):
+    def delete(self, url, data=None, headers=None):
        url_hash = self.get_url_hash(url, data)
        conn = self.connect()
        c = conn.cursor()
@@ -244,7 +246,8 @@ class SQLiteCache(Cache):
        c = conn.cursor()
 
        # Insert a row of data
-        if not post_data: post_data=""
+        if not post_data:
+            post_data = ""
        only_headers = 0
        if data == -1:
            only_headers = 1
@@ -284,7 +287,7 @@ class FileCache(Cache):
        f = os.path.join(prefix, '%s.dat'%h)
        return prefix, i, f
 
-    def get(self, url, data={}, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
+    def get(self, url, data={}, headers=None, timeout=-1, value="data"):
        r = None
        if timeout == 0:
            return r
@@ -308,13 +311,13 @@ class FileCache(Cache):
            if value == 'headers':
                r = info['headers']
            else:
-                with open(f, 'rb') as data:
-                    r = data.read()
+                with open(f, 'rb') as fd:
+                    r = fd.read()
                if info['compressed']:
                    r = zlib.decompress(r)
        return r
 
-    def delete(self, url, data=None, headers=DEFAULT_HEADERS):
+    def delete(self, url, data=None, headers=None):
        url_hash = self.get_url_hash(url, data)
        domain = self.get_domain(url)
 
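For context, a hedged usage sketch of the updated ox.cache signatures (the URL and header values are placeholders, not from the repository): callers that are fine with the defaults simply omit headers, while callers that need custom headers pass a fresh per-call dict, so nothing is shared at module level.

import ox.cache

# headers defaults to None; ox falls back to DEFAULT_HEADERS internally.
body = ox.cache.read_url('http://example.com/feed.rss')

# An explicit per-call dict; mutating it later cannot affect other calls.
info = ox.cache.get_headers('http://example.com/feed.rss',
                            headers={'User-Agent': 'my-bot/1.0'})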
ox/net.py (51 changed lines)
@@ -23,7 +23,7 @@ DEFAULT_HEADERS = {
    'Accept-Encoding': 'gzip'
 }
 
-def status(url, data=None, headers=DEFAULT_HEADERS):
+def status(url, data=None, headers=None):
    try:
        f = open_url(url, data, headers)
        s = f.code
@@ -31,13 +31,13 @@ def status(url, data=None, headers=DEFAULT_HEADERS):
        s = e.code
    return s
 
-def exists(url, data=None, headers=DEFAULT_HEADERS):
+def exists(url, data=None, headers=None):
    s = status(url, data, headers)
    if s >= 200 and s < 400:
        return True
    return False
 
-def get_headers(url, data=None, headers=DEFAULT_HEADERS):
+def get_headers(url, data=None, headers=None):
    try:
        f = open_url(url, data, headers)
        f.headers['Status'] = "%s" % f.code
@@ -48,10 +48,12 @@ def get_headers(url, data=None, headers=DEFAULT_HEADERS):
        headers = e.headers
    return dict(headers)
 
-def get_json(url, data=None, headers=DEFAULT_HEADERS):
-    return json.loads(read_url(url, data, headers).decode('utf-8'))
+def get_json(url, data=None, headers=None):
+    return json.loads(read_url(url, data, headers).decode('utf-8')) # pylint: disable=no-member
 
-def open_url(url, data=None, headers=DEFAULT_HEADERS):
+def open_url(url, data=None, headers=None):
+    if headers is None:
+        headers = DEFAULT_HEADERS.copy()
    if PY2:
        if not isinstance(url, bytes):
            url = url.encode('utf-8')
@@ -64,7 +66,7 @@ def open_url(url, data=None, headers=DEFAULT_HEADERS):
    req = urllib.request.Request(url, data, headers)
    return urllib.request.urlopen(req)
 
-def read_url(url, data=None, headers=DEFAULT_HEADERS, return_headers=False, unicode=False):
+def read_url(url, data=None, headers=None, return_headers=False, unicode=False):
    if DEBUG:
        print('ox.net.read_url', url)
    f = open_url(url, data, headers)
@@ -119,51 +121,50 @@ def save_url(url, filename, overwrite=False):
        with open(filename, 'wb') as f:
            f.write(data)
 
-def oshash(url):
-    def get_size(url):
+def _get_size(url):
    req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy())
    req.get_method = lambda: 'HEAD'
    u = urllib.request.urlopen(req)
-    if u.code != 200 or not 'Content-Length' in u.headers:
+    if u.code != 200 or 'Content-Length' not in u.headers:
        raise IOError
    return int(u.headers['Content-Length'])
 
-    def get_range(url, start, end):
+def _get_range(url, start, end):
    headers = DEFAULT_HEADERS.copy()
    headers['Range'] = 'bytes=%s-%s' % (start, end)
    req = urllib.request.Request(url, headers=headers)
    u = urllib.request.urlopen(req)
    return u.read()
 
+def oshash(url):
    try:
        longlongformat = 'q' # long long
        bytesize = struct.calcsize(longlongformat)
 
-        filesize = get_size(url)
-        hash = filesize
-        head = get_range(url, 0, min(filesize, 65536))
+        filesize = _get_size(url)
+        hash_ = filesize
+        head = _get_range(url, 0, min(filesize, 65536))
        if filesize > 65536:
-            tail = get_range(url, filesize-65536, filesize)
+            tail = _get_range(url, filesize-65536, filesize)
        if filesize < 65536:
            f = BytesIO(head)
-            for x in range(int(filesize/bytesize)):
+            for _ in range(int(filesize/bytesize)):
                buffer = f.read(bytesize)
                (l_value,) = struct.unpack(longlongformat, buffer)
-                hash += l_value
-                hash = hash & 0xFFFFFFFFFFFFFFFF #cut off 64bit overflow
+                hash_ += l_value
+                hash_ = hash_ & 0xFFFFFFFFFFFFFFFF # cut off 64bit overflow
        else:
            for offset in range(0, 65536, bytesize):
                buffer = head[offset:offset+bytesize]
                (l_value,) = struct.unpack(longlongformat, buffer)
-                hash += l_value
-                hash = hash & 0xFFFFFFFFFFFFFFFF #cut of 64bit overflow
+                hash_ += l_value
+                hash_ = hash_ & 0xFFFFFFFFFFFFFFFF # cut of 64bit overflow
            for offset in range(0, 65536, bytesize):
                buffer = tail[offset:offset+bytesize]
                (l_value,) = struct.unpack(longlongformat, buffer)
-                hash += l_value
-                hash = hash & 0xFFFFFFFFFFFFFFFF
-        returnedhash = "%016x" % hash
+                hash_ += l_value
+                hash_ = hash_ & 0xFFFFFFFFFFFFFFFF
+        returnedhash = "%016x" % hash_
        return returnedhash
-    except(IOError):
+    except IOError:
        return "IOError"
-
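The last hunk moves the helpers out of oshash and renames hash to hash_ without changing the hash itself: the value is the file size plus the sum of the 64-bit words in the first and last 64 KiB, truncated to 64 bits (the OpenSubtitles-style hash the name suggests). Below is a minimal local-file sketch of the same computation, for illustration only; oshash_file is a hypothetical name, not part of the library, and it assumes a file of at least 64 KiB.

import os
import struct

def oshash_file(path):
    # Same arithmetic as ox.net.oshash, but for a local file: start from the
    # file size, then add the 64-bit words of the first and last 64 KiB,
    # keeping only the low 64 bits. Assumes the file is at least 64 KiB.
    longlongformat = 'q'
    bytesize = struct.calcsize(longlongformat)
    filesize = os.path.getsize(path)
    hash_ = filesize
    with open(path, 'rb') as f:
        for _ in range(65536 // bytesize):
            (l_value,) = struct.unpack(longlongformat, f.read(bytesize))
            hash_ = (hash_ + l_value) & 0xFFFFFFFFFFFFFFFF
        f.seek(max(0, filesize - 65536))
        for _ in range(65536 // bytesize):
            (l_value,) = struct.unpack(longlongformat, f.read(bytesize))
            hash_ = (hash_ + l_value) & 0xFFFFFFFFFFFFFFFF
    return "%016x" % hash_

For files larger than 64 KiB this produces the same 16-hex-digit string as oshash(url) computed over HTTP range requests on the same platform (the 'q' format uses native byte order in both).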