avoid dict as default value

j 2016-06-08 15:30:25 +02:00
parent 170af83272
commit 4e7898ae57
2 changed files with 64 additions and 60 deletions
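
The functions below all took the module-level DEFAULT_HEADERS dict (or a dict literal) as a default argument. Python evaluates default values once, at definition time, so a mutable default is a single shared object: any mutation made through one call is visible to every later call and to the module-level dict itself. This commit switches those signatures to a None sentinel that is resolved inside the function, alongside some smaller pep8-style cleanups. A minimal sketch of the pitfall and of the fix (the names here are illustrative, not taken from the library):

    DEFAULTS = {'User-Agent': 'example-agent'}

    def fetch_bad(url, headers=DEFAULTS):
        # the same dict object is reused on every call, so this
        # mutation leaks into later calls and into DEFAULTS itself
        headers['Referer'] = url
        return headers

    def fetch_good(url, headers=None):
        # resolve the default inside the call and work on a copy,
        # so each invocation gets its own dict
        if headers is None:
            headers = DEFAULTS.copy()
        headers['Referer'] = url
        return headers

In the diff, open_url() in the second changed file resolves the sentinel exactly this way (headers = DEFAULT_HEADERS.copy()); the higher-level helpers simply pass headers through.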


@@ -4,14 +4,15 @@
from __future__ import with_statement, print_function
import gzip
import zlib
import hashlib
import os
from six import BytesIO
import sqlite3
import time
import zlib
from six import BytesIO
from six.moves import urllib
from six import PY2
import sqlite3
from .utils import json
from .file import makedirs
@@ -19,6 +20,7 @@ from .file import makedirs
from . import net
from .net import DEFAULT_HEADERS, detect_encoding
cache_timeout = 30*24*60*60 # default is 30 days
COMPRESS_TYPES = (
@@ -33,7 +35,7 @@ COMPRESS_TYPES = (
'application/rss+xml'
)
def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
def status(url, data=None, headers=None, timeout=cache_timeout):
'''
>>> status('http://google.com')
200
@@ -43,7 +45,7 @@ def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
headers = get_headers(url, data, headers)
return int(headers['status'])
def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
def exists(url, data=None, headers=None, timeout=cache_timeout):
'''
>>> exists('http://google.com')
True
@@ -55,14 +57,14 @@ def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
return True
return False
def get_headers(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
def get_headers(url, data=None, headers=None, timeout=cache_timeout):
url_headers = store.get(url, data, headers, timeout, "headers")
if not url_headers:
url_headers = net.get_headers(url, data, headers)
store.set(url, data, -1, url_headers)
return url_headers
def get_json(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
def get_json(url, data=None, headers=None, timeout=cache_timeout):
return json.loads(read_url(url, data, headers, timeout).decode('utf-8'))
class InvalidResult(Exception):
@@ -76,7 +78,7 @@ def _fix_unicode_url(url):
url = url.encode('utf-8')
return url
def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None, unicode=False):
def read_url(url, data=None, headers=None, timeout=cache_timeout, valid=None, unicode=False):
'''
url - url to load
data - possible post data
@@ -87,8 +89,8 @@ def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, val
'''
if net.DEBUG:
print('ox.cache.read_url', url)
#FIXME: send last-modified / etag from cache and only update if needed
#url = _fix_unicode_url(url)
# FIXME: send last-modified / etag from cache and only update if needed
# url = _fix_unicode_url(url)
result = store.get(url, data, headers, timeout)
url_headers = {}
if not result:
@@ -116,7 +118,7 @@ def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, val
result = result.decode(encoding)
return result
get_url=read_url
get_url = read_url
def save_url(url, filename, overwrite=False):
if not os.path.exists(filename) or overwrite:
@@ -134,7 +136,7 @@ class Cache:
def __init__(self):
pass
def get(self, url, data, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
def get(self, url, data, headers=None, timeout=-1, value="data"):
'''
if value == 'data' return data of url if its in the cache else None
if value == 'headers' return headers for url
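The docstring above spells out the lookup contract that get_headers() and read_url() rely on earlier in this file; an illustrative call pattern (not an added line of the commit) looks like:

    # cached body for the URL, or None on a cache miss
    result = store.get(url, data, headers, timeout)
    # cached response headers for the same request
    url_headers = store.get(url, data, headers, timeout, "headers")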
@@ -192,7 +194,7 @@ class SQLiteCache(Cache):
def set_setting(self, c, key, value):
c.execute(u'INSERT OR REPLACE INTO setting values (?, ?)', (key, str(value)))
def get(self, url, data={}, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
def get(self, url, data={}, headers=None, timeout=-1, value="data"):
r = None
if timeout == 0:
return r
@@ -225,7 +227,7 @@ class SQLiteCache(Cache):
conn.close()
return r
def delete(self, url, data=None, headers=DEFAULT_HEADERS):
def delete(self, url, data=None, headers=None):
url_hash = self.get_url_hash(url, data)
conn = self.connect()
c = conn.cursor()
@@ -244,7 +246,8 @@ class SQLiteCache(Cache):
c = conn.cursor()
# Insert a row of data
if not post_data: post_data=""
if not post_data:
post_data = ""
only_headers = 0
if data == -1:
only_headers = 1
@@ -284,7 +287,7 @@ class FileCache(Cache):
f = os.path.join(prefix, '%s.dat'%h)
return prefix, i, f
def get(self, url, data={}, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
def get(self, url, data={}, headers=None, timeout=-1, value="data"):
r = None
if timeout == 0:
return r
@@ -308,13 +311,13 @@ class FileCache(Cache):
if value == 'headers':
r = info['headers']
else:
with open(f, 'rb') as data:
r = data.read()
with open(f, 'rb') as fd:
r = fd.read()
if info['compressed']:
r = zlib.decompress(r)
return r
def delete(self, url, data=None, headers=DEFAULT_HEADERS):
def delete(self, url, data=None, headers=None):
url_hash = self.get_url_hash(url, data)
domain = self.get_domain(url)
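
The remaining hunks are in the lower-level ox.net module that the cache wraps. With headers now defaulting to None, a caller that wants to extend rather than replace the defaults merges them explicitly; a hypothetical usage sketch (not part of the commit):

    headers = DEFAULT_HEADERS.copy()
    headers['Accept-Language'] = 'en'
    body = read_url('http://example.com/', headers=headers)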


@@ -23,7 +23,7 @@ DEFAULT_HEADERS = {
'Accept-Encoding': 'gzip'
}
def status(url, data=None, headers=DEFAULT_HEADERS):
def status(url, data=None, headers=None):
try:
f = open_url(url, data, headers)
s = f.code
@@ -31,13 +31,13 @@ def status(url, data=None, headers=DEFAULT_HEADERS):
s = e.code
return s
def exists(url, data=None, headers=DEFAULT_HEADERS):
def exists(url, data=None, headers=None):
s = status(url, data, headers)
if s >= 200 and s < 400:
return True
return False
def get_headers(url, data=None, headers=DEFAULT_HEADERS):
def get_headers(url, data=None, headers=None):
try:
f = open_url(url, data, headers)
f.headers['Status'] = "%s" % f.code
@@ -48,10 +48,12 @@ def get_headers(url, data=None, headers=DEFAULT_HEADERS):
headers = e.headers
return dict(headers)
def get_json(url, data=None, headers=DEFAULT_HEADERS):
return json.loads(read_url(url, data, headers).decode('utf-8'))
def get_json(url, data=None, headers=None):
return json.loads(read_url(url, data, headers).decode('utf-8')) # pylint: disable=no-member
def open_url(url, data=None, headers=DEFAULT_HEADERS):
def open_url(url, data=None, headers=None):
if headers is None:
headers = DEFAULT_HEADERS.copy()
if PY2:
if not isinstance(url, bytes):
url = url.encode('utf-8')
@@ -64,7 +66,7 @@ def open_url(url, data=None, headers=DEFAULT_HEADERS):
req = urllib.request.Request(url, data, headers)
return urllib.request.urlopen(req)
def read_url(url, data=None, headers=DEFAULT_HEADERS, return_headers=False, unicode=False):
def read_url(url, data=None, headers=None, return_headers=False, unicode=False):
if DEBUG:
print('ox.net.read_url', url)
f = open_url(url, data, headers)
@@ -108,7 +110,7 @@ def detect_encoding(data):
detector.close()
return detector.result['encoding']
get_url=read_url
get_url = read_url
def save_url(url, filename, overwrite=False):
if not os.path.exists(filename) or overwrite:
@@ -119,51 +121,50 @@ def save_url(url, filename, overwrite=False):
with open(filename, 'wb') as f:
f.write(data)
def oshash(url):
def get_size(url):
def _get_size(url):
req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy())
req.get_method = lambda : 'HEAD'
req.get_method = lambda: 'HEAD'
u = urllib.request.urlopen(req)
if u.code != 200 or not 'Content-Length' in u.headers:
if u.code != 200 or 'Content-Length' not in u.headers:
raise IOError
return int(u.headers['Content-Length'])
def get_range(url, start, end):
def _get_range(url, start, end):
headers = DEFAULT_HEADERS.copy()
headers['Range'] = 'bytes=%s-%s' % (start, end)
req = urllib.request.Request(url, headers=headers)
u = urllib.request.urlopen(req)
return u.read()
def oshash(url):
try:
longlongformat = 'q' # long long
bytesize = struct.calcsize(longlongformat)
filesize = get_size(url)
hash = filesize
head = get_range(url, 0, min(filesize, 65536))
filesize = _get_size(url)
hash_ = filesize
head = _get_range(url, 0, min(filesize, 65536))
if filesize > 65536:
tail = get_range(url, filesize-65536, filesize)
tail = _get_range(url, filesize-65536, filesize)
if filesize < 65536:
f = BytesIO(head)
for x in range(int(filesize/bytesize)):
for _ in range(int(filesize/bytesize)):
buffer = f.read(bytesize)
(l_value,)= struct.unpack(longlongformat, buffer)
hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF #cut off 64bit overflow
(l_value,) = struct.unpack(longlongformat, buffer)
hash_ += l_value
hash_ = hash_ & 0xFFFFFFFFFFFFFFFF # cut off 64bit overflow
else:
for offset in range(0, 65536, bytesize):
buffer = head[offset:offset+bytesize]
(l_value,)= struct.unpack(longlongformat, buffer)
hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF #cut of 64bit overflow
(l_value,) = struct.unpack(longlongformat, buffer)
hash_ += l_value
hash_ = hash_ & 0xFFFFFFFFFFFFFFFF # cut of 64bit overflow
for offset in range(0, 65536, bytesize):
buffer = tail[offset:offset+bytesize]
(l_value,)= struct.unpack(longlongformat, buffer)
hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF
returnedhash = "%016x" % hash
(l_value,) = struct.unpack(longlongformat, buffer)
hash_ += l_value
hash_ = hash_ & 0xFFFFFFFFFFFFFFFF
returnedhash = "%016x" % hash_
return returnedhash
except(IOError):
except IOError:
return "IOError"