avoid dict as default value

j 2016-06-08 15:30:25 +02:00
parent 170af83272
commit 4e7898ae57
2 changed files with 64 additions and 60 deletions
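The pitfall this commit removes, in brief: Python evaluates default argument values once, at function definition time, so a dict default such as headers=DEFAULT_HEADERS is one shared object across every call, and any mutation leaks into all later calls and into the module-level constant itself. A minimal sketch of the failure mode and of the None-sentinel fix applied below (names are illustrative, not from this repository):

DEFAULT_HEADERS = {'User-Agent': 'ox'}

def fetch_bad(url, headers=DEFAULT_HEADERS):
    # mutating the argument mutates the shared module-level dict
    headers['Cookie'] = 'session'
    return headers

def fetch_good(url, headers=None):
    if headers is None:
        headers = DEFAULT_HEADERS.copy()  # fresh dict on every call
    headers['Cookie'] = 'session'
    return headers

fetch_bad('http://example.com')
assert 'Cookie' in DEFAULT_HEADERS   # the shared default is now polluted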

ox/cache.py

@@ -4,14 +4,15 @@
 from __future__ import with_statement, print_function
 import gzip
-import zlib
 import hashlib
 import os
-from six import BytesIO
+import sqlite3
 import time
+import zlib
+
+from six import BytesIO
 from six.moves import urllib
 from six import PY2
-import sqlite3
 
 from .utils import json
 from .file import makedirs
 
@@ -19,7 +20,8 @@ from .file import makedirs
 from . import net
 from .net import DEFAULT_HEADERS, detect_encoding
 
-cache_timeout = 30*24*60*60 # default is 30 days
+cache_timeout = 30*24*60*60  # default is 30 days
+
 
 COMPRESS_TYPES = (
     'text/html',
@@ -33,7 +35,7 @@ COMPRESS_TYPES = (
     'application/rss+xml'
 )
 
-def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
+def status(url, data=None, headers=None, timeout=cache_timeout):
     '''
     >>> status('http://google.com')
     200
@@ -43,7 +45,7 @@ def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
     headers = get_headers(url, data, headers)
     return int(headers['status'])
 
-def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
+def exists(url, data=None, headers=None, timeout=cache_timeout):
     '''
     >>> exists('http://google.com')
     True
@@ -55,14 +57,14 @@ def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
         return True
     return False
 
-def get_headers(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
+def get_headers(url, data=None, headers=None, timeout=cache_timeout):
     url_headers = store.get(url, data, headers, timeout, "headers")
     if not url_headers:
         url_headers = net.get_headers(url, data, headers)
         store.set(url, data, -1, url_headers)
     return url_headers
 
-def get_json(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
+def get_json(url, data=None, headers=None, timeout=cache_timeout):
     return json.loads(read_url(url, data, headers, timeout).decode('utf-8'))
 
 class InvalidResult(Exception):
@@ -76,7 +78,7 @@ def _fix_unicode_url(url):
         url = url.encode('utf-8')
     return url
 
-def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None, unicode=False):
+def read_url(url, data=None, headers=None, timeout=cache_timeout, valid=None, unicode=False):
     '''
     url - url to load
     data - possible post data
@@ -87,8 +89,8 @@ def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, val
     '''
     if net.DEBUG:
         print('ox.cache.read_url', url)
-    #FIXME: send last-modified / etag from cache and only update if needed
-    #url = _fix_unicode_url(url)
+    # FIXME: send last-modified / etag from cache and only update if needed
+    # url = _fix_unicode_url(url)
     result = store.get(url, data, headers, timeout)
     url_headers = {}
     if not result:
@@ -116,7 +118,7 @@ def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, val
             result = result.decode(encoding)
     return result
 
-get_url=read_url
+get_url = read_url
 
 def save_url(url, filename, overwrite=False):
     if not os.path.exists(filename) or overwrite:
@@ -134,7 +136,7 @@ class Cache:
     def __init__(self):
         pass
 
-    def get(self, url, data, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
+    def get(self, url, data, headers=None, timeout=-1, value="data"):
         '''
         if value == 'data' return data of url if its in the cache else None
         if value == 'headers' return headers for url
@@ -192,7 +194,7 @@ class SQLiteCache(Cache):
     def set_setting(self, c, key, value):
         c.execute(u'INSERT OR REPLACE INTO setting values (?, ?)', (key, str(value)))
 
-    def get(self, url, data={}, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
+    def get(self, url, data={}, headers=None, timeout=-1, value="data"):
         r = None
         if timeout == 0:
             return r
@@ -225,7 +227,7 @@ class SQLiteCache(Cache):
         conn.close()
         return r
 
-    def delete(self, url, data=None, headers=DEFAULT_HEADERS):
+    def delete(self, url, data=None, headers=None):
         url_hash = self.get_url_hash(url, data)
         conn = self.connect()
         c = conn.cursor()
@@ -244,7 +246,8 @@ class SQLiteCache(Cache):
         c = conn.cursor()
 
         # Insert a row of data
-        if not post_data: post_data=""
+        if not post_data:
+            post_data = ""
         only_headers = 0
         if data == -1:
             only_headers = 1
@@ -284,7 +287,7 @@ class FileCache(Cache):
         f = os.path.join(prefix, '%s.dat'%h)
         return prefix, i, f
 
-    def get(self, url, data={}, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
+    def get(self, url, data={}, headers=None, timeout=-1, value="data"):
         r = None
         if timeout == 0:
             return r
@@ -308,13 +311,13 @@ class FileCache(Cache):
            if value == 'headers':
                r = info['headers']
            else:
-               with open(f, 'rb') as data:
-                   r = data.read()
+               with open(f, 'rb') as fd:
+                   r = fd.read()
                if info['compressed']:
                    r = zlib.decompress(r)
        return r
 
-    def delete(self, url, data=None, headers=DEFAULT_HEADERS):
+    def delete(self, url, data=None, headers=None):
        url_hash = self.get_url_hash(url, data)
        domain = self.get_domain(url)
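Note that the cache-layer functions above simply forward headers=None; the sentinel is resolved in one place, ox.net.open_url (second file, below), so each request still ends up with a fresh copy of DEFAULT_HEADERS. The data={} defaults in SQLiteCache.get and FileCache.get are left in place, which is safe only as long as those methods never mutate data.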

ox/net.py

@@ -23,7 +23,7 @@ DEFAULT_HEADERS = {
     'Accept-Encoding': 'gzip'
 }
 
-def status(url, data=None, headers=DEFAULT_HEADERS):
+def status(url, data=None, headers=None):
     try:
         f = open_url(url, data, headers)
         s = f.code
@@ -31,13 +31,13 @@ def status(url, data=None, headers=DEFAULT_HEADERS):
         s = e.code
     return s
 
-def exists(url, data=None, headers=DEFAULT_HEADERS):
+def exists(url, data=None, headers=None):
     s = status(url, data, headers)
     if s >= 200 and s < 400:
         return True
     return False
 
-def get_headers(url, data=None, headers=DEFAULT_HEADERS):
+def get_headers(url, data=None, headers=None):
     try:
         f = open_url(url, data, headers)
         f.headers['Status'] = "%s" % f.code
@@ -48,10 +48,12 @@ def get_headers(url, data=None, headers=DEFAULT_HEADERS):
         headers = e.headers
     return dict(headers)
 
-def get_json(url, data=None, headers=DEFAULT_HEADERS):
-    return json.loads(read_url(url, data, headers).decode('utf-8'))
+def get_json(url, data=None, headers=None):
+    return json.loads(read_url(url, data, headers).decode('utf-8'))  # pylint: disable=no-member
 
-def open_url(url, data=None, headers=DEFAULT_HEADERS):
+def open_url(url, data=None, headers=None):
+    if headers is None:
+        headers = DEFAULT_HEADERS.copy()
     if PY2:
         if not isinstance(url, bytes):
             url = url.encode('utf-8')
@@ -64,7 +66,7 @@ def open_url(url, data=None, headers=DEFAULT_HEADERS):
     req = urllib.request.Request(url, data, headers)
     return urllib.request.urlopen(req)
 
-def read_url(url, data=None, headers=DEFAULT_HEADERS, return_headers=False, unicode=False):
+def read_url(url, data=None, headers=None, return_headers=False, unicode=False):
     if DEBUG:
         print('ox.net.read_url', url)
     f = open_url(url, data, headers)
@@ -108,7 +110,7 @@ def detect_encoding(data):
     detector.close()
     return detector.result['encoding']
 
-get_url=read_url
+get_url = read_url
 
 def save_url(url, filename, overwrite=False):
     if not os.path.exists(filename) or overwrite:
@@ -119,51 +121,50 @@ def save_url(url, filename, overwrite=False):
         with open(filename, 'wb') as f:
             f.write(data)
 
+def _get_size(url):
+    req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy())
+    req.get_method = lambda: 'HEAD'
+    u = urllib.request.urlopen(req)
+    if u.code != 200 or 'Content-Length' not in u.headers:
+        raise IOError
+    return int(u.headers['Content-Length'])
+
+def _get_range(url, start, end):
+    headers = DEFAULT_HEADERS.copy()
+    headers['Range'] = 'bytes=%s-%s' % (start, end)
+    req = urllib.request.Request(url, headers=headers)
+    u = urllib.request.urlopen(req)
+    return u.read()
+
 def oshash(url):
-    def get_size(url):
-        req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy())
-        req.get_method = lambda : 'HEAD'
-        u = urllib.request.urlopen(req)
-        if u.code != 200 or not 'Content-Length' in u.headers:
-            raise IOError
-        return int(u.headers['Content-Length'])
-
-    def get_range(url, start, end):
-        headers = DEFAULT_HEADERS.copy()
-        headers['Range'] = 'bytes=%s-%s' % (start, end)
-        req = urllib.request.Request(url, headers=headers)
-        u = urllib.request.urlopen(req)
-        return u.read()
-
     try:
         longlongformat = 'q' # long long
         bytesize = struct.calcsize(longlongformat)
-        filesize = get_size(url)
-        hash = filesize
-        head = get_range(url, 0, min(filesize, 65536))
+        filesize = _get_size(url)
+        hash_ = filesize
+        head = _get_range(url, 0, min(filesize, 65536))
         if filesize > 65536:
-            tail = get_range(url, filesize-65536, filesize)
+            tail = _get_range(url, filesize-65536, filesize)
         if filesize < 65536:
             f = BytesIO(head)
-            for x in range(int(filesize/bytesize)):
+            for _ in range(int(filesize/bytesize)):
                 buffer = f.read(bytesize)
-                (l_value,)= struct.unpack(longlongformat, buffer)
-                hash += l_value
-                hash = hash & 0xFFFFFFFFFFFFFFFF #cut off 64bit overflow
+                (l_value,) = struct.unpack(longlongformat, buffer)
+                hash_ += l_value
+                hash_ = hash_ & 0xFFFFFFFFFFFFFFFF  # cut off 64bit overflow
         else:
             for offset in range(0, 65536, bytesize):
                 buffer = head[offset:offset+bytesize]
-                (l_value,)= struct.unpack(longlongformat, buffer)
-                hash += l_value
-                hash = hash & 0xFFFFFFFFFFFFFFFF #cut of 64bit overflow
+                (l_value,) = struct.unpack(longlongformat, buffer)
+                hash_ += l_value
+                hash_ = hash_ & 0xFFFFFFFFFFFFFFFF  # cut of 64bit overflow
             for offset in range(0, 65536, bytesize):
                 buffer = tail[offset:offset+bytesize]
-                (l_value,)= struct.unpack(longlongformat, buffer)
-                hash += l_value
-                hash = hash & 0xFFFFFFFFFFFFFFFF
-        returnedhash = "%016x" % hash
+                (l_value,) = struct.unpack(longlongformat, buffer)
+                hash_ += l_value
+                hash_ = hash_ & 0xFFFFFFFFFFFFFFFF
+        returnedhash = "%016x" % hash_
         return returnedhash
-    except(IOError):
+    except IOError:
        return "IOError"