From 4e7898ae575483da492b2d6d1e936a224ff7ddea Mon Sep 17 00:00:00 2001
From: j
Date: Wed, 8 Jun 2016 15:30:25 +0200
Subject: [PATCH] avoid dict as default value

---
 ox/cache.py | 43 +++++++++++++++-------------
 ox/net.py   | 81 +++++++++++++++++++++++++++--------------------------
 2 files changed, 64 insertions(+), 60 deletions(-)

diff --git a/ox/cache.py b/ox/cache.py
index b8264de..b08e8d2 100644
--- a/ox/cache.py
+++ b/ox/cache.py
@@ -4,14 +4,15 @@
 from __future__ import with_statement, print_function

 import gzip
-import zlib
 import hashlib
 import os
-from six import BytesIO
+import sqlite3
 import time
+import zlib
+
+from six import BytesIO
 from six.moves import urllib
 from six import PY2
-import sqlite3

 from .utils import json
 from .file import makedirs
@@ -19,7 +20,8 @@ from .file import makedirs
 from . import net
 from .net import DEFAULT_HEADERS, detect_encoding

-cache_timeout = 30*24*60*60 # default is 30 days
+
+cache_timeout = 30*24*60*60  # default is 30 days

 COMPRESS_TYPES = (
     'text/html',
@@ -33,7 +35,7 @@ COMPRESS_TYPES = (
     'application/rss+xml'
 )

-def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
+def status(url, data=None, headers=None, timeout=cache_timeout):
     '''
     >>> status('http://google.com')
     200
@@ -43,7 +45,7 @@ def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
     headers = get_headers(url, data, headers)
     return int(headers['status'])

-def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
+def exists(url, data=None, headers=None, timeout=cache_timeout):
     '''
     >>> exists('http://google.com')
     True
@@ -55,14 +57,14 @@ def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
         return True
     return False

-def get_headers(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
+def get_headers(url, data=None, headers=None, timeout=cache_timeout):
     url_headers = store.get(url, data, headers, timeout, "headers")
     if not url_headers:
         url_headers = net.get_headers(url, data, headers)
         store.set(url, data, -1, url_headers)
     return url_headers

-def get_json(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
+def get_json(url, data=None, headers=None, timeout=cache_timeout):
     return json.loads(read_url(url, data, headers, timeout).decode('utf-8'))

 class InvalidResult(Exception):
@@ -76,7 +78,7 @@ def _fix_unicode_url(url):
         url = url.encode('utf-8')
     return url

-def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None, unicode=False):
+def read_url(url, data=None, headers=None, timeout=cache_timeout, valid=None, unicode=False):
     '''
     url - url to load
     data - possible post data
@@ -87,8 +89,8 @@ def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, val
     '''
     if net.DEBUG:
         print('ox.cache.read_url', url)
-    #FIXME: send last-modified / etag from cache and only update if needed
-    #url = _fix_unicode_url(url)
+    # FIXME: send last-modified / etag from cache and only update if needed
+    # url = _fix_unicode_url(url)
     result = store.get(url, data, headers, timeout)
     url_headers = {}
     if not result:
@@ -116,7 +118,7 @@ def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, val
         result = result.decode(encoding)
     return result

-get_url=read_url
+get_url = read_url

 def save_url(url, filename, overwrite=False):
     if not os.path.exists(filename) or overwrite:
@@ -134,7 +136,7 @@ class Cache:
     def __init__(self):
         pass

-    def get(self, url, data, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
+    def get(self, url, data, headers=None, timeout=-1, value="data"):
         '''
         if value == 'data' return data of url if its in the cache else None
         if value == 'headers' return headers for url
@@ -192,7 +194,7 @@ class SQLiteCache(Cache):
     def set_setting(self, c, key, value):
         c.execute(u'INSERT OR REPLACE INTO setting values (?, ?)', (key, str(value)))

-    def get(self, url, data={}, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
+    def get(self, url, data={}, headers=None, timeout=-1, value="data"):
         r = None
         if timeout == 0:
             return r
@@ -225,7 +227,7 @@ class SQLiteCache(Cache):
         conn.close()
         return r

-    def delete(self, url, data=None, headers=DEFAULT_HEADERS):
+    def delete(self, url, data=None, headers=None):
         url_hash = self.get_url_hash(url, data)
         conn = self.connect()
         c = conn.cursor()
@@ -244,7 +246,8 @@ class SQLiteCache(Cache):
         c = conn.cursor()

         # Insert a row of data
-        if not post_data: post_data=""
+        if not post_data:
+            post_data = ""
         only_headers = 0
         if data == -1:
             only_headers = 1
@@ -284,7 +287,7 @@ class FileCache(Cache):
         f = os.path.join(prefix, '%s.dat'%h)
         return prefix, i, f

-    def get(self, url, data={}, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
+    def get(self, url, data={}, headers=None, timeout=-1, value="data"):
         r = None
         if timeout == 0:
             return r
@@ -308,13 +311,13 @@ class FileCache(Cache):
                 if value == 'headers':
                     r = info['headers']
                 else:
-                    with open(f, 'rb') as data:
-                        r = data.read()
+                    with open(f, 'rb') as fd:
+                        r = fd.read()
                     if info['compressed']:
                         r = zlib.decompress(r)
         return r

-    def delete(self, url, data=None, headers=DEFAULT_HEADERS):
+    def delete(self, url, data=None, headers=None):
         url_hash = self.get_url_hash(url, data)
         domain = self.get_domain(url)

diff --git a/ox/net.py b/ox/net.py
index 46ef0e1..6afde25 100644
--- a/ox/net.py
+++ b/ox/net.py
@@ -23,7 +23,7 @@ DEFAULT_HEADERS = {
     'Accept-Encoding': 'gzip'
 }

-def status(url, data=None, headers=DEFAULT_HEADERS):
+def status(url, data=None, headers=None):
     try:
         f = open_url(url, data, headers)
         s = f.code
@@ -31,13 +31,13 @@
         s = e.code
     return s

-def exists(url, data=None, headers=DEFAULT_HEADERS):
+def exists(url, data=None, headers=None):
     s = status(url, data, headers)
     if s >= 200 and s < 400:
         return True
     return False

-def get_headers(url, data=None, headers=DEFAULT_HEADERS):
+def get_headers(url, data=None, headers=None):
     try:
         f = open_url(url, data, headers)
         f.headers['Status'] = "%s" % f.code
@@ -48,10 +48,12 @@
         headers = e.headers
     return dict(headers)

-def get_json(url, data=None, headers=DEFAULT_HEADERS):
-    return json.loads(read_url(url, data, headers).decode('utf-8'))
+def get_json(url, data=None, headers=None):
+    return json.loads(read_url(url, data, headers).decode('utf-8')) # pylint: disable=no-member

-def open_url(url, data=None, headers=DEFAULT_HEADERS):
+def open_url(url, data=None, headers=None):
+    if headers is None:
+        headers = DEFAULT_HEADERS.copy()
     if PY2:
         if not isinstance(url, bytes):
             url = url.encode('utf-8')
@@ -64,7 +66,7 @@ def open_url(url, data=None, headers=DEFAULT_HEADERS):
     req = urllib.request.Request(url, data, headers)
     return urllib.request.urlopen(req)

-def read_url(url, data=None, headers=DEFAULT_HEADERS, return_headers=False, unicode=False):
+def read_url(url, data=None, headers=None, return_headers=False, unicode=False):
     if DEBUG:
         print('ox.net.read_url', url)
     f = open_url(url, data, headers)
@@ -108,7 +110,7 @@
     detector.close()
     return detector.result['encoding']

-get_url=read_url
+get_url = read_url

 def save_url(url, filename, overwrite=False):
     if not os.path.exists(filename) or overwrite:
@@ -119,51 +121,50 @@ def save_url(url, filename, overwrite=False):
         with open(filename, 'wb') as f:
             f.write(data)

+def _get_size(url):
+    req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy())
+    req.get_method = lambda: 'HEAD'
+    u = urllib.request.urlopen(req)
+    if u.code != 200 or 'Content-Length' not in u.headers:
+        raise IOError
+    return int(u.headers['Content-Length'])
+
+def _get_range(url, start, end):
+    headers = DEFAULT_HEADERS.copy()
+    headers['Range'] = 'bytes=%s-%s' % (start, end)
+    req = urllib.request.Request(url, headers=headers)
+    u = urllib.request.urlopen(req)
+    return u.read()
+
 def oshash(url):
-    def get_size(url):
-        req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy())
-        req.get_method = lambda : 'HEAD'
-        u = urllib.request.urlopen(req)
-        if u.code != 200 or not 'Content-Length' in u.headers:
-            raise IOError
-        return int(u.headers['Content-Length'])
-
-    def get_range(url, start, end):
-        headers = DEFAULT_HEADERS.copy()
-        headers['Range'] = 'bytes=%s-%s' % (start, end)
-        req = urllib.request.Request(url, headers=headers)
-        u = urllib.request.urlopen(req)
-        return u.read()
-
     try:
         longlongformat = 'q' # long long
         bytesize = struct.calcsize(longlongformat)

-        filesize = get_size(url)
-        hash = filesize
-        head = get_range(url, 0, min(filesize, 65536))
+        filesize = _get_size(url)
+        hash_ = filesize
+        head = _get_range(url, 0, min(filesize, 65536))
         if filesize > 65536:
-            tail = get_range(url, filesize-65536, filesize)
+            tail = _get_range(url, filesize-65536, filesize)
         if filesize < 65536:
             f = BytesIO(head)
-            for x in range(int(filesize/bytesize)):
+            for _ in range(int(filesize/bytesize)):
                 buffer = f.read(bytesize)
-                (l_value,)= struct.unpack(longlongformat, buffer)
-                hash += l_value
-                hash = hash & 0xFFFFFFFFFFFFFFFF #cut off 64bit overflow
+                (l_value,) = struct.unpack(longlongformat, buffer)
+                hash_ += l_value
+                hash_ = hash_ & 0xFFFFFFFFFFFFFFFF # cut off 64bit overflow
         else:
             for offset in range(0, 65536, bytesize):
                 buffer = head[offset:offset+bytesize]
-                (l_value,)= struct.unpack(longlongformat, buffer)
-                hash += l_value
-                hash = hash & 0xFFFFFFFFFFFFFFFF #cut of 64bit overflow
+                (l_value,) = struct.unpack(longlongformat, buffer)
+                hash_ += l_value
+                hash_ = hash_ & 0xFFFFFFFFFFFFFFFF # cut of 64bit overflow
             for offset in range(0, 65536, bytesize):
                 buffer = tail[offset:offset+bytesize]
-                (l_value,)= struct.unpack(longlongformat, buffer)
-                hash += l_value
-                hash = hash & 0xFFFFFFFFFFFFFFFF
-        returnedhash = "%016x" % hash
+                (l_value,) = struct.unpack(longlongformat, buffer)
+                hash_ += l_value
+                hash_ = hash_ & 0xFFFFFFFFFFFFFFFF
+        returnedhash = "%016x" % hash_
         return returnedhash
-    except(IOError):
+    except IOError:
         return "IOError"
-
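
Why the change matters: Python evaluates default argument values once, at function definition time, so a mutable default such as headers=DEFAULT_HEADERS means every call shares the same dict object, and any mutation of it leaks into DEFAULT_HEADERS itself and into later calls. Below is a minimal sketch of that pitfall and of the None-sentinel idiom that open_url() now uses; the fetch_bad/fetch_good names and the Referer header are illustrative only, not part of ox.

    # shared template, analogous to ox.net.DEFAULT_HEADERS
    DEFAULT_HEADERS = {'User-Agent': 'example', 'Accept-Encoding': 'gzip'}

    def fetch_bad(url, headers=DEFAULT_HEADERS):
        # the default dict was created once, so this mutates the shared template
        headers['Referer'] = url
        return headers

    def fetch_good(url, headers=None):
        # None sentinel: build a private copy per call, as open_url() now does
        if headers is None:
            headers = DEFAULT_HEADERS.copy()
        headers['Referer'] = url
        return headers

    fetch_bad('http://a.example')
    print('Referer' in DEFAULT_HEADERS)   # True: the module-level dict was polluted
    fetch_good('http://b.example')
    print(DEFAULT_HEADERS['Referer'])     # still 'http://a.example': the copy protected it

The same idiom applies to any mutable default (lists, sets, dicts); copying inside the function keeps the module-level dict a read-only template.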