diff --git a/ox/__init__.py b/ox/__init__.py
index 4620e8f..98402fb 100644
--- a/ox/__init__.py
+++ b/ox/__init__.py
@@ -5,7 +5,7 @@ try:
     from . import __version
     __version__ = __version.VERSION
 except:
-    __version__ = '3.0.x'
+    __version__ = '2.3.x'

 from . import cache
 from . import js
diff --git a/ox/api.py b/ox/api.py
index 88af34f..639fec0 100644
--- a/ox/api.py
+++ b/ox/api.py
@@ -10,10 +10,10 @@ import shutil
 import sys
 import time

-from http import cookiejar as cookielib
-from io import BytesIO
-import urllib
-from urllib.parse import urlparse
+from six.moves import http_cookiejar as cookielib
+from six import BytesIO, PY2
+from six.moves import urllib
+from six.moves.urllib.parse import urlparse

 import requests

 from . import __version__
@@ -56,7 +56,10 @@ class API(object):
     def _add_method(self, method, name):
         if name is None:
             name = method.func_name
-        setattr(self, name, MethodType(method, self))
+        if PY2:
+            setattr(self, name, MethodType(method, self, type(self)))
+        else:
+            setattr(self, name, MethodType(method, self))

     def _add_action(self, action):
         def method(self, *args, **kw):
@@ -70,7 +73,10 @@
             return self._request(action, kw)
         if 'doc' in self._properties[action]:
             method.__doc__ = self._properties[action]['doc']
-        method.func_name = action
+        if PY2:
+            method.func_name = str(action)
+        else:
+            method.func_name = action
         self._add_method(method, action)

     def _json_request(self, url, data, files=None):
diff --git a/ox/cache.py b/ox/cache.py
index 3954ea7..ba41574 100644
--- a/ox/cache.py
+++ b/ox/cache.py
@@ -10,11 +10,15 @@ import sqlite3
 import time
 import zlib

-from io import BytesIO
-import urllib
-import requests
-from requests.structures import CaseInsensitiveDict
-
+from six import BytesIO
+from six.moves import urllib
+from six import PY2
+try:
+    import requests
+    USE_REQUESTS = True
+    requests_session = requests.Session()
+except:
+    USE_REQUESTS = False
 from .utils import json
 from .file import makedirs
@@ -24,7 +28,6 @@
 from .net import DEFAULT_HEADERS, detect_encoding

 cache_timeout = 30*24*60*60 # default is 30 days

-requests_session = requests.Session()

 COMPRESS_TYPES = (
     'text/html',
@@ -66,7 +69,7 @@ def get_headers(url, data=None, headers=None, timeout=cache_timeout):
     if not url_headers:
         url_headers = net.get_headers(url, data, headers)
         store.set(url, data, -1, url_headers)
-    return CaseInsensitiveDict(url_headers)
+    return url_headers

 def get_json(url, data=None, headers=None, timeout=cache_timeout):
     return json.loads(read_url(url, data, headers, timeout).decode('utf-8'))
@@ -98,20 +101,35 @@ def read_url(url, data=None, headers=None, timeout=cache_timeout, valid=None, un
     result = store.get(url, data, headers, timeout)
     url_headers = {}
     if not result:
-        if headers is None:
-            headers = DEFAULT_HEADERS.copy()
-        if data:
-            r = requests_session.post(url, data=data, headers=headers)
+        if USE_REQUESTS:
+            if headers is None:
+                headers = DEFAULT_HEADERS.copy()
+            if data:
+                r = requests_session.post(url, data=data, headers=headers)
+            else:
+                r = requests_session.get(url, headers=headers)
+            for key in r.headers:
+                url_headers[key.lower()] = r.headers[key]
+            result = r.content
+            url_headers['Status'] = "%s" % r.status_code
+            if not valid or valid(result, url_headers):
+                store.set(url, post_data=data, data=result, headers=url_headers)
+            else:
+                raise InvalidResult(result, url_headers)
         else:
-            r = requests_session.get(url, headers=headers)
-        for key in r.headers:
-            url_headers[key.lower()] = r.headers[key]
-        result = r.content
-        url_headers['Status'] = "%s" % r.status_code
-        if not valid or valid(result, url_headers):
-            store.set(url, post_data=data, data=result, headers=url_headers)
-        else:
-            raise InvalidResult(result, url_headers)
+            try:
+                url_headers, result = net.read_url(url, data, headers, return_headers=True)
+            except urllib.error.HTTPError as e:
+                e.headers['Status'] = "%s" % e.code
+                for key in e.headers:
+                    url_headers[key.lower()] = e.headers[key]
+                result = e.read()
+                if url_headers.get('content-encoding', None) == 'gzip':
+                    result = gzip.GzipFile(fileobj=BytesIO(result)).read()
+            if not valid or valid(result, url_headers):
+                store.set(url, post_data=data, data=result, headers=url_headers)
+            else:
+                raise InvalidResult(result, url_headers)
     if unicode:
         ctype = url_headers.get('content-type', '').lower()
         if 'charset' in ctype:
@@ -224,6 +242,8 @@
             elif value == 'data':
                 if row[1] == 1:
                     r = zlib.decompress(r)
+                elif PY2:
+                    r = str(r)
                 break
         c.close()
@@ -262,8 +282,6 @@
             data = zlib.compress(data)
         else:
             compressed = 0
-        if isinstance(data, str):
-            data = data.encode("utf-8")
         data = sqlite3.Binary(data)

         #fixme: this looks wrong
diff --git a/ox/fixunicode.py b/ox/fixunicode.py
index e0386c6..d3a162d 100644
--- a/ox/fixunicode.py
+++ b/ox/fixunicode.py
@@ -6,6 +6,7 @@ from __future__ import print_function

 import unicodedata

+from six import unichr, text_type

 __all__ = ['fix_bad_unicode']
@@ -150,7 +151,7 @@
     - Improbable single-byte characters, such as ƒ or ¬
     - Letters in somewhat rare scripts
     '''
-    assert isinstance(text, str)
+    assert isinstance(text, text_type)
     errors = 0
     very_weird_things = 0
     weird_things = 0
@@ -288,7 +289,7 @@ SINGLE_BYTE_WEIRDNESS = (
 # Pre-cache the Unicode data saying which of these first 256 characters are
 # letters. We'll need it often.
 SINGLE_BYTE_LETTERS = [
-    unicodedata.category(chr(i)).startswith('L')
+    unicodedata.category(unichr(i)).startswith('L')
     for i in range(256)
 ]
diff --git a/ox/form.py b/ox/form.py
index 1a182c2..faa1551 100644
--- a/ox/form.py
+++ b/ox/form.py
@@ -9,6 +9,8 @@ import os
 import hashlib
 import sys

+from six import PY2
+

 __all__ = ['MultiPartForm']
@@ -61,6 +63,8 @@ class MultiPartForm(object):

     def __str__(self):
         body = self.body()
+        if not PY2:
+            body = body.decode('utf-8')
         return body

     def body(self):
diff --git a/ox/format.py b/ox/format.py
index 83756c1..ad18c31 100644
--- a/ox/format.py
+++ b/ox/format.py
@@ -4,6 +4,8 @@ import math
 import re
 import string

+from six import text_type
+

 def toAZ(num):
     """
     Converts an integer to bijective base 26 string using A-Z
@@ -106,7 +108,7 @@ def to32(q):
     >>> to32(555306645)
     'GHJKMN'

-    >>> to32(800197332334559)
+    >>> to32(800197332334559L)
     'PQRSTVWXYZ'

     >>> to32(32)
@@ -224,36 +226,36 @@ def to36(q):

 def from36(q):
     return int(q, 36)

-def int_value(strValue, default=''):
+def int_value(strValue, default=u''):
     """
     >>> int_value('abc23')
-    '23'
+    u'23'

     >>> int_value(' abc23')
-    '23'
+    u'23'

     >>> int_value('ab')
-    ''
+    u''
     """
     try:
-        val = re.compile('(\d+)').findall(str(strValue).strip())[0]
+        val = re.compile('(\d+)').findall(text_type(strValue).strip())[0]
     except:
         val = default
     return val

-def float_value(strValue, default=''):
+def float_value(strValue, default=u''):
     """
     >>> float_value('abc23.4')
-    '23.4'
+    u'23.4'

     >>> float_value(' abc23.4')
-    '23.4'
+    u'23.4'

     >>> float_value('ab')
-    ''
+    u''
     """
     try:
-        val = re.compile('([\d.]+)').findall(str(strValue).strip())[0]
+        val = re.compile('([\d.]+)').findall(text_type(strValue).strip())[0]
     except:
         val = default
     return val
diff --git a/ox/html.py b/ox/html.py
index 8666713..73234ea 100644
--- a/ox/html.py
+++ b/ox/html.py
@@ -3,7 +3,8 @@
 # GPL 2008
 import re
 import string
-from html.entities import name2codepoint
+from six.moves.html_entities import name2codepoint
+from six import unichr, PY2, string_types

 letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
@@ -25,7 +26,8 @@ link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
 html_gunk_re = re.compile(r'(?:<br clear="all">|<\/i>|<\/b>|<\/em>|<\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
 hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
 trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
-
+if PY2:
+    del x # Temporary variable

 def escape(html):
     '''
@@ -34,7 +36,7 @@ def escape(html):
     >>> escape('html "test" & <brothers>')
     'html &quot;test&quot; &amp; &lt;brothers&gt;'
     '''
-    if not isinstance(html, str):
+    if not isinstance(html, string_types):
         html = str(html)
     return html.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&#39;')
@@ -145,20 +147,20 @@ charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')

 def decode_html(html):
     """
     >>> decode_html('me &amp; you and &#36;&#38;%')
-    'me & you and $&%'
+    u'me & you and $&%'
     >>> decode_html('&#x80;')
-    '\u20ac'
+    u'\u20ac'
     >>> decode_html('Anniversary of Daoud&#39;s Republic')
-    "Anniversary of Daoud's Republic"
+    u"Anniversary of Daoud's Republic"
     """
     if isinstance(html, bytes):
         html = html.decode('utf-8')
-    uchr = chr
+    uchr = unichr
     def entitydecode(match, uchr=uchr):
         entity = match.group(1)
         if entity == '#x80':
-            return '€'
+            return u'€'
         elif entity.startswith('#x'):
             return uchr(int(entity[2:], 16))
         elif entity.startswith('#'):
@@ -169,7 +171,7 @@
             return "'"
         else:
             return match.group(0)
-    return charrefpat.sub(entitydecode, html).replace('\xa0', ' ')
+    return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ')

 def highlight(text, query, hlClass="hl"):
     """
@@ -187,51 +189,51 @@ def escape_html(value):
     '''
-    >>> escape_html('<script>alert()</script>')
-    '&lt;script&gt;alert()&lt;/script&gt;'
+    >>> escape_html(u'<script>alert()</script>')
+    u'&lt;script&gt;alert()&lt;/script&gt;'
     '''
     return escape(decode_html(value))

 def sanitize_html(html, tags=None, global_attributes=[]):
     '''
     >>> sanitize_html("'foo' < 'bar' && \\"foo\\" > \\"bar\\"")
-    '\\'foo\\' &lt; \\'bar\\' &amp;&amp; "foo" &gt; "bar"'
+    u'\\'foo\\' &lt; \\'bar\\' &amp;&amp; "foo" &gt; "bar"'
     >>> sanitize_html('<b>foo</b>')
-    '<b>foo</b>'
+    u'<b>foo</b>'
     >>> sanitize_html('<b >foo</b>')
-    '<b>foo</b>'
+    u'<b>foo</b>'
     >>> sanitize_html('Anniversary of Daoud&#39;s Republic')
-    "Anniversary of Daoud's Republic"
+    u"Anniversary of Daoud's Republic"
     >>> sanitize_html('')
-    ''
+    u''
     >>> sanitize_html(' ')
-    ' '
-    >>> sanitize_html('&nbsp;') # canonicalised to a space: okay, I suppose
-    ' '
-    >>> sanitize_html('\u00a0') # also nbsp
-    ' '
+    u' '
+    >>> sanitize_html(u'&nbsp;') # canonicalised to a space: okay, I suppose
+    u' '
+    >>> sanitize_html(u'\u00a0') # also nbsp
+    u' '
     '''
     if not tags:
         valid_url = '^((https?:\/\/|\/|mailto:).*?)'
@@ -412,24 +414,24 @@
     are quoted, etc. Does not strip potentially-malicious HTML: use
     sanitize_html() for that.

-    >>> sanitize_fragment('')
-    ''
-    >>> sanitize_fragment('')
-    ''
-    >>> sanitize_fragment('')
-    ''
-    >>> sanitize_fragment('foo')
-    'foo'
-    >>> sanitize_fragment('')
-    ''
-    >>> sanitize_fragment(' ')
-    ' '
-    >>> sanitize_fragment('&nbsp;')
-    '\\xa0'
-    >>> sanitize_fragment('\\u00a0') # nbsp
-    '\\xa0'
-    >>> sanitize_fragment('\\ufeff') # zero-width no-break space
-    '\\ufeff'
+    >>> sanitize_fragment(u'')
+    u''
+    >>> sanitize_fragment(u'')
+    u''
+    >>> sanitize_fragment(u'')
+    u''
+    >>> sanitize_fragment(u'foo')
+    u'foo'
+    >>> sanitize_fragment(u'')
+    u''
+    >>> sanitize_fragment(u' ')
+    u' '
+    >>> sanitize_fragment(u'&nbsp;')
+    u'\\xa0'
+    >>> sanitize_fragment(u'\\u00a0') # nbsp
+    u'\\xa0'
+    >>> sanitize_fragment(u'\\ufeff') # zero-width no-break space
+    u'\\ufeff'
     '''

     '''
@@ -440,12 +442,7 @@
     if not html.strip():
         return html
     import lxml.html
-    try:
-        body = lxml.html.document_fromstring(html).find('body')
-    except lxml.etree.ParserError as e:
-        if e.args and e.args[0] == 'Document is empty':
-            return html
-        raise e
+    body = lxml.html.document_fromstring(html).find('body')
     html = lxml.html.tostring(body, encoding='utf-8')[6:-7].decode('utf-8')
     if html.startswith('<p>') and html.endswith('</p>'):
         html = html[3:-4]
diff --git a/ox/js.py b/ox/js.py
index 9e9f1cb..2f419bd 100644
--- a/ox/js.py
+++ b/ox/js.py
@@ -2,12 +2,19 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4

+from six import PY2
 from .utils import json

 def minify(source, comment=''):
     # see https://github.com/douglascrockford/JSMin/blob/master/README
     def get_next_non_whitespace_token():
         pass
+    # python2 performance with unicode string is terrible
+    if PY2:
+        if isinstance(source, unicode): # pylint: disable=undefined-variable
+            source = source.encode('utf-8')
+        if isinstance(comment, unicode): # pylint: disable=undefined-variable
+            comment = comment.encode('utf-8')
     tokens = tokenize(source)
     length = len(tokens)
     minified = '/*' + comment + '*/' if comment else ''
diff --git a/ox/net.py b/ox/net.py
index 4d58bad..3a07d91 100644
--- a/ox/net.py
+++ b/ox/net.py
@@ -8,10 +8,13 @@ import os
 import re
 import struct

-import requests
-
-from io import BytesIO
-import urllib
+try:
+    import requests
+    USE_REQUESTS = True
+except:
+    USE_REQUESTS = False
+from six import BytesIO, PY2
+from six.moves import urllib

 from chardet.universaldetector import UniversalDetector
@@ -56,10 +59,14 @@ def get_json(url, data=None, headers=None):
 def open_url(url, data=None, headers=None):
     if headers is None:
         headers = DEFAULT_HEADERS.copy()
-    if isinstance(url, bytes):
-        url = url.decode('utf-8')
+    if PY2:
+        if not isinstance(url, bytes):
+            url = url.encode('utf-8')
+    else:
+        if isinstance(url, bytes):
+            url = url.decode('utf-8')
     url = url.replace(' ', '%20')
-    if data and not isinstance(data, bytes):
+    if data and not PY2 and not isinstance(data, bytes):
         data = data.encode('utf-8')
     req = urllib.request.Request(url, data, headers)
     return urllib.request.urlopen(req)
@@ -116,11 +123,16 @@ def save_url(url, filename, overwrite=False):
     if dirname and not os.path.exists(dirname):
         os.makedirs(dirname)
     headers = DEFAULT_HEADERS.copy()
-    r = requests.get(url, headers=headers, stream=True)
-    with open(filename, 'wb') as f:
-        for chunk in r.iter_content(chunk_size=1024):
-            if chunk: # filter out keep-alive new chunks
-                f.write(chunk)
+    if USE_REQUESTS:
+        r = requests.get(url, headers=headers, stream=True)
+        with open(filename, 'wb') as f:
+            for chunk in r.iter_content(chunk_size=1024):
+                if chunk: # filter out keep-alive new chunks
+                    f.write(chunk)
+    else:
+        data = read_url(url)
+        with open(filename, 'wb') as f:
+            f.write(data)

 def _get_size(url):
     req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy())
diff --git a/ox/normalize.py b/ox/normalize.py
index 4ee9293..dea40ae 100644
--- a/ox/normalize.py
+++ b/ox/normalize.py
@@ -4,6 +4,8 @@ import re
 import unicodedata

+from six import string_types
+

 _articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el',
              "l'", 'il', 'das', 'les', 'o', 'ein', 'i', 'un', 'los', 'de',
@@ -101,7 +103,7 @@ def normalize_imdbid(imdbId):
     >>> normalize_imdbid('tt0159206')
     '0159206'
     """
-    if isinstance(imdbId, str):
+    if isinstance(imdbId, string_types):
         imdbId = re.sub('.*(\d{7}).*', '\\1', imdbId)
     elif isinstance(imdbId, int):
         imdbId = "%07d" % imdbId
diff --git a/ox/srt.py b/ox/srt.py
index 464c08e..c29ae8b 100644
--- a/ox/srt.py
+++ b/ox/srt.py
@@ -5,6 +5,7 @@ import codecs
 import re

 import chardet
+from six import PY2

 import ox
@@ -23,7 +24,10 @@ def _detect_encoding(fp):
     # go to beginning of file and get the first 4 bytes
     oldFP = fp.tell()
     fp.seek(0)
-    (byte1, byte2, byte3, byte4) = fp.read(4)
+    if PY2:
+        (byte1, byte2, byte3, byte4) = [ord(b) for b in fp.read(4)]
+    else:
+        (byte1, byte2, byte3, byte4) = fp.read(4)

     # try bom detection using 4 bytes, 3 bytes, or 2 bytes
     bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
diff --git a/ox/text.py b/ox/text.py
index d650262..282afa2 100644
--- a/ox/text.py
+++ b/ox/text.py
@@ -1,13 +1,11 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
 # GPL 2008
-import gzip
 import math
 import re
 import unicodedata

-from io import BytesIO
-from functools import reduce
+from six.moves import reduce

 ARTICLES = list(set([
     # def sg, def pl, indef sg, indef pl (each m/f/n)
@@ -475,10 +473,10 @@ def wrap(text, width):

 def wrap_string(string, length=80, separator='\n', balance=False):
     '''
-    >>> wrap_string("Anticonstitutionellement, Paris s'eveille", 16)
-    "Anticonstitution\\nellement, Paris \\ns'eveille"
+    >>> wrap_string(u"Anticonstitutionellement, Paris s'eveille", 16)
+    u"Anticonstitution\\nellement, Paris \\ns'eveille"
     >>> wrap_string(u'All you can eat', 12, '\\n', True)
-    'All you \\ncan eat'
+    u'All you \\ncan eat'
     '''
     words = string.split(' ')
     if balance:
@@ -493,20 +491,20 @@
             break
     lines = ['']
     for word in words:
-        if len(lines[len(lines) - 1] + word + ' ') <= length + 1:
+        if len(lines[len(lines) - 1] + word + u' ') <= length + 1:
             # word fits in current line
-            lines[len(lines) - 1] += word + ' '
+            lines[len(lines) - 1] += word + u' '
         else:
             if len(word) <= length:
                 # word fits in next line
-                lines.append(word + ' ')
+                lines.append(word + u' ')
             else:
                 # word is longer than line
                 position = length - len(lines[len(lines) - 1])
                 lines[len(lines) - 1] += word[0:position]
                 for i in range(position, len(word), length):
                     lines.append(word[i:i+length])
-                lines[len(lines) - 1] += ' '
+                lines[len(lines) - 1] += u' '
     return separator.join(lines).strip()

 def truncate_string(string, length, padding='...', position='right'):
@@ -578,14 +576,14 @@ def get_valid_filename(s):

 def get_text_list(list_, last_word='or'):
     """
-    >>> get_text_list(['a', 'b', 'c', 'd'])
-    'a, b, c or d'
-    >>> get_text_list(['a', 'b', 'c'], 'and')
-    'a, b and c'
-    >>> get_text_list(['a', 'b'], 'and')
-    'a and b'
-    >>> get_text_list(['a'])
-    'a'
+    >>> get_text_list([u'a', u'b', u'c', u'd'])
+    u'a, b, c or d'
+    >>> get_text_list([u'a', u'b', u'c'], 'and')
+    u'a, b and c'
+    >>> get_text_list([u'a', u'b'], 'and')
+    u'a and b'
+    >>> get_text_list([u'a'])
+    u'a'
     >>> get_text_list([])
     ''
     """
@@ -593,24 +591,24 @@
         return ''
     if len(list_) == 1:
         return list_[0]
-    return '%s %s %s' % (', '.join([i for i in list_][:-1]), last_word, list_[-1])
+    return u'%s %s %s' % (u', '.join([i for i in list_][:-1]), last_word, list_[-1])

 def get_list_text(text, last_word='or'):
     """
-    >>> get_list_text('a, b, c or d')
-    ['a', 'b', 'c', 'd']
-    >>> get_list_text('a, b and c', 'and')
-    ['a', 'b', 'c']
-    >>> get_list_text('a and b', 'and')
-    ['a', 'b']
-    >>> get_list_text('a')
-    ['a']
-    >>> get_list_text('')
+    >>> get_list_text(u'a, b, c or d')
+    [u'a', u'b', u'c', u'd']
+    >>> get_list_text(u'a, b and c', u'and')
+    [u'a', u'b', u'c']
+    >>> get_list_text(u'a and b', u'and')
+    [u'a', u'b']
+    >>> get_list_text(u'a')
+    [u'a']
+    >>> get_list_text(u'')
     []
     """
     list_ = []
     if text:
-        list_ = text.split(', ')
+        list_ = text.split(u', ')
     if list_:
         i = len(list_)-1
         last = list_[i].split(last_word)
@@ -648,6 +646,8 @@ def phone2numeric(phone):
     return letters.sub(char2number, phone)

 def compress_string(s):
+    import gzip
+    from six import BytesIO
     zbuf = BytesIO()
     zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
     zfile.write(s)
@@ -682,7 +682,7 @@ def words(text):
     return [re.sub("(([.!?:-_]|'s)$)", '', x) for x in text]

 def sort_string(string):
-    string = string.replace('Æ', 'AE').replace('Ø', 'O').replace('Þ', 'Th')
+    string = string.replace(u'Æ', 'AE').replace(u'Ø', 'O').replace(u'Þ', 'Th')

     # pad numbered titles
     string = re.sub('(\d),(\d{3})', '\\1\\2', string)
diff --git a/ox/torrent/__init__.py b/ox/torrent/__init__.py
index 9c399fe..a250215 100644
--- a/ox/torrent/__init__.py
+++ b/ox/torrent/__init__.py
@@ -5,8 +5,12 @@ from threading import Event
 from hashlib import sha1
 import os

+from six import PY2

-from .bencode3 import bencode, bdecode
+if PY2:
+    from .bencode import bencode, bdecode
+else:
+    from .bencode3 import bencode, bdecode

 __all__ = ['create_torrent', 'get_info_hash', 'get_torrent_info', 'get_files', 'get_torrent_size']
diff --git a/ox/torrent/bencode.py b/ox/torrent/bencode.py
new file mode 100644
index 0000000..b586001
--- /dev/null
+++ b/ox/torrent/bencode.py
@@ -0,0 +1,321 @@
+# Written by Petru Paler, Uoti Urpala, Ross Cohen and John Hoffman
+# see LICENSE.txt for license information
+from __future__ import print_function
+
+from types import IntType, LongType, StringType, ListType, TupleType, DictType
+try:
+    from types import BooleanType
+except ImportError:
+    BooleanType = None
+try:
+    from types import UnicodeType
+except ImportError:
+    UnicodeType = None
+from cStringIO import StringIO
+
+def decode_int(x, f):
+    f += 1
+    newf = x.index('e', f)
+    try:
+        n = int(x[f:newf])
+    except:
+        n = long(x[f:newf])
+    if x[f] == '-':
+        if x[f + 1] == '0':
+            raise ValueError
+    elif x[f] == '0' and newf != f+1:
+        raise ValueError
+    return (n, newf+1)
+
+def decode_string(x, f):
+    colon = x.index(':', f)
+    try:
+        n = int(x[f:colon])
+    except (OverflowError, ValueError):
+        n = long(x[f:colon])
+    if x[f] == '0' and colon != f+1:
+        raise ValueError
+    colon += 1
+    return (x[colon:colon+n], colon+n)
+
+def decode_unicode(x, f):
+    s, f = decode_string(x, f+1)
+    return (s.decode('UTF-8'),f)
+
+def decode_list(x, f):
+    r, f = [], f+1
+    while x[f] != 'e':
+        v, f = decode_func[x[f]](x, f)
+        r.append(v)
+    return (r, f + 1)
+
+def decode_dict(x, f):
+    r, f = {}, f+1
+    lastkey = None
+    while x[f] != 'e':
+        k, f = decode_string(x, f)
+        # why is this needed
+        # if lastkey >= k:
+        #    raise ValueError
+        lastkey = k
+        r[k], f = decode_func[x[f]](x, f)
+    return (r, f + 1)
+
+decode_func = {}
+decode_func['l'] = decode_list
+decode_func['d'] = decode_dict
+decode_func['i'] = decode_int
+decode_func['0'] = decode_string
+decode_func['1'] = decode_string
+decode_func['2'] = decode_string
+decode_func['3'] = decode_string
+decode_func['4'] = decode_string
+decode_func['5'] = decode_string
+decode_func['6'] = decode_string
+decode_func['7'] = decode_string
+decode_func['8'] = decode_string
+decode_func['9'] = decode_string
+#decode_func['u'] = decode_unicode
+
+def bdecode(x, sloppy = 1):
+    try:
+        r, l = decode_func[x[0]](x, 0)
+#    except (IndexError, KeyError):
+    except (IndexError, KeyError, ValueError):
+        raise ValueError("bad bencoded data")
+    if not sloppy and l != len(x):
+        raise ValueError("bad bencoded data")
+    return r
+
+def test_bdecode():
+    try:
+        bdecode('0:0:')
+        assert 0
+    except ValueError:
+        pass
+    try:
+        bdecode('ie')
+        assert 0
+    except ValueError:
+        pass
+    try:
+        bdecode('i341foo382e')
+        assert 0
+    except ValueError:
+        pass
+    assert bdecode('i4e') == 4
+    assert bdecode('i0e') == 0
+    assert bdecode('i123456789e') == 123456789
+    assert bdecode('i-10e') == -10
+    try:
+        bdecode('i-0e')
+        assert 0
+    except ValueError:
+        pass
+    try:
+        bdecode('i123')
+        assert 0
+    except ValueError:
+        pass
+    try:
+        bdecode('')
+        assert 0
+    except ValueError:
+        pass
+    try:
+        bdecode('i6easd')
+        assert 0
+    except ValueError:
+        pass
+    try:
+        bdecode('35208734823ljdahflajhdf')
+        assert 0
+    except ValueError:
+        pass
+    try:
+        bdecode('2:abfdjslhfld')
+        assert 0
+    except ValueError:
+        pass
+    assert bdecode('0:') == ''
+    assert bdecode('3:abc') == 'abc'
+    assert bdecode('10:1234567890') == '1234567890'
+    try:
+        bdecode('02:xy')
+        assert 0
+    except ValueError:
+        pass
+    try:
+        bdecode('l')
+        assert 0
+    except ValueError:
+        pass
+    assert bdecode('le') == []
+    try:
+        bdecode('leanfdldjfh')
+        assert 0
+    except ValueError:
+        pass
+    assert bdecode('l0:0:0:e') == ['', '', '']
+    try:
+        bdecode('relwjhrlewjh')
+        assert 0
+    except ValueError:
+        pass
+    assert bdecode('li1ei2ei3ee') == [1, 2, 3]
+    assert bdecode('l3:asd2:xye') == ['asd', 'xy']
+    assert bdecode('ll5:Alice3:Bobeli2ei3eee') == [['Alice', 'Bob'], [2, 3]]
+    try:
+        bdecode('d')
+        assert 0
+    except ValueError:
+        pass
+    try:
+        bdecode('defoobar')
+        assert 0
+    except ValueError:
+        pass
+    assert bdecode('de') == {}
+    assert bdecode('d3:agei25e4:eyes4:bluee') == {'age': 25, 'eyes': 'blue'}
+    assert bdecode('d8:spam.mp3d6:author5:Alice6:lengthi100000eee') == {'spam.mp3': {'author': 'Alice', 'length': 100000}}
+    try:
+        bdecode('d3:fooe')
+        assert 0
+    except ValueError:
+        pass
+    try:
+        bdecode('di1e0:e')
+        assert 0
+    except ValueError:
+        pass
+    try:
+        bdecode('d1:b0:1:a0:e')
+        assert 0
+    except ValueError:
+        pass
+    try:
+        bdecode('d1:a0:1:a0:e')
+        assert 0
+    except ValueError:
+        pass
+    try:
+        bdecode('i03e')
+        assert 0
+    except ValueError:
+        pass
+    try:
+        bdecode('l01:ae')
+        assert 0
+    except ValueError:
+        pass
+    try:
+        bdecode('9999:x')
+        assert 0
+    except ValueError:
+        pass
+    try:
+        bdecode('l0:')
+        assert 0
+    except ValueError:
+        pass
+    try:
+        bdecode('d0:0:')
+        assert 0
+    except ValueError:
+        pass
+    try:
+        bdecode('d0:')
+        assert 0
+    except ValueError:
+        pass
+
+bencached_marker = []
+
+class Bencached:
+    def __init__(self, s):
+        self.marker = bencached_marker
+        self.bencoded = s
+
+BencachedType = type(Bencached('')) # insufficient, but good as a filter
+
+def encode_bencached(x,r):
+    assert x.marker == bencached_marker
+    r.append(x.bencoded)
+
+def encode_int(x,r):
+    r.extend(('i',str(x),'e'))
+
+def encode_bool(x,r):
+    encode_int(int(x),r)
+
+def encode_string(x,r):
+    r.extend((str(len(x)),':',x))
+
+def encode_unicode(x,r):
+    #r.append('u')
+    encode_string(x.encode('UTF-8'),r)
+
+def encode_list(x,r):
+    r.append('l')
+    for e in x:
+        encode_func[type(e)](e, r)
+    r.append('e')
+
+def encode_dict(x,r):
+    r.append('d')
+    ilist = x.items()
+    ilist.sort()
+    for k,v in ilist:
+        r.extend((str(len(k)),':',k))
+        encode_func[type(v)](v, r)
+    r.append('e')
+
+encode_func = {}
+encode_func[BencachedType] = encode_bencached
+encode_func[IntType] = encode_int
+encode_func[LongType] = encode_int
+encode_func[StringType] = encode_string
+encode_func[ListType] = encode_list
+encode_func[TupleType] = encode_list
+encode_func[DictType] = encode_dict
+if BooleanType:
+    encode_func[BooleanType] = encode_bool
+if UnicodeType:
+    encode_func[UnicodeType] = encode_unicode
+
+def bencode(x):
+    r = []
+    try:
+        encode_func[type(x)](x, r)
+    except:
+        print("*** error *** could not encode type %s (value: %s)" % (type(x), x))
+        assert 0
+    return ''.join(r)
+
+def test_bencode():
+    assert bencode(4) == 'i4e'
+    assert bencode(0) == 'i0e'
+    assert bencode(-10) == 'i-10e'
+    assert bencode(12345678901234567890) == 'i12345678901234567890e'
+    assert bencode('') == '0:'
+    assert bencode('abc') == '3:abc'
+    assert bencode('1234567890') == '10:1234567890'
+    assert bencode([]) == 'le'
+    assert bencode([1, 2, 3]) == 'li1ei2ei3ee'
+    assert bencode([['Alice', 'Bob'], [2, 3]]) == 'll5:Alice3:Bobeli2ei3eee'
+    assert bencode({}) == 'de'
+    assert bencode({'age': 25, 'eyes': 'blue'}) == 'd3:agei25e4:eyes4:bluee'
+    assert bencode({'spam.mp3': {'author': 'Alice', 'length': 100000}}) == 'd8:spam.mp3d6:author5:Alice6:lengthi100000eee'
+    try:
+        bencode({1: 'foo'})
+        assert 0
+    except AssertionError:
+        pass
+
+
+try:
+    import psyco
+    psyco.bind(bdecode)
+    psyco.bind(bencode)
+except ImportError:
+    pass
diff --git a/ox/torrent/makemetafile.py b/ox/torrent/makemetafile.py
index c2db27a..31d6ebe 100644
--- a/ox/torrent/makemetafile.py
+++ b/ox/torrent/makemetafile.py
@@ -8,7 +8,11 @@ from hashlib import sha1 as sha
 from copy import copy
 import re

-from .bencode3 import bencode
+from six import PY2
+if PY2:
+    from .bencode import bencode
+else:
+    from .bencode3 import bencode
 from threading import Event
 from time import time
 from traceback import print_exc
diff --git a/ox/web/allmovie.py b/ox/web/allmovie.py
index c94c438..fdb7a46 100644
--- a/ox/web/allmovie.py
+++ b/ox/web/allmovie.py
@@ -13,13 +13,13 @@ def get_id(url):
 def get_data(id):
     '''
     >>> get_data('129689')['cast'][1][1]
-    'Marianne'
+    u'Marianne'
     >>> get_data('129689')['credits'][0][0]
-    'Jean-Luc Godard'
+    u'Jean-Luc Godard'
     >>> get_data('129689')['posters'][0]
-    'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg'
+    u'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg'
     >>> get_data('129689')['rating']
-    '4.5'
+    u'4.5'
     '''
     if id.startswith('http'):
         id = get_id(id)
diff --git a/ox/web/amazon.py b/ox/web/amazon.py
index d721d5c..19a72c7 100644
--- a/ox/web/amazon.py
+++ b/ox/web/amazon.py
@@ -2,7 +2,7 @@
 # vi:si:et:sw=4:sts=4:ts=4
 from __future__ import print_function
 import re
-from urllib.parse import quote
+from six.moves.urllib.parse import quote

 from ox import find_re, strip_tags, decode_html
 from ox.cache import read_url
diff --git a/ox/web/apple.py b/ox/web/apple.py
index 84abba0..099d6cb 100644
--- a/ox/web/apple.py
+++ b/ox/web/apple.py
@@ -2,6 +2,7 @@ from __future__ import print_function
 import json
 import re

+from six import text_type
 from ox.cache import read_url

 HEADERS = {
@@ -16,9 +17,9 @@ USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7) '
 USER_AGENT += 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3'

 def get_movie_data(title, director):
-    if isinstance(title, str):
+    if isinstance(title, text_type):
         title = title.encode('utf-8')
-    if isinstance(director, str):
+    if isinstance(director, text_type):
         director = director.encode('utf-8')
     data = {}
     # itunes section (preferred source for link)
diff --git a/ox/web/archive.py b/ox/web/archive.py
index 3e7ab47..0c733c3 100644
--- a/ox/web/archive.py
+++ b/ox/web/archive.py
@@ -3,6 +3,8 @@ from .. import cache
 from ..utils import json

+from six import string_types
+
 def get_id(url):
     return url.split("/")[-1]
@@ -19,7 +21,7 @@
         data[key] = details['metadata'][key]
         if isinstance(data[key], list):
             data[key] = data[key][0]
-        if isinstance(data[key], str):
+        if isinstance(data[key], string_types):
             data[key] = data[key].strip()
             if data[key][0] == '[' and data[key][-1] == ']':
                 data[key] = data[key][1:-1]
diff --git a/ox/web/arsenalberlin.py b/ox/web/arsenalberlin.py
index ca77b5e..e5a0dd2 100644
--- a/ox/web/arsenalberlin.py
+++ b/ox/web/arsenalberlin.py
@@ -19,18 +19,18 @@ def get_data(id, language='en'):
     if 'Willkommen in der Datenbank des Arsenal' in html:
         return None
     data = {}
-    data['id'] = id
-    data['url'] = url
+    data[u'id'] = id
+    data[u'url'] = url
     m = re.compile('<h1>(.*?)</h1>').findall(html)
     if m:
-        data['title'] = m[0]
+        data[u'title'] = m[0]
     m = re.compile("Director: (.*?)").findall(html)
     if m:
-        data['director'] = m[0]
+        data[u'director'] = m[0]
     m = re.compile("caUI.initImageScroller\(\[\{url:'(.*?)'").findall(html)
     if m:
-        data['image'] = m[0]
+        data[u'image'] = m[0]

     units = re.compile("<div.*?>(.*?)</div>", re.DOTALL).findall(html)
     for x in map(re.compile('(.*?): (.*)', re.DOTALL).findall, units):
@@ -43,7 +43,7 @@
     else:
         data[key] = strip_tags(data[key])
     if "running time (minutes)" in data:
-        data['runtime'] = float(data.pop("running time (minutes)").replace(',', '.')) * 60
+        data[u'runtime'] = float(data.pop("running time (minutes)").replace(',', '.')) * 60

     for key in ('year', 'length in metres', 'forum participation year', 'number of reels'):
         if key in data and data[key].isdigit():
             data[key] = int(data[key])
diff --git a/ox/web/criterion.py b/ox/web/criterion.py
index 67d4a8a..d7914be 100644
--- a/ox/web/criterion.py
+++ b/ox/web/criterion.py
@@ -19,13 +19,13 @@ def get_url(id):
 def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
     '''
     >>> get_data('1333').get('imdbId')
-    '0060304'
+    u'0060304'

     >>> get_data('236')['posters'][0]
-    'http://s3.amazonaws.com/criterion-production/release_images/1586/ThirdManReplace.jpg'
+    u'http://s3.amazonaws.com/criterion-production/release_images/1586/ThirdManReplace.jpg'

     >>> get_data('786')['posters'][0]
-    'http://s3.amazonaws.com/criterion-production/product_images/185/343_box_348x490.jpg'
+    u'http://s3.amazonaws.com/criterion-production/product_images/185/343_box_348x490.jpg'
     '''
     data = {
         "id": id,
@@ -39,7 +39,7 @@
     data["number"] = find_re(html, "Spine #(\d+)")

     data["title"] = decode_html(find_re(html, "<h1.*?>(.*?)</h1>"))
-    data["title"] = data["title"].split(' \u2014 The Television Version')[0].strip()
+    data["title"] = data["title"].split(u' \u2014 The Television Version')[0].strip()
     results = find_re(html, '<ul.*?>(.*?)</ul>')
     info = re.compile('<li itemprop="(.*?)".*?>(.*?)</li>', re.DOTALL).findall(results)
     info = {k: strip_tags(v).strip() for k, v in info}
diff --git a/ox/web/dailymotion.py b/ox/web/dailymotion.py
index 851b728..0ec8d86 100644
--- a/ox/web/dailymotion.py
+++ b/ox/web/dailymotion.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
 import re
-from urllib.parse import unquote
+from six.moves.urllib.parse import unquote

 from ox.cache import read_url
diff --git a/ox/web/duckduckgo.py b/ox/web/duckduckgo.py
index 35c0602..b4b3494 100644
--- a/ox/web/duckduckgo.py
+++ b/ox/web/duckduckgo.py
@@ -2,7 +2,7 @@
 # vi:si:et:sw=4:sts=4:ts=4

 import re
-import urllib
+from six.moves import urllib
 import ox
 from ox import strip_tags, decode_html
 from ox.cache import read_url
diff --git a/ox/web/flixter.py b/ox/web/flixter.py
index d713208..e6d6a0a 100644
--- a/ox/web/flixter.py
+++ b/ox/web/flixter.py
@@ -58,10 +58,10 @@ def get_data(id, timeout=-1):
 def get_id(url=None, imdb=None):
     '''
     >>> get_id(imdb='0133093')
-    'the-matrix'
+    u'the-matrix'

     #>>> get_id(imdb='0060304')
-    #'2-or-3-things-i-know-about-her'
+    #u'2-or-3-things-i-know-about-her'
     '''
     if imdb:
         i = ImdbCombined(imdb)
diff --git a/ox/web/google.py b/ox/web/google.py
index 01bb7ce..0842d01 100644
--- a/ox/web/google.py
+++ b/ox/web/google.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
 import re
-import urllib
+from six.moves import urllib

 import ox
 from ox import strip_tags, decode_html
diff --git a/ox/web/imdb.py b/ox/web/imdb.py
index 4b08cab..96454f7 100644
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -7,7 +7,8 @@ import re
 import time
 import unicodedata

-from urllib.parse import urlencode
+from six.moves.urllib.parse import urlencode
+from six import string_types

 from .. import find_re, strip_tags, decode_html
 from .. import cache
@@ -448,7 +449,7 @@ class Imdb(SiteParser):

         if 'alternativeTitles' in self:
             if len(self['alternativeTitles']) == 2 and \
-               isinstance(self['alternativeTitles'][0], str):
+               isinstance(self['alternativeTitles'][0], string_types):
                 self['alternativeTitles'] = [self['alternativeTitles']]

         for key in ('country', 'genre', 'language', 'sound', 'color'):
@@ -513,7 +514,7 @@ class Imdb(SiteParser):
             self['sound'] = list(sorted(set(self['sound'])))

         if 'cast' in self:
-            if isinstance(self['cast'][0], str):
+            if isinstance(self['cast'][0], string_types):
                 self['cast'] = [self['cast']]
             self['actor'] = [c[0] for c in self['cast']]
         def cleanup_character(c):
diff --git a/ox/web/itunes.py b/ox/web/itunes.py
index bb85952..f599099 100644
--- a/ox/web/itunes.py
+++ b/ox/web/itunes.py
@@ -2,7 +2,7 @@
 # encoding: utf-8
 from __future__ import print_function
 import re
-from urllib.parse import urlencode
+from six.moves.urllib.parse import urlencode

 from ox.cache import read_url
 from ox.html import decode_html, strip_tags
diff --git a/ox/web/metacritic.py b/ox/web/metacritic.py
index 8c59998..2ecded5 100644
--- a/ox/web/metacritic.py
+++ b/ox/web/metacritic.py
@@ -2,7 +2,7 @@
 # vi:si:et:sw=4:sts=4:ts=4

 import re
-from urllib.parse import quote
+from six.moves.urllib.parse import quote

 from lxml.html import document_fromstring
 from ox.cache import read_url
diff --git a/ox/web/siteparser.py b/ox/web/siteparser.py
index b260be6..b8b78f8 100644
--- a/ox/web/siteparser.py
+++ b/ox/web/siteparser.py
@@ -4,6 +4,8 @@ import re
 import json
 from multiprocessing.pool import ThreadPool

+from six import string_types
+
 from ..cache import read_url
 from .. import decode_html
 from ..utils import datetime
@@ -11,15 +13,15 @@
 def cleanup(key, data, data_type):
     if data:
-        if isinstance(data[0], str):
+        if isinstance(data[0], string_types):
             #FIXME: some types need strip_tags
             #data = [strip_tags(decode_html(p)).strip() for p in data]
             data = [decode_html(p).strip() for p in data]
         elif isinstance(data[0], list) or isinstance(data[0], tuple):
             data = [cleanup(key, p, data_type) for p in data]
-        while len(data) == 1 and not isinstance(data, str):
+        while len(data) == 1 and not isinstance(data, string_types):
             data = data[0]
-    if data_type == 'list' and isinstance(data, str):
+    if data_type == 'list' and isinstance(data, string_types):
         data = [data, ]
     elif data_type != 'list':
         data = ''
@@ -47,7 +49,7 @@
         for key in self.regex:
             url = self.get_url(self.regex[key]['page'])
             data = self.read_url(url, timeout)
-            if isinstance(self.regex[key]['re'], str):
+            if isinstance(self.regex[key]['re'], string_types):
                 data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data)
                 data = cleanup(key, data, self.regex[key]['type'])
             elif callable(self.regex[key]['re']):
@@ -58,7 +60,7 @@
                     f = r
                 else:
                     f = re.compile(r, re.DOTALL).findall
-                if isinstance(data, str):
+                if isinstance(data, string_types):
                     data = f(data)
                 else:
                     data = [f(d) for d in data]
diff --git a/ox/web/startpage.py b/ox/web/startpage.py
index 83e92f9..ca18437 100644
--- a/ox/web/startpage.py
+++ b/ox/web/startpage.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
-import urllib
+from six.moves import urllib
 import lxml.html

 import ox
diff --git a/ox/web/thepiratebay.py b/ox/web/thepiratebay.py
index cbbdf56..e9a6445 100644
--- a/ox/web/thepiratebay.py
+++ b/ox/web/thepiratebay.py
@@ -3,7 +3,7 @@
 from datetime import datetime
 import re

-from urllib.parse import quote
+from six.moves.urllib.parse import quote

 from ox import find_re, cache, strip_tags, decode_html, get_torrent_info, normalize_newlines
 from ox.normalize import normalize_imdbid
diff --git a/ox/web/twitter.py b/ox/web/twitter.py
index 619c458..fa33bfc 100644
--- a/ox/web/twitter.py
+++ b/ox/web/twitter.py
@@ -2,7 +2,7 @@
 # vi:si:et:sw=4:sts=4:ts=4
 import re
 from datetime import datetime
-from urllib.parse import quote
+from six.moves.urllib.parse import quote

 import lxml.html
 import ox
diff --git a/ox/web/wikipedia.py b/ox/web/wikipedia.py
index 5d86655..de8b064 100644
--- a/ox/web/wikipedia.py
+++ b/ox/web/wikipedia.py
@@ -4,7 +4,8 @@ from __future__ import print_function

 import re
-import urllib
+from six.moves import urllib
+from six import string_types

 from ox.utils import json
 from ox.cache import read_url
@@ -68,7 +69,7 @@ def get_movie_data(wikipedia_url):
             value = value.split('<br>')
             if value:
                 if key in filmbox:
-                    if isinstance(value, list) and isinstance(filmbox[key], str):
+                    if isinstance(value, list) and isinstance(filmbox[key], string_types):
                         filmbox[key] = [filmbox[key]] + value
                     else:
                         filmbox[key] += value
diff --git a/ox/web/youtube.py b/ox/web/youtube.py
index 0f59b80..805f716 100644
--- a/ox/web/youtube.py
+++ b/ox/web/youtube.py
@@ -1,8 +1,8 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
-from urllib.parse import quote, unquote_plus
-import urllib
-from http import cookiejar as cookielib
+from six.moves.urllib.parse import quote, unquote_plus
+from six.moves import urllib
+from six.moves import http_cookiejar as cookielib
 import re
 from xml.dom.minidom import parseString
 import json
diff --git a/requirements.txt b/requirements.txt
index f30c448..4e7d966 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 chardet
+six>=1.5.2
 lxml
 requests
diff --git a/setup.py b/setup.py
index b1d1de5..e5948ea 100644
--- a/setup.py
+++ b/setup.py
@@ -7,10 +7,9 @@ try:
 except:
     from distutils.core import setup

-def get_git_version():
+def get_revision():
     import subprocess
-    version = subprocess.check_output(['git', 'describe', '--tags']).decode().strip().replace('-', '.')
-    return '.'.join((version.split('.') + ['0'])[:3])
+    return subprocess.check_output(['git', 'rev-list', 'HEAD', '--count']).decode().strip()

 def get_version():
     import os
@@ -19,8 +18,9 @@
     __version = os.path.join(os.path.dirname(__file__), 'ox/__version.py')
     changelog = os.path.join(os.path.dirname(__file__), 'debian/changelog')
     if os.path.exists(_git):
-        version = get_git_version()
-        if version:
+        rev = get_revision()
+        if rev:
+            version = "2.3.%s" % rev
             with open(__version, 'w') as fd:
                 fd.write('VERSION="%s"' % version)
             return version
@@ -37,8 +37,8 @@
             f.close()
             rev = re.compile('\d+\.\d+\.(\d+)').findall(head)
             if rev:
-                return '3.0.%s' % rev[0]
-    return '3.0.x'
+                return '2.3.%s' % rev[0]
+    return '2.3.x'


 setup(
@@ -50,18 +50,17 @@
     url="https://code.0x2620.org/0x2620/python-ox",
     license="GPLv3",
     packages=['ox', 'ox.torrent', 'ox.web'],
-    install_requires=['chardet'],
+    install_requires=['six>=1.5.2', 'chardet'],
     keywords=[
    ],
    classifiers=[
        'Operating System :: OS Independent',
        'Programming Language :: Python',
+        'Programming Language :: Python :: 2',
+        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.7',
-        'Programming Language :: Python :: 3.8',
-        'Programming Language :: Python :: 3.9',
-        'Programming Language :: Python :: 3.10',
-        'Programming Language :: Python :: 3.11',
+        'Programming Language :: Python :: 3.4',
+        'Programming Language :: Python :: 3.5',
        'Topic :: Software Development :: Libraries :: Python Modules',
    ],
)
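
Note: the hunks above repeatedly apply the same two compatibility idioms. A
minimal sketch of both follows, outside the patch (illustrative only: the
fetch() helper is hypothetical and not part of the ox API; it merely mirrors
the structure of ox/net.py and ox/cache.py):

    # six.moves resolves stdlib modules that were renamed between
    # Python 2 and 3, so a single import line serves both interpreters.
    from six.moves import urllib

    # requests becomes an optional dependency, probed at import time,
    # with a urllib fallback at the call sites.
    try:
        import requests
        USE_REQUESTS = True
    except ImportError:
        USE_REQUESTS = False

    def fetch(url, headers=None):
        # hypothetical helper: prefer requests when available,
        # otherwise fall back to urllib (urllib2 on Python 2)
        headers = headers or {}
        if USE_REQUESTS:
            return requests.get(url, headers=headers).content
        req = urllib.request.Request(url, None, headers)
        return urllib.request.urlopen(req).read()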