From ef5909061074ce2f79fdfdfe79e870c5a30aec34 Mon Sep 17 00:00:00 2001 From: j Date: Sun, 27 Apr 2008 18:54:37 +0200 Subject: [PATCH] add some functions --- README | 18 ++++++ oxutils/__init__.py | 18 ++++++ oxutils/bt.py | 53 ++++++++++++++++ oxutils/cache.py | 62 +++++++++++++++++++ oxutils/hashes.py | 17 +++++ oxutils/html.py | 128 ++++++++++++++++++++++++++++++++++++++ oxutils/net.py | 29 +++++++++ oxutils/numbers.py | 99 +++++++++++++++++++++++++++++ oxutils/text.py | 140 ++++++++++++++++++++++++++++++++++++++++++ oxutils/timeformat.py | 58 +++++++++++++++++ setup.py | 27 ++++++++ 11 files changed, 649 insertions(+) create mode 100644 README create mode 100644 oxutils/__init__.py create mode 100644 oxutils/bt.py create mode 100644 oxutils/cache.py create mode 100644 oxutils/hashes.py create mode 100644 oxutils/html.py create mode 100644 oxutils/net.py create mode 100644 oxutils/numbers.py create mode 100644 oxutils/text.py create mode 100644 oxutils/timeformat.py create mode 100644 setup.py diff --git a/README b/README new file mode 100644 index 0000000..728d6dc --- /dev/null +++ b/README @@ -0,0 +1,18 @@ +python-oxutils some tools to build tools + +Depends: + python2.5 + python-chardet (http://chardet.feedparser.org/) + BitTornado(optional) + +Usage: + import oxutils + + data = oxutils.cache.getUrl('http://...') + text = oxutils.stripTags(data) + oxutils.normalizeNewlines(text) + oxutils.formatBytes(len(data)) + + oxutils.formatBytes(1234567890) + '1.15 GB' + diff --git a/oxutils/__init__.py b/oxutils/__init__.py new file mode 100644 index 0000000..235518d --- /dev/null +++ b/oxutils/__init__.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=2:sts=2:ts=2 +# Written 2008 by j@mailb.org + +from hashes import * +from html import * +from numbers import * +from text import * +from timeformat import * +import net +import cache + +#only works if BitTornado is installed +try: + from bt import * +except: + pass + diff --git a/oxutils/bt.py b/oxutils/bt.py 
new file mode 100644 index 0000000..0feaced --- /dev/null +++ b/oxutils/bt.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- +# Written 2007 by j@mailb.org + +from threading import Event +import sha +from os import stat + +from BitTornado.BT1.makemetafile import make_meta_file +from BitTornado.bencode import bencode, bdecode + + +def createTorrent(file, url, params = {}, flag = Event(), + progress = lambda x: None, progress_percent = 1): + "Creates a torrent for a given file, using url as tracker url" + return make_meta_file(file, url, params, flag, progress, progress_percent) + +def getInfoHash(torrentFile): + "Returns Torrent Info Hash from torrent file" + metainfo_file = open(torrentFile, 'rb') + metainfo = bdecode(metainfo_file.read()) + info = metainfo['info'] + return sha.sha(bencode(info)).hexdigest().upper() + +def getTorrentInfo(torrentFile): + "Returns Torrent Info from torrent file" + tinfo = {} + metainfo_file = open(torrentFile, 'rb') + metainfo = bdecode(metainfo_file.read()) + metainfo_file.close() + info = metainfo['info'] + piece_length = info['piece length'] + if info.has_key('length'): + # let's assume we just have one file + file_length = info['length'] + else: + # let's assume we have a directory structure + file_length = 0; + for file in info['files']: + path = '' + for item in file['path']: + if (path != ''): + path = path + "/" + path = path + item + file_length += file['length'] + tinfo['size'] = file_length + tinfo['hash'] = sha.sha(bencode(info)).hexdigest() + tinfo['timestamp'] = stat(torrentFile).st_ctime + return tinfo + +def getTorrentSize(torrentFile): + "Returns Size of files in torrent file in bytes" + return getTorrentInfo(torrentFile)['size'] + diff --git a/oxutils/cache.py b/oxutils/cache.py new file mode 100644 index 0000000..912fd43 --- /dev/null +++ b/oxutils/cache.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=2:sts=2:ts=2 +import os +import sha +import time +import urlparse + +import net +from net import 
DEFAULT_HEADERS + + +cache_timeout = 30*24*60*60 # default is 30 days + +def getUrlUnicode(url): + data = getUrl(url) + encoding = chardet.detect(data)['encoding'] + if not encoding: + encoding = 'latin-1' + return unicode(data, encoding) + +def getUrl(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout): + url_cache_file = getUrlCacheFile(url, data, headers) + result = loadUrlCache(url_cache_file, timeout) + if not result: + result = net.getUrl(url, data, headers) + saveUrlCache(url_cache_file, result) + return result + +def getCacheBase(): + 'cache base is eather ~/.ox/cache or can set via env variable oxCACHE' + return os.environ.get('oxCACHE', os.path.expanduser('~/.ox/cache')) + +def getUrlCacheFile(url, data=None, headers=DEFAULT_HEADERS): + if data: + url_hash = sha.sha(url + '?' + data).hexdigest() + else: + url_hash = sha.sha(url).hexdigest() + domain = urlparse.urlparse(url)[1] + return os.path.join(getCacheBase(), domain, url_hash[:2], url_hash[2:4], url_hash[4:6], url_hash) + +def loadUrlCache(url_cache_file, data, timeout=cache_timeout): + if timeout <= 0: + return None + if os.path.exists(url_cache_file): + ctime = os.stat(url_cache_file).st_ctime + now = time.mktime(time.localtime()) + file_age = now-ctime + if file_age < timeout: + f = open(url_cache_file) + data = f.read() + f.close() + return data + return None + +def saveUrlCache(url_cache_file, data): + folder = os.path.dirname(url_cache_file) + if not os.path.exists(folder): + os.makedirs(folder) + f = open(url_cache_file, 'w') + f.write(data) + f.close() + diff --git a/oxutils/hashes.py b/oxutils/hashes.py new file mode 100644 index 0000000..800c104 --- /dev/null +++ b/oxutils/hashes.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=2:sts=2:ts=2 +# GPL written 2008 by j@pad.ma +import sha +import os + +def sha1sum(filename): + sha1 = sha.new() + file=open(filename) + buffer=file.read(4096) + while buffer: + sha1.update(buffer) + buffer=file.read(4096) + file.close() + 
return sha1.hexdigest() + + diff --git a/oxutils/html.py b/oxutils/html.py new file mode 100644 index 0000000..aaddbf6 --- /dev/null +++ b/oxutils/html.py @@ -0,0 +1,128 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=2:sts=2:ts=2 +# GPL written 2008 by j@pad.ma +import re +import string + + +# Configuration for urlize() function +LEADING_PUNCTUATION = ['(', '<', '<'] +TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '>', "'", '"'] + +# list of possible strings used for bullets in bulleted lists +DOTS = ['·', '*', '\xe2\x80\xa2', '•', '•', '•'] + +unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)') +word_split_re = re.compile(r'(\s+)') +punctuation_re = re.compile('^(?P(?:%s)*)(?P.*?)(?P(?:%s)*)$' % \ + ('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]), + '|'.join([re.escape(x) for x in TRAILING_PUNCTUATION]))) +simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$') +link_target_attribute_re = re.compile(r'(]*?)target=[^\s>]+') +html_gunk_re = re.compile(r'(?:
|<\/i>|<\/b>|<\/em>|<\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE) +hard_coded_bullets_re = re.compile(r'((?:

(?:%s).*?[a-zA-Z].*?

\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL) +trailing_empty_content_re = re.compile(r'(?:

(?: |\s|
)*?

\s*)+\Z') +del x # Temporary variable + +def escape(html): + "Returns the given HTML with ampersands, quotes and carets encoded" + if not isinstance(html, basestring): + html = str(html) + return html.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"').replace("'", ''') + +def linebreaks(value): + "Converts newlines into

and
s" + value = re.sub(r'\r\n|\r|\n', '\n', value) # normalize newlines + paras = re.split('\n{2,}', value) + paras = ['

%s

' % p.strip().replace('\n', '
') for p in paras] + return '\n\n'.join(paras) + +def stripTags(value): + "Returns the given HTML with all tags stripped" + return re.sub(r'<[^>]*?>', '', value) + +def stripSpacesBetweenTags(value): + "Returns the given HTML with spaces between tags normalized to a single space" + return re.sub(r'>\s+<', '> <', value) + +def stripEntities(value): + "Returns the given HTML with all entities (&something;) stripped" + return re.sub(r'&(?:\w+|#\d);', '', value) + +def fixAmpersands(value): + "Returns the given HTML with all unencoded ampersands encoded correctly" + return unencoded_ampersands_re.sub('&', value) + +def urlize(text, trim_url_limit=None, nofollow=False): + """ + Converts any URLs in text into clickable links. Works on http://, https:// and + www. links. Links can have trailing punctuation (periods, commas, close-parens) + and leading punctuation (opening parens) and it'll still do the right thing. + + If trim_url_limit is not None, the URLs in link text will be limited to + trim_url_limit characters. + + If nofollow is True, the URLs in link text will get a rel="nofollow" attribute. + """ + trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >=limit and '...' or '')) or x + words = word_split_re.split(text) + nofollow_attr = nofollow and ' rel="nofollow"' or '' + for i, word in enumerate(words): + match = punctuation_re.match(word) + if match: + lead, middle, trail = match.groups() + if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \ + len(middle) > 0 and middle[0] in string.letters + string.digits and \ + (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))): + middle = '
%s' % (middle, nofollow_attr, trim_url(middle)) + if middle.startswith('http://') or middle.startswith('https://'): + middle = '%s' % (middle, nofollow_attr, trim_url(middle)) + if '@' in middle and not middle.startswith('www.') and not ':' in middle \ + and simple_email_re.match(middle): + middle = '%s' % (middle, middle) + if lead + middle + trail != word: + words[i] = lead + middle + trail + return ''.join(words) + +def cleanHtml(text): + """ + Cleans the given HTML. Specifically, it does the following: + * Converts and to and . + * Encodes all ampersands correctly. + * Removes all "target" attributes from tags. + * Removes extraneous HTML, such as presentational tags that open and + immediately close and
. + * Converts hard-coded bullets into HTML unordered lists. + * Removes stuff like "

  

", but only if it's at the + bottom of the text. + """ + from text import normalizeNewlines + text = normalizeNewlines(text) + text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text) + text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text) + text = fix_ampersands(text) + # Remove all target="" attributes from
tags. + text = link_target_attribute_re.sub('\\1', text) + # Trim stupid HTML such as
. + text = html_gunk_re.sub('', text) + # Convert hard-coded bullets into HTML unordered lists. + def replace_p_tags(match): + s = match.group().replace('

', '') + for d in DOTS: + s = s.replace('

%s' % d, '

  • ') + return '
      \n%s\n
    ' % s + text = hard_coded_bullets_re.sub(replace_p_tags, text) + # Remove stuff like "

      

    ", but only if it's at the bottom of the text. + text = trailing_empty_content_re.sub('', text) + return text + +def highlight(text, query, hlClass="hl"): + if query: + text = text.replace('
    ', '|') + query = re.escape(query).replace('\ ', '.') + m = re.compile("(%s)" % query, re.IGNORECASE).findall(text) + for i in m: + text = re.sub("(%s)" % re.escape(i).replace('\ ', '.'), '\\1' % hlClass, text) + text = text.replace('|', '
    ') + return text + diff --git a/oxutils/net.py b/oxutils/net.py new file mode 100644 index 0000000..1b3f609 --- /dev/null +++ b/oxutils/net.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=2:sts=2:ts=2 +import urllib +import urllib2 + +import chardet + + +# Default headers for HTTP requests. +DEFAULT_HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9b5) Gecko/2008041514 Firefox/3.0b5'} + +def openUrl(url, data=None, headers=DEFAULT_HEADERS): + url = url.replace(' ', '%20') + req = urllib2.Request(url, data, headers) + return urllib2.urlopen(req) + +def getUrl(url, data=None, headers=DEFAULT_HEADERS): + f = openUrl(url, data, headers) + data = f.read() + f.close() + return data + +def getUrlUnicode(url): + data = getUrl(url) + encoding = chardet.detect(data)['encoding'] + if not encoding: + encoding = 'latin-1' + return unicode(data, encoding) + diff --git a/oxutils/numbers.py b/oxutils/numbers.py new file mode 100644 index 0000000..55a6911 --- /dev/null +++ b/oxutils/numbers.py @@ -0,0 +1,99 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=2:sts=2:ts=2 +# Written 2007 by j@mailb.org +import re + +def to36(q): + """ + Converts an integer to base 36 (a useful scheme for human-sayable IDs). + + >>> to36(35) + 'z' + >>> to36(119292) + '2k1o' + >>> int(to36(939387374), 36) + 939387374 + >>> to36(0) + '0' + >>> to36(-393) + Traceback (most recent call last): + ... 
+ ValueError: must supply a positive integer + + """ + if q < 0: raise ValueError, "must supply a positive integer" + letters = "0123456789abcdefghijklmnopqrstuvwxyz" + converted = [] + while q != 0: + q, r = divmod(q, 36) + converted.insert(0, letters[r]) + return "".join(converted) or '0' + +def from36(q): + return int(q, 36) + +def intValue(strValue, default=''): + try: + val = re.compile('(\d*)').findall(unicode(strValue))[0] + except: + val = default + return val + +def floatValue(strValue, default=''): + try: + val = re.compile('([\d.]*)').findall(unicode(strValue))[0] + except: + val = default + return val + +""" +Format the value like a 'human-readable' file size (e.g. 13 KB, 4.1 MB, 102 +bytes, etc). + number - number to format. + long_name - long name, e.g. byte + short - short name, e.g. B +""" +def formatNumber(number, long_name, short): + if not number: + return "0 %ss" % long_name + number = float(number) + if number < 1024: + return "%d %s%s" % (number, long_name, number != 1 and 's' or '') + if number < 1024 * 1024: + return "%d K%s" % ((number / 1024), short) + if number < 1024 * 1024 * 1024: + return "%.1f M%s" % (number / (1024 * 1024), short) + if number < 1024 * 1024 * 1024 * 1024: + return "%.2f G%s" % (number / (1024 * 1024 * 1024), short) + return "%.3f T%s" % (number / (1024 * 1024 * 1024 * 1024), short) + +def formatBytes(number): + return formatNumber(number, 'byte', 'B') + +def formatBit(number): + return formatNumber(number, 'bit', 'b') + +''' +separate the thousands of a number with sep (default: comma) +''' +def numberThousands(n, sep=','): + if n < 1000: + return "%s" % n + ln = list(str(n)) + ln.reverse() + newn = [] + while len(ln) > 3: + newn.extend(ln[:3]) + newn.append(sep) + ln = ln[3:] + newn.extend(ln) + newn.reverse() + return "".join(newn) + +def plural(amount, unit, plural='s'): + if abs(amount) != 1: + if plural == 's': + unit = unit + plural + else: unit = plural + return "%s %s" % (formatNumber(amount), unit) + diff --git a/oxutils/text.py 
b/oxutils/text.py new file mode 100644 index 0000000..f26837f --- /dev/null +++ b/oxutils/text.py @@ -0,0 +1,140 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=2:sts=2:ts=2 +# GPL written 2008 by j@pad.ma +import re + + +# Capitalizes the first letter of a string. +capfirst = lambda x: x and x[0].upper() + x[1:] + +def removeSpecialCharacters(text): + """ + Removes special characters inserted by Word. + """ + text = text.replace(u'\u2013', '-') + text = text.replace(u'\u2026O', "'") + text = text.replace(u'\u2019', "'") + text = text.replace(u'', "'") + text = text.replace(u'', "'") + text = text.replace(u'', "-") + return text + +def wrap(text, width): + """ + A word-wrap function that preserves existing line breaks and most spaces in + the text. Expects that existing line breaks are posix newlines (\n). + See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061 + """ + return reduce(lambda line, word, width=width: '%s%s%s' % + (line, + ' \n'[(len(line[line.rfind('\n')+1:]) + + len(word.split('\n',1)[0] + ) >= width)], + word), + text.split(' ') + ) + +def truncateString(s, num): + "Truncates a string after a certain number of chacters, but ends with a word" + length = int(num) + if len(s) <= length: + return s + words = s.split() + ts = "" + while words and len(ts) + len(words[0]) < length: + ts += " " + words.pop(0) + if words: + ts += "..." + return ts + +def truncateWords(s, num): + "Truncates a string after a certain number of words." + length = int(num) + words = s.split() + if len(words) > length: + words = words[:length] + if not words[-1].endswith('...'): + words.append('...') + return ' '.join(words) + +def getValidFilename(s): + """ + Returns the given string converted to a string that can be used for a clean + filename. Specifically, leading and trailing spaces are removed; + all non-filename-safe characters are removed. 
+ >>> get_valid_filename("john's portrait in 2004.jpg") + 'john_s portrait in 2004.jpg' + """ + s = s.strip() + s = s.replace(' ', '_') + s = re.sub(r'[^-A-Za-z0-9_.\[\]\ ]', '_', s) + s = s.replace('__', '_').replace('__', '_') + return s + +def getTextList(list_, last_word='or'): + """ + >>> get_text_list(['a', 'b', 'c', 'd']) + 'a, b, c or d' + >>> get_text_list(['a', 'b', 'c'], 'and') + 'a, b and c' + >>> get_text_list(['a', 'b'], 'and') + 'a and b' + >>> get_text_list(['a']) + 'a' + >>> get_text_list([]) + '' + """ + if len(list_) == 0: return '' + if len(list_) == 1: return list_[0] + return '%s %s %s' % (', '.join([str(i) for i in list_][:-1]), last_word, list_[-1]) + +def normalizeNewlines(text): + return re.sub(r'\r\n|\r|\n', '\n', text) + +def recapitalize(text): + "Recapitalizes text, placing caps after end-of-sentence punctuation." +# capwords = () + text = text.lower() + capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])') + text = capsRE.sub(lambda x: x.group(1).upper(), text) +# for capword in capwords: +# capwordRE = re.compile(r'\b%s\b' % capword, re.I) +# text = capwordRE.sub(capword, text) + return text + +def phone2numeric(phone): + "Converts a phone number with letters into its numeric equivalent." 
+ letters = re.compile(r'[A-PR-Y]', re.I) + char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3', + 'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5', + 'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7', + 's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8', + 'y': '9', 'x': '9'}.get(m.group(0).lower()) + return letters.sub(char2number, phone) + +def compressString(s): + import cStringIO, gzip + zbuf = cStringIO.StringIO() + zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf) + zfile.write(s) + zfile.close() + return zbuf.getvalue() + +smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)') +def smartSplit(text): + """ + Generator that splits a string by spaces, leaving quoted phrases together. + Supports both single and double quotes, and supports escaping quotes with + backslashes. In the output, strings will keep their initial and trailing + quote marks. + >>> list(smart_split('This is "a person\'s" test.')) + ['This', 'is', '"a person\'s"', 'test.'] + """ + for bit in smart_split_re.finditer(text): + bit = bit.group(0) + if bit[0] == '"': + yield '"' + bit[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"' + elif bit[0] == "'": + yield "'" + bit[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'" + else: + yield bit diff --git a/oxutils/timeformat.py b/oxutils/timeformat.py new file mode 100644 index 0000000..3628dab --- /dev/null +++ b/oxutils/timeformat.py @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=2:sts=2:ts=2 +from numbers import plural + +def ms2runtime(ms): + seconds = int(ms / 1000) + years = 0 + days = 0 + hours = 0 + minutes = 0 + if seconds >= 60: + minutes = int(seconds / 60) + seconds = seconds % 60 + if minutes >= 60: + hours = int(minutes / 60) + minutes = minutes % 60 + if hours >= 24: + days = int(hours / 24) + hours = hours % 24 + if days >= 365: + years = int(days / 365) + days = days % 365 + runtimeString = 
(plural(years, 'year'), plural(days, 'day'), + plural(hours,'hour'), plural(minutes, 'minute'), plural(seconds, 'second')) + runtimeString = filter(lambda x: not x.startswith('0'), runtimeString) + return " ".join(runtimeString).strip() + +def ms2playtime(ms): + it = int(ms / 1000) + ms = ms - it*1000 + ss = it % 60 + mm = ((it-ss)/60) % 60 + hh = ((it-(mm*60)-ss)/3600) % 60 + if hh: + playtime= "%02d:%02d:%02d" % (hh, mm, ss) + else: + playtime= "%02d:%02d" % (mm, ss) + return playtime + +def ms2time(ms): + it = int(ms / 1000) + ms = ms - it*1000 + ss = it % 60 + mm = ((it-ss)/60) % 60 + hh = ((it-(mm*60)-ss)/3600) % 60 + return "%d:%02d:%02d.%03d" % (hh, mm, ss, ms) + +def time2ms(timeString): + ms = 0.0 + p = timeString.split(':') + for i in range(len(p)): + ms = ms * 60 + float(p[i]) + return int(ms * 1000) + +def shiftTime(offset, timeString): + newTime = time2ms(timeString) + offset + return ms2time(newTime) + diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..f1b2222 --- /dev/null +++ b/setup.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +# vi:si:et:sw=2:sts=2:ts=2 +# encoding: utf-8 +from setuptools import setup, find_packages + +setup( + name="oxutils", + version="0.1", + + description="collection of utils used to work with python", + author="ox", + author_email="utils@0xdb.org", + url="http://code.0xdb.org/python-oxutils", + download_url="http://code.0xdb.org/python-oxutils/download", + license="GPL", + packages=find_packages(), + zip_safe=False, + keywords = [ + ], + classifiers = [ + 'Development Status :: 3 - Alpha', + 'Operating System :: OS Independent', + 'Programming Language :: Python', + 'Topic :: Software Development :: Libraries :: Python Modules', + ], + ) +