add some functions

j 2008-04-27 18:54:37 +02:00
commit ef59090610
11 changed files with 649 additions and 0 deletions

18
README Normal file

@@ -0,0 +1,18 @@
python-oxutils - some tools to build tools
Depends:
python2.5
python-chardet (http://chardet.feedparser.org/)
BitTornado (optional)
Usage:
import oxutils
data = oxutils.cache.getUrl('http://...')
text = oxutils.stripTags(data)
oxutils.normalizeNewlines(text)
oxutils.formatBytes(len(data))
oxutils.formatBytes(1234567890)
'1.15 GB'
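The text, numbers and timeformat helpers are exported at the package level as well; a couple of additional expected values, assuming the dependencies above are installed:
import oxutils
oxutils.truncateWords('some tools to build tools', 3)
'some tools to ...'
oxutils.plural(2, 'tool')
'2 tools'
oxutils.ms2playtime(3723000)
'01:02:03'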

18
oxutils/__init__.py Normal file

@@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
# Written 2008 by j@mailb.org
from hashes import *
from html import *
from numbers import *
from text import *
from timeformat import *
import net
import cache
# only works if BitTornado is installed
try:
  from bt import *
except ImportError:
  pass

53
oxutils/bt.py Normal file

@@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
# Written 2007 by j@mailb.org
from threading import Event
import sha
from os import stat
from BitTornado.BT1.makemetafile import make_meta_file
from BitTornado.bencode import bencode, bdecode
def createTorrent(file, url, params = {}, flag = Event(),
progress = lambda x: None, progress_percent = 1):
"Creates a torrent for a given file, using url as tracker url"
return make_meta_file(file, url, params, flag, progress, progress_percent)
def getInfoHash(torrentFile):
  "Returns Torrent Info Hash from torrent file"
  metainfo_file = open(torrentFile, 'rb')
  metainfo = bdecode(metainfo_file.read())
  metainfo_file.close()
  info = metainfo['info']
  return sha.sha(bencode(info)).hexdigest().upper()
def getTorrentInfo(torrentFile):
"Returns Torrent Info from torrent file"
tinfo = {}
metainfo_file = open(torrentFile, 'rb')
metainfo = bdecode(metainfo_file.read())
metainfo_file.close()
info = metainfo['info']
piece_length = info['piece length']
if info.has_key('length'):
# let's assume we just have one file
file_length = info['length']
else:
# let's assume we have a directory structure
file_length = 0
for file in info['files']:
path = ''
for item in file['path']:
if (path != ''):
path = path + "/"
path = path + item
file_length += file['length']
tinfo['size'] = file_length
tinfo['hash'] = sha.sha(bencode(info)).hexdigest()
tinfo['timestamp'] = stat(torrentFile).st_ctime
return tinfo
def getTorrentSize(torrentFile):
"Returns Size of files in torrent file in bytes"
return getTorrentInfo(torrentFile)['size']
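For reference, a rough usage sketch; the file name and tracker URL are placeholders, and it assumes BitTornado's make_meta_file writes movie.avi.torrent next to the source file (its default when no target is passed in params):
from oxutils import bt
bt.createTorrent('movie.avi', 'http://tracker.example.com/announce')
info = bt.getTorrentInfo('movie.avi.torrent')
size, info_hash = info['size'], info['hash']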

62
oxutils/cache.py Normal file

@@ -0,0 +1,62 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import os
import sha
import time
import urlparse
import chardet
import net
from net import DEFAULT_HEADERS
cache_timeout = 30*24*60*60 # default is 30 days
def getUrlUnicode(url):
data = getUrl(url)
encoding = chardet.detect(data)['encoding']
if not encoding:
encoding = 'latin-1'
return unicode(data, encoding)
def getUrl(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
url_cache_file = getUrlCacheFile(url, data, headers)
result = loadUrlCache(url_cache_file, timeout)
if not result:
result = net.getUrl(url, data, headers)
saveUrlCache(url_cache_file, result)
return result
def getCacheBase():
'cache base is either ~/.ox/cache or can be set via the env variable oxCACHE'
return os.environ.get('oxCACHE', os.path.expanduser('~/.ox/cache'))
def getUrlCacheFile(url, data=None, headers=DEFAULT_HEADERS):
if data:
url_hash = sha.sha(url + '?' + data).hexdigest()
else:
url_hash = sha.sha(url).hexdigest()
domain = urlparse.urlparse(url)[1]
return os.path.join(getCacheBase(), domain, url_hash[:2], url_hash[2:4], url_hash[4:6], url_hash)
def loadUrlCache(url_cache_file, timeout=cache_timeout):
if timeout <= 0:
return None
if os.path.exists(url_cache_file):
ctime = os.stat(url_cache_file).st_ctime
now = time.mktime(time.localtime())
file_age = now-ctime
if file_age < timeout:
f = open(url_cache_file)
data = f.read()
f.close()
return data
return None
def saveUrlCache(url_cache_file, data):
folder = os.path.dirname(url_cache_file)
if not os.path.exists(folder):
os.makedirs(folder)
f = open(url_cache_file, 'w')
f.write(data)
f.close()
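A short usage sketch for the cache module; the URL is a placeholder, and the cache location follows getCacheBase() and getUrlCacheFile() above:
from oxutils import cache
# the first call fetches via net.getUrl(), the second is answered from
# ~/.ox/cache/<domain>/<h[:2]>/<h[2:4]>/<h[4:6]>/<h> for up to 30 days
data = cache.getUrl('http://example.org/')
data = cache.getUrl('http://example.org/')
# a timeout of 0 (or less) ignores any cached copy and refetches
fresh = cache.getUrl('http://example.org/', timeout=0)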

17
oxutils/hashes.py Normal file

@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
# GPL written 2008 by j@pad.ma
import sha
import os
def sha1sum(filename):
  sha1 = sha.new()
  # read in binary mode so the digest matches the sha1sum command line tool
  f = open(filename, 'rb')
  buffer = f.read(4096)
  while buffer:
    sha1.update(buffer)
    buffer = f.read(4096)
  f.close()
  return sha1.hexdigest()

128
oxutils/html.py Normal file

@@ -0,0 +1,128 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
# GPL written 2008 by j@pad.ma
import re
import string
# Configuration for urlize() function
LEADING_PUNCTUATION = ['(', '<', '&lt;']
TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;', "'", '"']
# list of possible strings used for bullets in bulleted lists
DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']
unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
word_split_re = re.compile(r'(\s+)')
punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % \
('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]),
'|'.join([re.escape(x) for x in TRAILING_PUNCTUATION])))
simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$')
link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
del x # Temporary variable
def escape(html):
"Returns the given HTML with ampersands, quotes and carets encoded"
if not isinstance(html, basestring):
html = str(html)
return html.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&#39;')
def linebreaks(value):
"Converts newlines into <p> and <br />s"
value = re.sub(r'\r\n|\r|\n', '\n', value) # normalize newlines
paras = re.split('\n{2,}', value)
paras = ['<p>%s</p>' % p.strip().replace('\n', '<br />') for p in paras]
return '\n\n'.join(paras)
def stripTags(value):
"Returns the given HTML with all tags stripped"
return re.sub(r'<[^>]*?>', '', value)
def stripSpacesBetweenTags(value):
"Returns the given HTML with spaces between tags normalized to a single space"
return re.sub(r'>\s+<', '> <', value)
def stripEntities(value):
"Returns the given HTML with all entities (&something;) stripped"
return re.sub(r'&(?:\w+|#\d);', '', value)
def fixAmpersands(value):
"Returns the given HTML with all unencoded ampersands encoded correctly"
return unencoded_ampersands_re.sub('&amp;', value)
def urlize(text, trim_url_limit=None, nofollow=False):
"""
Converts any URLs in text into clickable links. Works on http://, https:// and
www. links. Links can have trailing punctuation (periods, commas, close-parens)
and leading punctuation (opening parens) and it'll still do the right thing.
If trim_url_limit is not None, the URLs in link text will be limited to
trim_url_limit characters.
If nofollow is True, the URLs in link text will get a rel="nofollow" attribute.
"""
trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >=limit and '...' or '')) or x
words = word_split_re.split(text)
nofollow_attr = nofollow and ' rel="nofollow"' or ''
for i, word in enumerate(words):
match = punctuation_re.match(word)
if match:
lead, middle, trail = match.groups()
if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
len(middle) > 0 and middle[0] in string.letters + string.digits and \
(middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
middle = '<a href="http://%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
if middle.startswith('http://') or middle.startswith('https://'):
middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
if '@' in middle and not middle.startswith('www.') and not ':' in middle \
and simple_email_re.match(middle):
middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
if lead + middle + trail != word:
words[i] = lead + middle + trail
return ''.join(words)
def cleanHtml(text):
"""
Cleans the given HTML. Specifically, it does the following:
* Converts <b> and <i> to <strong> and <em>.
* Encodes all ampersands correctly.
* Removes all "target" attributes from <a> tags.
* Removes extraneous HTML, such as presentational tags that open and
immediately close and <br clear="all">.
* Converts hard-coded bullets into HTML unordered lists.
* Removes stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the
bottom of the text.
"""
from text import normalizeNewlines
text = normalizeNewlines(text)
text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text)
text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text)
text = fixAmpersands(text)
# Remove all target="" attributes from <a> tags.
text = link_target_attribute_re.sub('\\1', text)
# Trim stupid HTML such as <br clear="all">.
text = html_gunk_re.sub('', text)
# Convert hard-coded bullets into HTML unordered lists.
def replace_p_tags(match):
s = match.group().replace('</p>', '</li>')
for d in DOTS:
s = s.replace('<p>%s' % d, '<li>')
return '<ul>\n%s\n</ul>' % s
text = hard_coded_bullets_re.sub(replace_p_tags, text)
# Remove stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the bottom of the text.
text = trailing_empty_content_re.sub('', text)
return text
def highlight(text, query, hlClass="hl"):
if query:
text = text.replace('<br />', '|')
query = re.escape(query).replace('\ ', '.')
m = set(re.compile("(%s)" % query, re.IGNORECASE).findall(text))
# use a set so repeated matches are only wrapped in a <span> once
for i in m:
text = re.sub("(%s)" % re.escape(i).replace('\ ', '.'), '<span class="%s">\\1</span>' % hlClass, text)
text = text.replace('|', '<br />')
return text
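A few expected values for the helpers above, using made-up input strings:
from oxutils import html
html.stripTags('<p>Visit <b>www.0xdb.org</b> today</p>')
'Visit www.0xdb.org today'
html.escape('<b>&</b>')
'&lt;b&gt;&amp;&lt;/b&gt;'
html.urlize('see www.0xdb.org', nofollow=True)
'see <a href="http://www.0xdb.org" rel="nofollow">www.0xdb.org</a>'
html.highlight('some tools to build', 'tools')
'some <span class="hl">tools</span> to build'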

29
oxutils/net.py Normal file

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import urllib
import urllib2
import chardet
# Default headers for HTTP requests.
DEFAULT_HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9b5) Gecko/2008041514 Firefox/3.0b5'}
def openUrl(url, data=None, headers=DEFAULT_HEADERS):
url = url.replace(' ', '%20')
req = urllib2.Request(url, data, headers)
return urllib2.urlopen(req)
def getUrl(url, data=None, headers=DEFAULT_HEADERS):
f = openUrl(url, data, headers)
data = f.read()
f.close()
return data
def getUrlUnicode(url):
data = getUrl(url)
encoding = chardet.detect(data)['encoding']
if not encoding:
encoding = 'latin-1'
return unicode(data, encoding)

99
oxutils/numbers.py Normal file

@@ -0,0 +1,99 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
# Written 2007 by j@mailb.org
import re
def to36(q):
"""
Converts an integer to base 36 (a useful scheme for human-sayable IDs).
>>> to36(35)
'z'
>>> to36(119292)
'2k1o'
>>> int(to36(939387374), 36)
939387374
>>> to36(0)
'0'
>>> to36(-393)
Traceback (most recent call last):
...
ValueError: must supply a positive integer
"""
if q < 0: raise ValueError, "must supply a positive integer"
letters = "0123456789abcdefghijklmnopqrstuvwxyz"
converted = []
while q != 0:
q, r = divmod(q, 36)
converted.insert(0, letters[r])
return "".join(converted) or '0'
def from36(q):
return int(q, 36)
def intValue(strValue, default=''):
try:
val = re.compile('(\d*)').findall(unicode(strValue))[0]
except:
val = default
return val
def floatValue(strValue, default=''):
try:
val = re.compile('([\d.]*)').findall(unicode(strValue))[0]
except:
val = default
return val
def formatNumber(number, long_name, short):
  """
  Format the value like a 'human-readable' file size (i.e. 13 KB, 4.1 MB,
  102 bytes, etc).
  number    - number to format
  long_name - long unit name, i.e. 'byte'
  short     - short unit name, i.e. 'B'
  """
  if not number:
    return "0 %ss" % long_name
  number = float(number)
  if number < 1024:
    return "%d %s%s" % (number, long_name, number != 1 and 's' or '')
  if number < 1024 * 1024:
    return "%d K%s" % ((number / 1024), short)
  if number < 1024 * 1024 * 1024:
    return "%.1f M%s" % (number / (1024 * 1024), short)
  if number < 1024 * 1024 * 1024 * 1024:
    return "%.2f G%s" % (number / (1024 * 1024 * 1024), short)
  return "%.3f T%s" % (number / (1024 * 1024 * 1024 * 1024), short)
def formatBytes(number):
return formatNumber(number, 'byte', 'B')
def formatBit(number):
return formatNumber(number, 'bit', 'b')
def numberThousands(n, sep=','):
  "Separates a number with a thousands separator, i.e. 1234567 -> '1,234,567'"
  if n < 1000:
    return "%s" % n
  ln = list(str(n))
  ln.reverse()
  newn = []
  while len(ln) > 3:
    newn.extend(ln[:3])
    newn.append(sep)
    ln = ln[3:]
  newn.extend(ln)
  newn.reverse()
  return "".join(newn)
def plural(amount, unit, plural='s'):
  "Returns '<amount> <unit>', pluralizing the unit unless the amount is 1 or -1"
  if abs(amount) != 1:
    if plural == 's':
      unit = unit + plural
    else:
      unit = plural
  # formatNumber() requires a unit, so format the bare amount with numberThousands()
  return "%s %s" % (numberThousands(amount), unit)
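A few expected values for the formatting helpers above, doctest-style:
>>> formatBytes(123456)
'120 KB'
>>> formatBit(1000000)
'976 Kb'
>>> numberThousands(1234567)
'1,234,567'
>>> from36(to36(119292))
119292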

140
oxutils/text.py Normal file

@@ -0,0 +1,140 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
# GPL written 2008 by j@pad.ma
import re
# Capitalizes the first letter of a string.
capfirst = lambda x: x and x[0].upper() + x[1:]
def removeSpecialCharacters(text):
  """
  Removes special characters inserted by Word.
  """
  text = text.replace(u'\u2013', '-')
  text = text.replace(u'\u2026O', "'")
  text = text.replace(u'\u2019', "'")
  # the next three characters did not survive the encoding; assuming a left
  # single quote, a modifier apostrophe and an em dash, as typically inserted by Word
  text = text.replace(u'\u2018', "'")
  text = text.replace(u'\u02bc', "'")
  text = text.replace(u'\u2014', "-")
  return text
def wrap(text, width):
"""
A word-wrap function that preserves existing line breaks and most spaces in
the text. Expects that existing line breaks are posix newlines (\n).
See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
"""
return reduce(lambda line, word, width=width: '%s%s%s' %
(line,
' \n'[(len(line[line.rfind('\n')+1:])
+ len(word.split('\n',1)[0]
) >= width)],
word),
text.split(' ')
)
def truncateString(s, num):
"Truncates a string after a certain number of characters, but ends on a word boundary"
length = int(num)
if len(s) <= length:
return s
words = s.split()
ts = ""
while words and len(ts) + len(words[0]) < length:
ts += " " + words.pop(0)
if words:
ts += "..."
return ts
def truncateWords(s, num):
"Truncates a string after a certain number of words."
length = int(num)
words = s.split()
if len(words) > length:
words = words[:length]
if not words[-1].endswith('...'):
words.append('...')
return ' '.join(words)
def getValidFilename(s):
"""
Returns the given string converted to a string that can be used for a clean
filename. Specifically, leading and trailing spaces are removed;
all non-filename-safe characters are removed.
>>> getValidFilename("john's portrait in 2004.jpg")
'john_s_portrait_in_2004.jpg'
"""
s = s.strip()
s = s.replace(' ', '_')
s = re.sub(r'[^-A-Za-z0-9_.\[\]\ ]', '_', s)
s = s.replace('__', '_').replace('__', '_')
return s
def getTextList(list_, last_word='or'):
"""
>>> getTextList(['a', 'b', 'c', 'd'])
'a, b, c or d'
>>> getTextList(['a', 'b', 'c'], 'and')
'a, b and c'
>>> getTextList(['a', 'b'], 'and')
'a and b'
>>> getTextList(['a'])
'a'
>>> getTextList([])
''
"""
if len(list_) == 0: return ''
if len(list_) == 1: return list_[0]
return '%s %s %s' % (', '.join([str(i) for i in list_][:-1]), last_word, list_[-1])
def normalizeNewlines(text):
return re.sub(r'\r\n|\r|\n', '\n', text)
def recapitalize(text):
"Recapitalizes text, placing caps after end-of-sentence punctuation."
# capwords = ()
text = text.lower()
capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
text = capsRE.sub(lambda x: x.group(1).upper(), text)
# for capword in capwords:
# capwordRE = re.compile(r'\b%s\b' % capword, re.I)
# text = capwordRE.sub(capword, text)
return text
def phone2numeric(phone):
"Converts a phone number with letters into its numeric equivalent."
letters = re.compile(r'[A-PR-Y]', re.I)
char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3',
'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
'y': '9', 'x': '9'}.get(m.group(0).lower())
return letters.sub(char2number, phone)
def compressString(s):
import cStringIO, gzip
zbuf = cStringIO.StringIO()
zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
zfile.write(s)
zfile.close()
return zbuf.getvalue()
smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
def smartSplit(text):
"""
Generator that splits a string by spaces, leaving quoted phrases together.
Supports both single and double quotes, and supports escaping quotes with
backslashes. In the output, strings will keep their initial and trailing
quote marks.
>>> list(smartSplit('This is "a person\'s" test.'))
['This', 'is', '"a person\'s"', 'test.']
"""
for bit in smart_split_re.finditer(text):
bit = bit.group(0)
if bit[0] == '"':
yield '"' + bit[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"'
elif bit[0] == "'":
yield "'" + bit[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'"
else:
yield bit
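A few expected values for the string helpers above, doctest-style:
>>> capfirst('some tools')
'Some tools'
>>> recapitalize('this is it. and so on.')
'This is it. And so on.'
>>> phone2numeric('1-800-FLOWERS')
'1-800-3569377'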

58
oxutils/timeformat.py Normal file

@@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
from numbers import plural
def ms2runtime(ms):
seconds = int(ms / 1000)
years = 0
days = 0
hours = 0
minutes = 0
if seconds >= 60:
minutes = int(seconds / 60)
seconds = seconds % 60
if minutes >= 60:
hours = int(minutes / 60)
minutes = minutes % 60
if hours >= 24:
days = int(hours / 24)
hours = hours % 24
if days >= 365:
years = int(days / 365)
days = days % 365
runtimeString = (plural(years, 'year'), plural(days, 'day'),
plural(hours,'hour'), plural(minutes, 'minute'), plural(seconds, 'second'))
runtimeString = filter(lambda x: not x.startswith('0'), runtimeString)
return " ".join(runtimeString).strip()
def ms2playtime(ms):
it = int(ms / 1000)
ms = ms - it*1000
ss = it % 60
mm = ((it-ss)/60) % 60
hh = ((it-(mm*60)-ss)/3600) % 60
if hh:
playtime= "%02d:%02d:%02d" % (hh, mm, ss)
else:
playtime= "%02d:%02d" % (mm, ss)
return playtime
def ms2time(ms):
it = int(ms / 1000)
ms = ms - it*1000
ss = it % 60
mm = ((it-ss)/60) % 60
hh = ((it-(mm*60)-ss)/3600) % 60
return "%d:%02d:%02d.%03d" % (hh, mm, ss, ms)
def time2ms(timeString):
ms = 0.0
p = timeString.split(':')
for i in range(len(p)):
ms = ms * 60 + float(p[i])
return int(ms * 1000)
def shiftTime(offset, timeString):
newTime = time2ms(timeString) + offset
return ms2time(newTime)
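Expected values for the time helpers above, doctest-style:
>>> ms2runtime(90061000)
'1 day 1 hour 1 minute 1 second'
>>> ms2playtime(3723000)
'01:02:03'
>>> ms2time(3723000)
'1:02:03.000'
>>> time2ms('1:02:03')
3723000
>>> shiftTime(-3000, '1:02:03')
'1:02:00.000'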

27
setup.py Normal file

@@ -0,0 +1,27 @@
#!/usr/bin/env python
# vi:si:et:sw=2:sts=2:ts=2
# encoding: utf-8
from setuptools import setup, find_packages
setup(
name="oxutils",
version="0.1",
description="collection of utils used to work with python",
author="ox",
author_email="utils@0xdb.org",
url="http://code.0xdb.org/python-oxutils",
download_url="http://code.0xdb.org/python-oxutils/download",
license="GPL",
packages=find_packages(),
zip_safe=False,
keywords = [
],
classifiers = [
'Development Status :: 3 - Alpha',
'Operating System :: OS Independent',
'Programming Language :: Python',
'Topic :: Software Development :: Libraries :: Python Modules',
],
)