From 4a6e2702b4036ee768ffa829b9bd97edf6e51905 Mon Sep 17 00:00:00 2001
From: j <0x006A@0x2620.org>
Date: Thu, 19 Jun 2008 11:21:21 +0200
Subject: [PATCH] reindent everything from 2 to 4 spaces (vi:si:et:sw=4:sts=4:ts=4)
---
oxutils/__init__.py | 6 +-
oxutils/cache.py | 154 +++++++--------
oxutils/format.py | 327 ++++++++++++++++----------------
oxutils/hashes.py | 16 +-
oxutils/html.py | 236 +++++++++++------------
oxutils/lang.py | 436 +++++++++++++++++++++----------------------
oxutils/net.py | 92 ++++-----
oxutils/normalize.py | 116 ++++++------
oxutils/text.py | 336 ++++++++++++++++-----------------
oxutils/torrent.py | 77 ++++----
setup.py | 46 ++---
11 files changed, 921 insertions(+), 921 deletions(-)
diff --git a/oxutils/__init__.py b/oxutils/__init__.py
index e12fc38..240a5c1 100644
--- a/oxutils/__init__.py
+++ b/oxutils/__init__.py
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
+# vi:si:et:sw=4:sts=4:ts=4
# 2008
from hashes import *
@@ -11,7 +11,7 @@ import cache
#only works if BitTornado is installed
try:
- from torrent import *
+ from torrent import *
except:
- pass
+ pass
diff --git a/oxutils/cache.py b/oxutils/cache.py
index c6bff37..e7146e1 100644
--- a/oxutils/cache.py
+++ b/oxutils/cache.py
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
+# vi:si:et:sw=4:sts=4:ts=4
# 2008
import gzip
import StringIO
@@ -19,99 +19,99 @@ from net import DEFAULT_HEADERS, getEncoding
cache_timeout = 30*24*60*60 # default is 30 days
def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
- '''
- >>> status('http://google.com')
- 200
- >>> status('http://google.com/mysearch')
- 404
- '''
- headers = getHeaders(url, data, headers)
- return int(headers['status'])
+ '''
+ >>> status('http://google.com')
+ 200
+ >>> status('http://google.com/mysearch')
+ 404
+ '''
+ headers = getHeaders(url, data, headers)
+ return int(headers['status'])
def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
- '''
- >>> exists('http://google.com')
- True
- >>> exists('http://google.com/mysearch')
- False
- '''
- s = status(url, data, headers, timeout)
- if s >= 200 and s < 400:
- return True
- return False
+ '''
+ >>> exists('http://google.com')
+ True
+ >>> exists('http://google.com/mysearch')
+ False
+ '''
+ s = status(url, data, headers, timeout)
+ if s >= 200 and s < 400:
+ return True
+ return False
def getHeaders(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
- url_cache_file = "%s.headers" % getUrlCacheFile(url, data, headers)
- url_headers = loadUrlCache(url_cache_file, timeout)
- if url_headers:
- url_headers = simplejson.loads(url_headers)
- else:
- url_headers = net.getHeaders(url, data, headers)
- saveUrlHeaders(url_cache_file, url_headers)
- return url_headers
+ url_cache_file = "%s.headers" % getUrlCacheFile(url, data, headers)
+ url_headers = loadUrlCache(url_cache_file, timeout)
+ if url_headers:
+ url_headers = simplejson.loads(url_headers)
+ else:
+ url_headers = net.getHeaders(url, data, headers)
+ saveUrlHeaders(url_cache_file, url_headers)
+ return url_headers
def getUrl(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
- url_cache_file = getUrlCacheFile(url, data, headers)
- result = loadUrlCache(url_cache_file, timeout)
- if not result:
- try:
- url_headers, result = net.getUrl(url, data, headers, returnHeaders=True)
- except urllib2.HTTPError, e:
- e.headers['Status'] = "%s" % e.code
- url_headers = dict(e.headers)
- result = e.read()
- if url_headers.get('content-encoding', None) == 'gzip':
- result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
- saveUrlCache(url_cache_file, result, url_headers)
- return result
+ url_cache_file = getUrlCacheFile(url, data, headers)
+ result = loadUrlCache(url_cache_file, timeout)
+ if not result:
+ try:
+ url_headers, result = net.getUrl(url, data, headers, returnHeaders=True)
+ except urllib2.HTTPError, e:
+ e.headers['Status'] = "%s" % e.code
+ url_headers = dict(e.headers)
+ result = e.read()
+ if url_headers.get('content-encoding', None) == 'gzip':
+ result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
+ saveUrlCache(url_cache_file, result, url_headers)
+ return result
def getUrlUnicode(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, _getUrl=getUrl):
- data = _getUrl(url, data, headers, timeout)
- encoding = getEncoding(data)
- if not encoding:
- encoding = 'latin-1'
- return unicode(data, encoding)
+ data = _getUrl(url, data, headers, timeout)
+ encoding = getEncoding(data)
+ if not encoding:
+ encoding = 'latin-1'
+ return unicode(data, encoding)
def getCacheBase():
- 'cache base is eather ~/.ox/cache or can set via env variable oxCACHE'
- return os.environ.get('oxCACHE', os.path.expanduser('~/.ox/cache'))
+ 'cache base is either ~/.ox/cache or can be set via the env variable oxCACHE'
+ return os.environ.get('oxCACHE', os.path.expanduser('~/.ox/cache'))
def getUrlCacheFile(url, data=None, headers=DEFAULT_HEADERS):
- if data:
- url_hash = sha.sha(url + '?' + data).hexdigest()
- else:
- url_hash = sha.sha(url).hexdigest()
- domain = ".".join(urlparse.urlparse(url)[1].split('.')[-2:])
- return os.path.join(getCacheBase(), domain, url_hash[:2], url_hash[2:4], url_hash[4:6], url_hash)
+ if data:
+ url_hash = sha.sha(url + '?' + data).hexdigest()
+ else:
+ url_hash = sha.sha(url).hexdigest()
+ domain = ".".join(urlparse.urlparse(url)[1].split('.')[-2:])
+ return os.path.join(getCacheBase(), domain, url_hash[:2], url_hash[2:4], url_hash[4:6], url_hash)
def loadUrlCache(url_cache_file, timeout=cache_timeout):
- if timeout == 0:
+ if timeout == 0:
+ return None
+ if os.path.exists(url_cache_file):
+ ctime = os.stat(url_cache_file).st_ctime
+ now = time.mktime(time.localtime())
+ file_age = now-ctime
+ if timeout < 0 or file_age < timeout:
+ f = open(url_cache_file)
+ data = f.read()
+ f.close()
+ return data
return None
- if os.path.exists(url_cache_file):
- ctime = os.stat(url_cache_file).st_ctime
- now = time.mktime(time.localtime())
- file_age = now-ctime
- if timeout < 0 or file_age < timeout:
- f = open(url_cache_file)
- data = f.read()
- f.close()
- return data
- return None
def saveUrlCache(url_cache_file, data, headers):
- folder = os.path.dirname(url_cache_file)
- if not os.path.exists(folder):
- os.makedirs(folder)
- f = open(url_cache_file, 'w')
- f.write(data)
- f.close()
- saveUrlHeaders("%s.headers" % url_cache_file, headers)
+ folder = os.path.dirname(url_cache_file)
+ if not os.path.exists(folder):
+ os.makedirs(folder)
+ f = open(url_cache_file, 'w')
+ f.write(data)
+ f.close()
+ saveUrlHeaders("%s.headers" % url_cache_file, headers)
def saveUrlHeaders(url_cache_file, headers):
- folder = os.path.dirname(url_cache_file)
- if not os.path.exists(folder):
- os.makedirs(folder)
- f = open(url_cache_file, 'w')
- f.write(simplejson.dumps(headers))
- f.close()
+ folder = os.path.dirname(url_cache_file)
+ if not os.path.exists(folder):
+ os.makedirs(folder)
+ f = open(url_cache_file, 'w')
+ f.write(simplejson.dumps(headers))
+ f.close()
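For review context, a minimal usage sketch of the reindented cache module (assumes oxutils is importable and the host is online; the URL is illustrative):

    from oxutils import cache

    # First call hits the network; later calls are served from
    # ~/.ox/cache (or $oxCACHE) until the 30-day timeout expires.
    data = cache.getUrl('http://google.com')
    page = cache.getUrlUnicode('http://google.com')  # decoded via chardet
    print cache.status('http://google.com')          # 200, from cached headers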
diff --git a/oxutils/format.py b/oxutils/format.py
index b24258d..da5eb76 100644
--- a/oxutils/format.py
+++ b/oxutils/format.py
@@ -1,208 +1,207 @@
# -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
+# vi:si:et:sw=4:sts=4:ts=4
import math
import re
def to36(q):
- """
- Converts an integer to base 36 (a useful scheme for human-sayable IDs).
+ """
+ Converts an integer to base 36 (a useful scheme for human-sayable IDs).
- >>> to36(35)
- 'z'
- >>> to36(119292)
- '2k1o'
- >>> int(to36(939387374), 36)
- 939387374
- >>> to36(0)
- '0'
- >>> to36(-393)
- Traceback (most recent call last):
- ...
- ValueError: must supply a positive integer
- """
- if q < 0: raise ValueError, "must supply a positive integer"
- letters = "0123456789abcdefghijklmnopqrstuvwxyz"
- converted = []
- while q != 0:
- q, r = divmod(q, 36)
- converted.insert(0, letters[r])
- return "".join(converted) or '0'
+ >>> to36(35)
+ 'z'
+ >>> to36(119292)
+ '2k1o'
+ >>> int(to36(939387374), 36)
+ 939387374
+ >>> to36(0)
+ '0'
+ >>> to36(-393)
+ Traceback (most recent call last):
+ ...
+ ValueError: must supply a positive integer
+ """
+ if q < 0: raise ValueError, "must supply a positive integer"
+ letters = "0123456789abcdefghijklmnopqrstuvwxyz"
+ converted = []
+ while q != 0:
+ q, r = divmod(q, 36)
+ converted.insert(0, letters[r])
+ return "".join(converted) or '0'
def from36(q):
- return int(q, 36)
+ return int(q, 36)
def intValue(strValue, default=''):
- try:
- val = re.compile('(\d+)').findall(unicode(strValue).strip())[0]
- except:
- val = default
- return val
+ try:
+ val = re.compile('(\d+)').findall(unicode(strValue).strip())[0]
+ except:
+ val = default
+ return val
def test_intValue():
- assert intValue('abc23') == '23'
- assert intValue(' abc23') == '23'
- assert intValue(' abc') == ''
+ assert intValue('abc23') == '23'
+ assert intValue(' abc23') == '23'
+ assert intValue(' abc') == ''
def floatValue(strValue, default=''):
- try:
- val = re.compile('([\d.]+)').findall(unicode(strValue).strip())[0]
- except:
- val = default
- return val
+ try:
+ val = re.compile('([\d.]+)').findall(unicode(strValue).strip())[0]
+ except:
+ val = default
+ return val
def test_floatValue():
- print "floatValue"
- assert floatValue('abc23.4') == '23.4'
- assert floatValue(' abc23.4') == '23.4'
- assert floatValue(' abc') == ''
+ assert floatValue('abc23.4') == '23.4'
+ assert floatValue(' abc23.4') == '23.4'
+ assert floatValue(' abc') == ''
def formatNumber(number, longName, shortName):
- """
- Return the number in a human-readable format (23 KB, 23.4 MB, 23.42 GB)
-
- >>> formatNumber(123, 'Byte', 'B')
- '123 Bytes'
+ """
+ Return the number in a human-readable format (23 KB, 23.4 MB, 23.42 GB)
+
+ >>> formatNumber(123, 'Byte', 'B')
+ '123 Bytes'
- >>> formatNumber(1234, 'Byte', 'B')
- '1 KB'
+ >>> formatNumber(1234, 'Byte', 'B')
+ '1 KB'
- >>> formatNumber(1234567, 'Byte', 'B')
- '1.2 MB'
+ >>> formatNumber(1234567, 'Byte', 'B')
+ '1.2 MB'
- >>> formatNumber(1234567890, 'Byte', 'B')
- '1.15 GB'
+ >>> formatNumber(1234567890, 'Byte', 'B')
+ '1.15 GB'
- >>> formatNumber(1234567890123456789, 'Byte', 'B')
- '1,096.5166 PB'
+ >>> formatNumber(1234567890123456789, 'Byte', 'B')
+ '1,096.5166 PB'
- """
- if number < 1024:
- return '%s %s%s' % (formatThousands(number), longName, number != 1 and 's' or '')
- prefix = ['K', 'M', 'G', 'T', 'P']
- for i in range(5):
- if number < math.pow(1024, i + 2) or i == 4:
- n = number / math.pow(1024, i + 1)
- return '%s %s%s' % (formatThousands('%.*f' % (i, n)), prefix[i], shortName)
+ """
+ if number < 1024:
+ return '%s %s%s' % (formatThousands(number), longName, number != 1 and 's' or '')
+ prefix = ['K', 'M', 'G', 'T', 'P']
+ for i in range(5):
+ if number < math.pow(1024, i + 2) or i == 4:
+ n = number / math.pow(1024, i + 1)
+ return '%s %s%s' % (formatThousands('%.*f' % (i, n)), prefix[i], shortName)
def formatThousands(number, separator = ','):
- """
- Return the number with separators (1,000,000)
-
- >>> formatThousands(1)
- '1'
- >>> formatThousands(1000)
- '1,000'
- >>> formatThousands(1000000)
- '1,000,000'
- """
- string = str(number).split('.')
- l = []
- for i, character in enumerate(reversed(string[0])):
- if i and (not (i % 3)):
- l.insert(0, separator)
- l.insert(0, character)
- string[0] = ''.join(l)
- return '.'.join(string)
+ """
+ Return the number with separators (1,000,000)
+
+ >>> formatThousands(1)
+ '1'
+ >>> formatThousands(1000)
+ '1,000'
+ >>> formatThousands(1000000)
+ '1,000,000'
+ """
+ string = str(number).split('.')
+ l = []
+ for i, character in enumerate(reversed(string[0])):
+ if i and (not (i % 3)):
+ l.insert(0, separator)
+ l.insert(0, character)
+ string[0] = ''.join(l)
+ return '.'.join(string)
def formatBits(number):
- return formatNumber(number, 'bit', 'b')
+ return formatNumber(number, 'bit', 'b')
def formatBytes(number):
- return formatNumber(number, 'byte', 'B')
+ return formatNumber(number, 'byte', 'B')
def formatPixels(number):
- return formatNumber(number, 'pixel', 'px')
+ return formatNumber(number, 'pixel', 'px')
def plural(amount, unit, plural='s'):
- '''
- >>> plural(1, 'unit')
- '1 unit'
- >>> plural(2, 'unit')
- '2 units'
- '''
- if abs(amount) != 1:
- if plural == 's':
- unit = unit + plural
- else: unit = plural
- return "%s %s" % (formatThousands(amount), unit)
+ '''
+ >>> plural(1, 'unit')
+ '1 unit'
+ >>> plural(2, 'unit')
+ '2 units'
+ '''
+ if abs(amount) != 1:
+ if plural == 's':
+ unit = unit + plural
+ else: unit = plural
+ return "%s %s" % (formatThousands(amount), unit)
def ms2runtime(ms):
- '''
- >>> ms2runtime(5000)
- '5 seconds'
- >>> ms2runtime(500000)
- '8 minutes 20 seconds'
- >>> ms2runtime(50000000)
- '13 hours 53 minutes 20 seconds'
- >>> ms2runtime(50000000-20000)
- '13 hours 53 minutes'
- '''
- seconds = int(ms / 1000)
- years = 0
- days = 0
- hours = 0
- minutes = 0
- if seconds >= 60:
- minutes = int(seconds / 60)
- seconds = seconds % 60
- if minutes >= 60:
- hours = int(minutes / 60)
- minutes = minutes % 60
- if hours >= 24:
- days = int(hours / 24)
- hours = hours % 24
- if days >= 365:
- years = int(days / 365)
- days = days % 365
- runtimeString = (plural(years, 'year'), plural(days, 'day'),
- plural(hours,'hour'), plural(minutes, 'minute'), plural(seconds, 'second'))
- runtimeString = filter(lambda x: not x.startswith('0'), runtimeString)
- return " ".join(runtimeString).strip()
+ '''
+ >>> ms2runtime(5000)
+ '5 seconds'
+ >>> ms2runtime(500000)
+ '8 minutes 20 seconds'
+ >>> ms2runtime(50000000)
+ '13 hours 53 minutes 20 seconds'
+ >>> ms2runtime(50000000-20000)
+ '13 hours 53 minutes'
+ '''
+ seconds = int(ms / 1000)
+ years = 0
+ days = 0
+ hours = 0
+ minutes = 0
+ if seconds >= 60:
+ minutes = int(seconds / 60)
+ seconds = seconds % 60
+ if minutes >= 60:
+ hours = int(minutes / 60)
+ minutes = minutes % 60
+ if hours >= 24:
+ days = int(hours / 24)
+ hours = hours % 24
+ if days >= 365:
+ years = int(days / 365)
+ days = days % 365
+ runtimeString = (plural(years, 'year'), plural(days, 'day'),
+ plural(hours,'hour'), plural(minutes, 'minute'), plural(seconds, 'second'))
+ runtimeString = filter(lambda x: not x.startswith('0'), runtimeString)
+ return " ".join(runtimeString).strip()
def ms2playtime(ms):
- '''
- >>> ms2playtime(5000)
- '00:05'
- >>> ms2playtime(500000)
- '08:20'
- >>> ms2playtime(50000000)
- '13:53:20'
- '''
- it = int(ms / 1000)
- ms = ms - it*1000
- ss = it % 60
- mm = ((it-ss)/60) % 60
- hh = ((it-(mm*60)-ss)/3600) % 60
- if hh:
- playtime= "%02d:%02d:%02d" % (hh, mm, ss)
- else:
- playtime= "%02d:%02d" % (mm, ss)
- return playtime
+ '''
+ >>> ms2playtime(5000)
+ '00:05'
+ >>> ms2playtime(500000)
+ '08:20'
+ >>> ms2playtime(50000000)
+ '13:53:20'
+ '''
+ it = int(ms / 1000)
+ ms = ms - it*1000
+ ss = it % 60
+ mm = ((it-ss)/60) % 60
+ hh = ((it-(mm*60)-ss)/3600) % 60
+ if hh:
+ playtime= "%02d:%02d:%02d" % (hh, mm, ss)
+ else:
+ playtime= "%02d:%02d" % (mm, ss)
+ return playtime
def ms2time(ms):
- '''
- >>> ms2time(44592123)
- '12:23:12.123'
- '''
- it = int(ms / 1000)
- ms = ms - it*1000
- ss = it % 60
- mm = ((it-ss)/60) % 60
- hh = ((it-(mm*60)-ss)/3600) % 60
- return "%d:%02d:%02d.%03d" % (hh, mm, ss, ms)
+ '''
+ >>> ms2time(44592123)
+ '12:23:12.123'
+ '''
+ it = int(ms / 1000)
+ ms = ms - it*1000
+ ss = it % 60
+ mm = ((it-ss)/60) % 60
+ hh = ((it-(mm*60)-ss)/3600) % 60
+ return "%d:%02d:%02d.%03d" % (hh, mm, ss, ms)
def time2ms(timeString):
- '''
- >>> time2ms('12:23:12.123')
- 44592123
- '''
- ms = 0.0
- p = timeString.split(':')
- for i in range(len(p)):
- ms = ms * 60 + float(p[i])
- return int(ms * 1000)
+ '''
+ >>> time2ms('12:23:12.123')
+ 44592123
+ '''
+ ms = 0.0
+ p = timeString.split(':')
+ for i in range(len(p)):
+ ms = ms * 60 + float(p[i])
+ return int(ms * 1000)
def shiftTime(offset, timeString):
- newTime = time2ms(timeString) + offset
- return ms2time(newTime)
+ newTime = time2ms(timeString) + offset
+ return ms2time(newTime)
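The doctests above double as usage; a short sketch tying the format helpers together (values illustrative):

    from oxutils import format

    print format.formatBytes(1234567890)         # '1.15 GB'
    print format.ms2runtime(500000)              # '8 minutes 20 seconds'
    print format.ms2playtime(500000)             # '08:20'
    print format.time2ms('12:23:12.123')         # 44592123
    print format.shiftTime(1000, '0:00:01.000')  # '0:00:02.000'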
diff --git a/oxutils/hashes.py b/oxutils/hashes.py
index 800c104..4d03684 100644
--- a/oxutils/hashes.py
+++ b/oxutils/hashes.py
@@ -1,17 +1,17 @@
# -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
+# vi:si:et:sw=4:sts=4:ts=4
# GPL written 2008 by j@pad.ma
import sha
import os
def sha1sum(filename):
- sha1 = sha.new()
- file=open(filename)
- buffer=file.read(4096)
- while buffer:
- sha1.update(buffer)
+ sha1 = sha.new()
+ file=open(filename)
buffer=file.read(4096)
- file.close()
- return sha1.hexdigest()
+ while buffer:
+ sha1.update(buffer)
+ buffer=file.read(4096)
+ file.close()
+ return sha1.hexdigest()
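A quick sketch of the chunked-read helper (path is illustrative):

    from oxutils.hashes import sha1sum

    # Reads the file in 4096-byte chunks, so large files
    # never have to fit in memory.
    print sha1sum('/path/to/some/file')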
diff --git a/oxutils/html.py b/oxutils/html.py
index 52be5ff..afceafb 100644
--- a/oxutils/html.py
+++ b/oxutils/html.py
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
+# vi:si:et:sw=4:sts=4:ts=4
# GPL written 2008 by j@pad.ma
import re
import string
@@ -26,147 +26,147 @@ trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
del x # Temporary variable
def escape(html):
- '''
- Returns the given HTML with ampersands, quotes and carets encoded
+ '''
+ Returns the given HTML with ampersands, quotes and carets encoded
- >>> escape('html "test" & <brothers>')
- 'html &quot;test&quot; &amp; &lt;brothers&gt;'
- '''
- if not isinstance(html, basestring):
- html = str(html)
- return html.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&#39;')
+ >>> escape('html "test" & <brothers>')
+ 'html &quot;test&quot; &amp; &lt;brothers&gt;'
+ '''
+ if not isinstance(html, basestring):
+ html = str(html)
+ return html.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&#39;')
def linebreaks(value):
- '''
- Converts newlines into <p> and <br />s
- '''
- value = re.sub(r'\r\n|\r|\n', '\n', value) # normalize newlines
- paras = re.split('\n{2,}', value)
- paras = ['<p>%s</p>' % p.strip().replace('\n', '<br />') for p in paras]
- return '\n\n'.join(paras)
+ '''
+ Converts newlines into <p> and <br />s
+ '''
+ value = re.sub(r'\r\n|\r|\n', '\n', value) # normalize newlines
+ paras = re.split('\n{2,}', value)
+ paras = ['<p>%s</p>' % p.strip().replace('\n', '<br />') for p in paras]
+ return '\n\n'.join(paras)
def stripTags(value):
- """
- Returns the given HTML with all tags stripped
-
- >>> stripTags('some <h2>title</h2> <script>asdfasdf</script>')
- 'some title asdfasdf'
- """
- return re.sub(r'<[^>]*?>', '', value)
+ """
+ Returns the given HTML with all tags stripped
+
+ >>> stripTags('some <h2>title</h2> <script>asdfasdf</script>')
+ 'some title asdfasdf'
+ """
+ return re.sub(r'<[^>]*?>', '', value)
def stripSpacesBetweenTags(value):
- "Returns the given HTML with spaces between tags normalized to a single space"
- return re.sub(r'>\s+<', '> <', value)
+ "Returns the given HTML with spaces between tags normalized to a single space"
+ return re.sub(r'>\s+<', '> <', value)
def stripEntities(value):
- "Returns the given HTML with all entities (&something;) stripped"
- return re.sub(r'&(?:\w+|#\d);', '', value)
+ "Returns the given HTML with all entities (&something;) stripped"
+ return re.sub(r'&(?:\w+|#\d);', '', value)
def fixAmpersands(value):
- "Returns the given HTML with all unencoded ampersands encoded correctly"
- return unencoded_ampersands_re.sub('&', value)
+ "Returns the given HTML with all unencoded ampersands encoded correctly"
+ return unencoded_ampersands_re.sub('&', value)
def urlize(text, trim_url_limit=None, nofollow=False):
- """
- Converts any URLs in text into clickable links. Works on http://, https:// and
- www. links. Links can have trailing punctuation (periods, commas, close-parens)
- and leading punctuation (opening parens) and it'll still do the right thing.
+ """
+ Converts any URLs in text into clickable links. Works on http://, https:// and
+ www. links. Links can have trailing punctuation (periods, commas, close-parens)
+ and leading punctuation (opening parens) and it'll still do the right thing.
- If trim_url_limit is not None, the URLs in link text will be limited to
- trim_url_limit characters.
+ If trim_url_limit is not None, the URLs in link text will be limited to
+ trim_url_limit characters.
- If nofollow is True, the URLs in link text will get a rel="nofollow" attribute.
- """
- trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >=limit and '...' or '')) or x
- words = word_split_re.split(text)
- nofollow_attr = nofollow and ' rel="nofollow"' or ''
- for i, word in enumerate(words):
- match = punctuation_re.match(word)
- if match:
- lead, middle, trail = match.groups()
- if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
- len(middle) > 0 and middle[0] in string.letters + string.digits and \
- (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
- middle = '<a href="http://%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
- if middle.startswith('http://') or middle.startswith('https://'):
- middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
- if '@' in middle and not middle.startswith('www.') and not ':' in middle \
- and simple_email_re.match(middle):
- middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
- if lead + middle + trail != word:
- words[i] = lead + middle + trail
- return ''.join(words)
+ If nofollow is True, the URLs in link text will get a rel="nofollow" attribute.
+ """
+ trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >=limit and '...' or '')) or x
+ words = word_split_re.split(text)
+ nofollow_attr = nofollow and ' rel="nofollow"' or ''
+ for i, word in enumerate(words):
+ match = punctuation_re.match(word)
+ if match:
+ lead, middle, trail = match.groups()
+ if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
+ len(middle) > 0 and middle[0] in string.letters + string.digits and \
+ (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
+ middle = '<a href="http://%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
+ if middle.startswith('http://') or middle.startswith('https://'):
+ middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
+ if '@' in middle and not middle.startswith('www.') and not ':' in middle \
+ and simple_email_re.match(middle):
+ middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
+ if lead + middle + trail != word:
+ words[i] = lead + middle + trail
+ return ''.join(words)
def cleanHtml(text):
- """
- Cleans the given HTML. Specifically, it does the following:
- * Converts and to and .
- * Encodes all ampersands correctly.
- * Removes all "target" attributes from tags.
- * Removes extraneous HTML, such as presentational tags that open and
- immediately close and
.
- * Converts hard-coded bullets into HTML unordered lists.
- * Removes stuff like "
", but only if it's at the
- bottom of the text.
- """
- from text import normalizeNewlines
- text = normalizeNewlines(text)
- text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text)
- text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text)
- text = fixAmpersands(text)
- # Remove all target="" attributes from tags.
- text = link_target_attribute_re.sub('\\1', text)
- # Trim stupid HTML such as
.
- text = html_gunk_re.sub('', text)
- # Convert hard-coded bullets into HTML unordered lists.
- def replace_p_tags(match):
- s = match.group().replace('', '')
- for d in DOTS:
- s = s.replace('%s' % d, '
')
- return '' % s
- text = hard_coded_bullets_re.sub(replace_p_tags, text)
- # Remove stuff like "
", but only if it's at the bottom of the text.
- text = trailing_empty_content_re.sub('', text)
- return text
+ """
+ Cleans the given HTML. Specifically, it does the following:
+ * Converts and to and .
+ * Encodes all ampersands correctly.
+ * Removes all "target" attributes from tags.
+ * Removes extraneous HTML, such as presentational tags that open and
+ immediately close and
.
+ * Converts hard-coded bullets into HTML unordered lists.
+ * Removes stuff like "
", but only if it's at the
+ bottom of the text.
+ """
+ from text import normalizeNewlines
+ text = normalizeNewlines(text)
+ text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text)
+ text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text)
+ text = fixAmpersands(text)
+ # Remove all target="" attributes from tags.
+ text = link_target_attribute_re.sub('\\1', text)
+ # Trim stupid HTML such as
.
+ text = html_gunk_re.sub('', text)
+ # Convert hard-coded bullets into HTML unordered lists.
+ def replace_p_tags(match):
+ s = match.group().replace('', '')
+ for d in DOTS:
+ s = s.replace('%s' % d, '
')
+ return '' % s
+ text = hard_coded_bullets_re.sub(replace_p_tags, text)
+ # Remove stuff like "
", but only if it's at the bottom of the text.
+ text = trailing_empty_content_re.sub('', text)
+ return text
# This pattern matches a character entity reference (a decimal numeric
# references, a hexadecimal numeric reference, or a named reference).
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
def decodeHtml(html):
- """
- >>> decodeHtml('me &amp; you and &#36;&#38;%')
- u'me & you and $&%'
- """
- if type(html) != unicode:
- html = unicode(html)[:]
- if type(html) is unicode:
- uchr = unichr
- else:
- uchr = lambda value: value > 255 and unichr(value) or chr(value)
- def entitydecode(match, uchr=uchr):
- entity = match.group(1)
- if entity.startswith('#x'):
- return uchr(int(entity[2:], 16))
- elif entity.startswith('#'):
- return uchr(int(entity[1:]))
- elif entity in name2codepoint:
- return uchr(name2codepoint[entity])
+ """
+ >>> decodeHtml('me &amp; you and &#36;&#38;%')
+ u'me & you and $&%'
+ """
+ if type(html) != unicode:
+ html = unicode(html)[:]
+ if type(html) is unicode:
+ uchr = unichr
else:
- return match.group(0)
- return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ')
+ uchr = lambda value: value > 255 and unichr(value) or chr(value)
+ def entitydecode(match, uchr=uchr):
+ entity = match.group(1)
+ if entity.startswith('#x'):
+ return uchr(int(entity[2:], 16))
+ elif entity.startswith('#'):
+ return uchr(int(entity[1:]))
+ elif entity in name2codepoint:
+ return uchr(name2codepoint[entity])
+ else:
+ return match.group(0)
+ return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ')
def highlight(text, query, hlClass="hl"):
- """
- >>> highlight('me &amp; you and $&%', 'and')
- 'me &amp; you <span class="hl">and</span> $&%'
- """
- if query:
- text = text.replace('<br />', '|')
- query = re.escape(query).replace('\ ', '.')
- m = re.compile("(%s)" % query, re.IGNORECASE).findall(text)
- for i in m:
- text = re.sub("(%s)" % re.escape(i).replace('\ ', '.'), '<span class="%s">\\1</span>' % hlClass, text)
- text = text.replace('|', '<br />')
- return text
+ """
+ >>> highlight('me &amp; you and $&%', 'and')
+ 'me &amp; you <span class="hl">and</span> $&%'
+ """
+ if query:
+ text = text.replace('<br />', '|')
+ query = re.escape(query).replace('\ ', '.')
+ m = re.compile("(%s)" % query, re.IGNORECASE).findall(text)
+ for i in m:
+ text = re.sub("(%s)" % re.escape(i).replace('\ ', '.'), '<span class="%s">\\1</span>' % hlClass, text)
+ text = text.replace('|', '<br />')
+ return text
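A short sketch of the escape/strip/highlight helpers (strings illustrative):

    from oxutils import html

    print html.escape('a < b & c')               # 'a &lt; b &amp; c'
    print html.stripTags('some <h2>title</h2>')  # 'some title'
    # highlight() wraps case-insensitive matches in <span class="hl">...</span>
    print html.highlight('me and you', 'and')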
diff --git a/oxutils/lang.py b/oxutils/lang.py
index 10d0637..964ac9f 100644
--- a/oxutils/lang.py
+++ b/oxutils/lang.py
@@ -1,236 +1,236 @@
# -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
+# vi:si:et:sw=4:sts=4:ts=4
_iso639_languages = [
- ("Unknown", "", "", "und"),
- ("Afar", "", "aa", "aar"),
- ("Abkhazian", "", "ab", "abk"),
- ("Afrikaans", "", "af", "afr"),
- ("Akan", "", "ak", "aka"),
- ("Albanian", "", "sq", "sqi"),
- ("Amharic", "", "am", "amh"),
- ("Arabic", "", "ar", "ara"),
- ("Aragonese", "", "an", "arg"),
- ("Armenian", "", "hy", "hye"),
- ("Assamese", "", "as", "asm"),
- ("Avaric", "", "av", "ava"),
- ("Avestan", "", "ae", "ave"),
- ("Aymara", "", "ay", "aym"),
- ("Azerbaijani", "", "az", "aze"),
- ("Bashkir", "", "ba", "bak"),
- ("Bambara", "", "bm", "bam"),
- ("Basque", "", "eu", "eus"),
- ("Belarusian", "", "be", "bel"),
- ("Bengali", "", "bn", "ben"),
- ("Bihari", "", "bh", "bih"),
- ("Bislama", "", "bi", "bis"),
- ("Bosnian", "", "bs", "bos"),
- ("Breton", "", "br", "bre"),
- ("Bulgarian", "", "bg", "bul"),
- ("Burmese", "", "my", "mya"),
- ("Catalan", "", "ca", "cat"),
- ("Chamorro", "", "ch", "cha"),
- ("Chechen", "", "ce", "che"),
- ("Chinese", "", "zh", "zho"),
- ("Church Slavic", "", "cu", "chu"),
- ("Chuvash", "", "cv", "chv"),
- ("Cornish", "", "kw", "cor"),
- ("Corsican", "", "co", "cos"),
- ("Cree", "", "cr", "cre"),
- ("Czech", "", "cs", "ces"),
- ("Danish", "Dansk", "da", "dan"),
- ("Divehi", "", "dv", "div"),
- ("Dutch", "Nederlands", "nl", "nld"),
- ("Dzongkha", "", "dz", "dzo"),
- ("English", "English", "en", "eng"),
- ("Esperanto", "", "eo", "epo"),
- ("Estonian", "", "et", "est"),
- ("Ewe", "", "ee", "ewe"),
- ("Faroese", "", "fo", "fao"),
- ("Fijian", "", "fj", "fij"),
- ("Finnish", "Suomi", "fi", "fin"),
- ("French", "Francais", "fr", "fra"),
- ("Western Frisian", "", "fy", "fry"),
- ("Fulah", "", "ff", "ful"),
- ("Georgian", "", "ka", "kat"),
- ("German", "Deutsch", "de", "deu"),
- ("Gaelic (Scots)", "", "gd", "gla"),
- ("Irish", "", "ga", "gle"),
- ("Galician", "", "gl", "glg"),
- ("Manx", "", "gv", "glv"),
- ("Greek, Modern", "", "el", "ell"),
- ("Guarani", "", "gn", "grn"),
- ("Gujarati", "", "gu", "guj"),
- ("Haitian", "", "ht", "hat"),
- ("Hausa", "", "ha", "hau"),
- ("Hebrew", "", "he", "heb"),
- ("Herero", "", "hz", "her"),
- ("Hindi", "", "hi", "hin"),
- ("Hiri Motu", "", "ho", "hmo"),
- ("Hungarian", "Magyar", "hu", "hun"),
- ("Igbo", "", "ig", "ibo"),
- ("Icelandic", "Islenska", "is", "isl"),
- ("Ido", "", "io", "ido"),
- ("Sichuan Yi", "", "ii", "iii"),
- ("Inuktitut", "", "iu", "iku"),
- ("Interlingue", "", "ie", "ile"),
- ("Interlingua", "", "ia", "ina"),
- ("Indonesian", "", "id", "ind"),
- ("Inupiaq", "", "ik", "ipk"),
- ("Italian", "Italiano", "it", "ita"),
- ("Javanese", "", "jv", "jav"),
- ("Japanese", "", "ja", "jpn"),
- ("Kalaallisut (Greenlandic)", "", "kl", "kal"),
- ("Kannada", "", "kn", "kan"),
- ("Kashmiri", "", "ks", "kas"),
- ("Kanuri", "", "kr", "kau"),
- ("Kazakh", "", "kk", "kaz"),
- ("Central Khmer", "", "km", "khm"),
- ("Kikuyu", "", "ki", "kik"),
- ("Kinyarwanda", "", "rw", "kin"),
- ("Kirghiz", "", "ky", "kir"),
- ("Komi", "", "kv", "kom"),
- ("Kongo", "", "kg", "kon"),
- ("Korean", "", "ko", "kor"),
- ("Kuanyama", "", "kj", "kua"),
- ("Kurdish", "", "ku", "kur"),
- ("Lao", "", "lo", "lao"),
- ("Latin", "", "la", "lat"),
- ("Latvian", "", "lv", "lav"),
- ("Limburgan", "", "li", "lim"),
- ("Lingala", "", "ln", "lin"),
- ("Lithuanian", "", "lt", "lit"),
- ("Luxembourgish", "", "lb", "ltz"),
- ("Luba-Katanga", "", "lu", "lub"),
- ("Ganda", "", "lg", "lug"),
- ("Macedonian", "", "mk", "mkd"),
- ("Marshallese", "", "mh", "mah"),
- ("Malayalam", "", "ml", "mal"),
- ("Maori", "", "mi", "mri"),
- ("Marathi", "", "mr", "mar"),
- ("Malay", "", "ms", "msa"),
- ("Malagasy", "", "mg", "mlg"),
- ("Maltese", "", "mt", "mlt"),
- ("Moldavian", "", "mo", "mol"),
- ("Mongolian", "", "mn", "mon"),
- ("Nauru", "", "na", "nau"),
- ("Navajo", "", "nv", "nav"),
- ("Ndebele, South", "", "nr", "nbl"),
- ("Ndebele, North", "", "nd", "nde"),
- ("Ndonga", "", "ng", "ndo"),
- ("Nepali", "", "ne", "nep"),
- ("Norwegian Nynorsk", "", "nn", "nno"),
- ("Norwegian Bokmål", "", "nb", "nob"),
- ("Norwegian", "Norsk", "no", "nor"),
- ("Chichewa; Nyanja", "", "ny", "nya"),
- ("Occitan (post 1500); Provençal", "", "oc", "oci"),
- ("Ojibwa", "", "oj", "oji"),
- ("Oriya", "", "or", "ori"),
- ("Oromo", "", "om", "orm"),
- ("Ossetian; Ossetic", "", "os", "oss"),
- ("Panjabi", "", "pa", "pan"),
- ("Persian", "", "fa", "fas"),
- ("Pali", "", "pi", "pli"),
- ("Polish", "", "pl", "pol"),
- ("Portuguese", "Portugues", "pt", "por"),
- ("Pushto", "", "ps", "pus"),
- ("Quechua", "", "qu", "que"),
- ("Romansh", "", "rm", "roh"),
- ("Romanian", "", "ro", "ron"),
- ("Rundi", "", "rn", "run"),
- ("Russian", "", "ru", "rus"),
- ("Sango", "", "sg", "sag"),
- ("Sanskrit", "", "sa", "san"),
- ("Serbian", "", "sr", "srp"),
- ("Croatian", "Hrvatski", "hr", "hrv"),
- ("Sinhala", "", "si", "sin"),
- ("Slovak", "", "sk", "slk"),
- ("Slovenian", "", "sl", "slv"),
- ("Northern Sami", "", "se", "sme"),
- ("Samoan", "", "sm", "smo"),
- ("Shona", "", "sn", "sna"),
- ("Sindhi", "", "sd", "snd"),
- ("Somali", "", "so", "som"),
- ("Sotho, Southern", "", "st", "sot"),
- ("Spanish", "Espanol", "es", "spa"),
- ("Sardinian", "", "sc", "srd"),
- ("Swati", "", "ss", "ssw"),
- ("Sundanese", "", "su", "sun"),
- ("Swahili", "", "sw", "swa"),
- ("Swedish", "Svenska", "sv", "swe"),
- ("Tahitian", "", "ty", "tah"),
- ("Tamil", "", "ta", "tam"),
- ("Tatar", "", "tt", "tat"),
- ("Telugu", "", "te", "tel"),
- ("Tajik", "", "tg", "tgk"),
- ("Tagalog", "", "tl", "tgl"),
- ("Thai", "", "th", "tha"),
- ("Tibetan", "", "bo", "bod"),
- ("Tigrinya", "", "ti", "tir"),
- ("Tonga (Tonga Islands)", "", "to", "ton"),
- ("Tswana", "", "tn", "tsn"),
- ("Tsonga", "", "ts", "tso"),
- ("Turkmen", "", "tk", "tuk"),
- ("Turkish", "", "tr", "tur"),
- ("Twi", "", "tw", "twi"),
- ("Uighur", "", "ug", "uig"),
- ("Ukrainian", "", "uk", "ukr"),
- ("Urdu", "", "ur", "urd"),
- ("Uzbek", "", "uz", "uzb"),
- ("Venda", "", "ve", "ven"),
- ("Vietnamese", "", "vi", "vie"),
- ("Volapük", "", "vo", "vol"),
- ("Welsh", "", "cy", "cym"),
- ("Walloon", "", "wa", "wln"),
- ("Wolof", "", "wo", "wol"),
- ("Xhosa", "", "xh", "xho"),
- ("Yiddish", "", "yi", "yid"),
- ("Yoruba", "", "yo", "yor"),
- ("Zhuang", "", "za", "zha"),
- ("Zulu", "", "zu", "zul"),
+ ("Unknown", "", "", "und"),
+ ("Afar", "", "aa", "aar"),
+ ("Abkhazian", "", "ab", "abk"),
+ ("Afrikaans", "", "af", "afr"),
+ ("Akan", "", "ak", "aka"),
+ ("Albanian", "", "sq", "sqi"),
+ ("Amharic", "", "am", "amh"),
+ ("Arabic", "", "ar", "ara"),
+ ("Aragonese", "", "an", "arg"),
+ ("Armenian", "", "hy", "hye"),
+ ("Assamese", "", "as", "asm"),
+ ("Avaric", "", "av", "ava"),
+ ("Avestan", "", "ae", "ave"),
+ ("Aymara", "", "ay", "aym"),
+ ("Azerbaijani", "", "az", "aze"),
+ ("Bashkir", "", "ba", "bak"),
+ ("Bambara", "", "bm", "bam"),
+ ("Basque", "", "eu", "eus"),
+ ("Belarusian", "", "be", "bel"),
+ ("Bengali", "", "bn", "ben"),
+ ("Bihari", "", "bh", "bih"),
+ ("Bislama", "", "bi", "bis"),
+ ("Bosnian", "", "bs", "bos"),
+ ("Breton", "", "br", "bre"),
+ ("Bulgarian", "", "bg", "bul"),
+ ("Burmese", "", "my", "mya"),
+ ("Catalan", "", "ca", "cat"),
+ ("Chamorro", "", "ch", "cha"),
+ ("Chechen", "", "ce", "che"),
+ ("Chinese", "", "zh", "zho"),
+ ("Church Slavic", "", "cu", "chu"),
+ ("Chuvash", "", "cv", "chv"),
+ ("Cornish", "", "kw", "cor"),
+ ("Corsican", "", "co", "cos"),
+ ("Cree", "", "cr", "cre"),
+ ("Czech", "", "cs", "ces"),
+ ("Danish", "Dansk", "da", "dan"),
+ ("Divehi", "", "dv", "div"),
+ ("Dutch", "Nederlands", "nl", "nld"),
+ ("Dzongkha", "", "dz", "dzo"),
+ ("English", "English", "en", "eng"),
+ ("Esperanto", "", "eo", "epo"),
+ ("Estonian", "", "et", "est"),
+ ("Ewe", "", "ee", "ewe"),
+ ("Faroese", "", "fo", "fao"),
+ ("Fijian", "", "fj", "fij"),
+ ("Finnish", "Suomi", "fi", "fin"),
+ ("French", "Francais", "fr", "fra"),
+ ("Western Frisian", "", "fy", "fry"),
+ ("Fulah", "", "ff", "ful"),
+ ("Georgian", "", "ka", "kat"),
+ ("German", "Deutsch", "de", "deu"),
+ ("Gaelic (Scots)", "", "gd", "gla"),
+ ("Irish", "", "ga", "gle"),
+ ("Galician", "", "gl", "glg"),
+ ("Manx", "", "gv", "glv"),
+ ("Greek, Modern", "", "el", "ell"),
+ ("Guarani", "", "gn", "grn"),
+ ("Gujarati", "", "gu", "guj"),
+ ("Haitian", "", "ht", "hat"),
+ ("Hausa", "", "ha", "hau"),
+ ("Hebrew", "", "he", "heb"),
+ ("Herero", "", "hz", "her"),
+ ("Hindi", "", "hi", "hin"),
+ ("Hiri Motu", "", "ho", "hmo"),
+ ("Hungarian", "Magyar", "hu", "hun"),
+ ("Igbo", "", "ig", "ibo"),
+ ("Icelandic", "Islenska", "is", "isl"),
+ ("Ido", "", "io", "ido"),
+ ("Sichuan Yi", "", "ii", "iii"),
+ ("Inuktitut", "", "iu", "iku"),
+ ("Interlingue", "", "ie", "ile"),
+ ("Interlingua", "", "ia", "ina"),
+ ("Indonesian", "", "id", "ind"),
+ ("Inupiaq", "", "ik", "ipk"),
+ ("Italian", "Italiano", "it", "ita"),
+ ("Javanese", "", "jv", "jav"),
+ ("Japanese", "", "ja", "jpn"),
+ ("Kalaallisut (Greenlandic)", "", "kl", "kal"),
+ ("Kannada", "", "kn", "kan"),
+ ("Kashmiri", "", "ks", "kas"),
+ ("Kanuri", "", "kr", "kau"),
+ ("Kazakh", "", "kk", "kaz"),
+ ("Central Khmer", "", "km", "khm"),
+ ("Kikuyu", "", "ki", "kik"),
+ ("Kinyarwanda", "", "rw", "kin"),
+ ("Kirghiz", "", "ky", "kir"),
+ ("Komi", "", "kv", "kom"),
+ ("Kongo", "", "kg", "kon"),
+ ("Korean", "", "ko", "kor"),
+ ("Kuanyama", "", "kj", "kua"),
+ ("Kurdish", "", "ku", "kur"),
+ ("Lao", "", "lo", "lao"),
+ ("Latin", "", "la", "lat"),
+ ("Latvian", "", "lv", "lav"),
+ ("Limburgan", "", "li", "lim"),
+ ("Lingala", "", "ln", "lin"),
+ ("Lithuanian", "", "lt", "lit"),
+ ("Luxembourgish", "", "lb", "ltz"),
+ ("Luba-Katanga", "", "lu", "lub"),
+ ("Ganda", "", "lg", "lug"),
+ ("Macedonian", "", "mk", "mkd"),
+ ("Marshallese", "", "mh", "mah"),
+ ("Malayalam", "", "ml", "mal"),
+ ("Maori", "", "mi", "mri"),
+ ("Marathi", "", "mr", "mar"),
+ ("Malay", "", "ms", "msa"),
+ ("Malagasy", "", "mg", "mlg"),
+ ("Maltese", "", "mt", "mlt"),
+ ("Moldavian", "", "mo", "mol"),
+ ("Mongolian", "", "mn", "mon"),
+ ("Nauru", "", "na", "nau"),
+ ("Navajo", "", "nv", "nav"),
+ ("Ndebele, South", "", "nr", "nbl"),
+ ("Ndebele, North", "", "nd", "nde"),
+ ("Ndonga", "", "ng", "ndo"),
+ ("Nepali", "", "ne", "nep"),
+ ("Norwegian Nynorsk", "", "nn", "nno"),
+ ("Norwegian Bokmål", "", "nb", "nob"),
+ ("Norwegian", "Norsk", "no", "nor"),
+ ("Chichewa; Nyanja", "", "ny", "nya"),
+ ("Occitan (post 1500); Provençal", "", "oc", "oci"),
+ ("Ojibwa", "", "oj", "oji"),
+ ("Oriya", "", "or", "ori"),
+ ("Oromo", "", "om", "orm"),
+ ("Ossetian; Ossetic", "", "os", "oss"),
+ ("Panjabi", "", "pa", "pan"),
+ ("Persian", "", "fa", "fas"),
+ ("Pali", "", "pi", "pli"),
+ ("Polish", "", "pl", "pol"),
+ ("Portuguese", "Portugues", "pt", "por"),
+ ("Pushto", "", "ps", "pus"),
+ ("Quechua", "", "qu", "que"),
+ ("Romansh", "", "rm", "roh"),
+ ("Romanian", "", "ro", "ron"),
+ ("Rundi", "", "rn", "run"),
+ ("Russian", "", "ru", "rus"),
+ ("Sango", "", "sg", "sag"),
+ ("Sanskrit", "", "sa", "san"),
+ ("Serbian", "", "sr", "srp"),
+ ("Croatian", "Hrvatski", "hr", "hrv"),
+ ("Sinhala", "", "si", "sin"),
+ ("Slovak", "", "sk", "slk"),
+ ("Slovenian", "", "sl", "slv"),
+ ("Northern Sami", "", "se", "sme"),
+ ("Samoan", "", "sm", "smo"),
+ ("Shona", "", "sn", "sna"),
+ ("Sindhi", "", "sd", "snd"),
+ ("Somali", "", "so", "som"),
+ ("Sotho, Southern", "", "st", "sot"),
+ ("Spanish", "Espanol", "es", "spa"),
+ ("Sardinian", "", "sc", "srd"),
+ ("Swati", "", "ss", "ssw"),
+ ("Sundanese", "", "su", "sun"),
+ ("Swahili", "", "sw", "swa"),
+ ("Swedish", "Svenska", "sv", "swe"),
+ ("Tahitian", "", "ty", "tah"),
+ ("Tamil", "", "ta", "tam"),
+ ("Tatar", "", "tt", "tat"),
+ ("Telugu", "", "te", "tel"),
+ ("Tajik", "", "tg", "tgk"),
+ ("Tagalog", "", "tl", "tgl"),
+ ("Thai", "", "th", "tha"),
+ ("Tibetan", "", "bo", "bod"),
+ ("Tigrinya", "", "ti", "tir"),
+ ("Tonga (Tonga Islands)", "", "to", "ton"),
+ ("Tswana", "", "tn", "tsn"),
+ ("Tsonga", "", "ts", "tso"),
+ ("Turkmen", "", "tk", "tuk"),
+ ("Turkish", "", "tr", "tur"),
+ ("Twi", "", "tw", "twi"),
+ ("Uighur", "", "ug", "uig"),
+ ("Ukrainian", "", "uk", "ukr"),
+ ("Urdu", "", "ur", "urd"),
+ ("Uzbek", "", "uz", "uzb"),
+ ("Venda", "", "ve", "ven"),
+ ("Vietnamese", "", "vi", "vie"),
+ ("Volapük", "", "vo", "vol"),
+ ("Welsh", "", "cy", "cym"),
+ ("Walloon", "", "wa", "wln"),
+ ("Wolof", "", "wo", "wol"),
+ ("Xhosa", "", "xh", "xho"),
+ ("Yiddish", "", "yi", "yid"),
+ ("Yoruba", "", "yo", "yor"),
+ ("Zhuang", "", "za", "zha"),
+ ("Zulu", "", "zu", "zul"),
]
def codeToLang(code):
- code = code.lower()
- if len(code) == 2:
- for l in _iso639_languages:
- if l[2] == code:
- return l[0]
- elif len(code) == 3:
- for l in _iso639_languages:
- if l[3] == code:
- return l[0]
- return None
+ code = code.lower()
+ if len(code) == 2:
+ for l in _iso639_languages:
+ if l[2] == code:
+ return l[0]
+ elif len(code) == 3:
+ for l in _iso639_languages:
+ if l[3] == code:
+ return l[0]
+ return None
def langTo3Code(lang):
- lang = englishName(lang)
- if lang:
- lang=lang.lower()
- for l in _iso639_languages:
- if l[0].lower() == lang:
- return l[3]
- return None
+ lang = englishName(lang)
+ if lang:
+ lang=lang.lower()
+ for l in _iso639_languages:
+ if l[0].lower() == lang:
+ return l[3]
+ return None
def langTo2Code(lang):
- lang = englishName(lang)
- if lang:
- lang=lang.lower()
- for l in _iso639_languages:
- if l[0].lower() == lang:
- return l[2]
- return None
+ lang = englishName(lang)
+ if lang:
+ lang=lang.lower()
+ for l in _iso639_languages:
+ if l[0].lower() == lang:
+ return l[2]
+ return None
def langCode2To3(code):
- langTo3Code(codeToLang(code))
+ return langTo3Code(codeToLang(code))
def langCode3To2(code):
- langTo2Code(codeToLang(code))
+ return langTo2Code(codeToLang(code))
def englishName(lang):
- lang = lang.lower()
- for l in _iso639_languages:
- if l[1].lower() == lang:
- return l[0]
- return None
+ lang = lang.lower()
+ for l in _iso639_languages:
+ if l[1].lower() == lang:
+ return l[0]
+ return None
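A sketch of the lookup direction each helper expects; note that langTo3Code/langTo2Code resolve their input through englishName(), i.e. the native-name column of the table:

    from oxutils import lang

    print lang.codeToLang('de')        # 'German'
    print lang.codeToLang('deu')       # 'German'
    print lang.langTo3Code('Deutsch')  # 'deu'
    print lang.langTo2Code('Svenska')  # 'sv'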
diff --git a/oxutils/net.py b/oxutils/net.py
index 8c57151..f07b8d0 100644
--- a/oxutils/net.py
+++ b/oxutils/net.py
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
+# vi:si:et:sw=4:sts=4:ts=4
import gzip
import StringIO
import urllib
@@ -10,64 +10,64 @@ from chardet.universaldetector import UniversalDetector
# Default headers for HTTP requests.
DEFAULT_HEADERS = {
- 'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9) Gecko/2008061015 Firefox/3.0',
- 'Accept-Encoding': 'gzip'
+ 'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9) Gecko/2008061015 Firefox/3.0',
+ 'Accept-Encoding': 'gzip'
}
def status(url, data=None, headers=DEFAULT_HEADERS):
- try:
- f = openUrl(url, data, headers)
- s = f.code
- except urllib2.HTTPError, e:
- s = e.code
- return s
+ try:
+ f = openUrl(url, data, headers)
+ s = f.code
+ except urllib2.HTTPError, e:
+ s = e.code
+ return s
def exists(url, data=None, headers=DEFAULT_HEADERS):
- s = status(url, data, headers)
- if s >= 200 and s < 400:
- return True
- return False
+ s = status(url, data, headers)
+ if s >= 200 and s < 400:
+ return True
+ return False
def getHeaders(url, data=None, headers=DEFAULT_HEADERS):
- try:
- f = openUrl(url, data, headers)
- f.headers['Status'] = "%s" % f.code
- headers = f.headers
- f.close()
- except urllib2.HTTPError, e:
- e.headers['Status'] = "%s" % e.code
- headers = e.headers
- return dict(headers)
+ try:
+ f = openUrl(url, data, headers)
+ f.headers['Status'] = "%s" % f.code
+ headers = f.headers
+ f.close()
+ except urllib2.HTTPError, e:
+ e.headers['Status'] = "%s" % e.code
+ headers = e.headers
+ return dict(headers)
def openUrl(url, data=None, headers=DEFAULT_HEADERS):
- url = url.replace(' ', '%20')
- req = urllib2.Request(url, data, headers)
- return urllib2.urlopen(req)
+ url = url.replace(' ', '%20')
+ req = urllib2.Request(url, data, headers)
+ return urllib2.urlopen(req)
def getUrl(url, data=None, headers=DEFAULT_HEADERS, returnHeaders=False):
- f = openUrl(url, data, headers)
- data = f.read()
- f.close()
- if f.headers.get('content-encoding', None) == 'gzip':
- data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
- if returnHeaders:
- f.headers['Status'] = "%s" % f.code
- return dict(f.headers), data
- return data
+ f = openUrl(url, data, headers)
+ data = f.read()
+ f.close()
+ if f.headers.get('content-encoding', None) == 'gzip':
+ data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
+ if returnHeaders:
+ f.headers['Status'] = "%s" % f.code
+ return dict(f.headers), data
+ return data
def getUrlUnicode(url):
- data = getUrl(url)
- encoding = getEncoding(data)
- if not encoding:
- encoding = 'latin-1'
- return unicode(data, encoding)
+ data = getUrl(url)
+ encoding = getEncoding(data)
+ if not encoding:
+ encoding = 'latin-1'
+ return unicode(data, encoding)
def getEncoding(data):
- detector = UniversalDetector()
- for line in data.split('\n'):
- detector.feed(line)
- if detector.done:
- break
- detector.close()
- return detector.result['encoding']
+ detector = UniversalDetector()
+ for line in data.split('\n'):
+ detector.feed(line)
+ if detector.done:
+ break
+ detector.close()
+ return detector.result['encoding']
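The uncached counterparts in net.py work the same way; a brief sketch (requires network access, URL illustrative):

    from oxutils import net

    headers = net.getHeaders('http://google.com')
    print headers['Status']                        # e.g. '200'
    body = net.getUrl('http://google.com')         # gunzipped if needed
    text = net.getUrlUnicode('http://google.com')  # decoded via chardet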
diff --git a/oxutils/normalize.py b/oxutils/normalize.py
index 9d63139..4df4405 100644
--- a/oxutils/normalize.py
+++ b/oxutils/normalize.py
@@ -1,79 +1,79 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
+# vi:si:et:sw=4:sts=4:ts=4
import re
_articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el',
- "l'", 'il', 'das', 'les', 'o', 'ein', 'i', 'un', 'los', 'de',
- 'an', 'una', 'las', 'eine', 'den', 'gli', 'het', 'os', 'lo',
- 'az', 'det', 'ha-', 'een', 'ang', 'oi', 'ta', 'al-', 'dem',
- 'mga', 'uno', "un'", 'ett', u'\xcf', 'eines', u'\xc7', 'els',
- u'\xd4\xef', u'\xcf\xe9')
+ "l'", 'il', 'das', 'les', 'o', 'ein', 'i', 'un', 'los', 'de',
+ 'an', 'una', 'las', 'eine', 'den', 'gli', 'het', 'os', 'lo',
+ 'az', 'det', 'ha-', 'een', 'ang', 'oi', 'ta', 'al-', 'dem',
+ 'mga', 'uno', "un'", 'ett', u'\xcf', 'eines', u'\xc7', 'els',
+ u'\xd4\xef', u'\xcf\xe9')
# Articles in a dictionary.
_articlesDict = dict([(x, x) for x in _articles])
_spArticles = []
for article in _articles:
- if article[-1] not in ("'", '-'): article += ' '
- _spArticles.append(article)
+ if article[-1] not in ("'", '-'): article += ' '
+ _spArticles.append(article)
def canonicalTitle(title):
- """Return the title in the canonic format 'Movie Title, The'.
-
- >>> canonicalTitle('The Movie Title')
- 'Movie Title, The'
- """
- try:
- if _articlesDict.has_key(title.split(', ')[-1].lower()): return title
- except IndexError: pass
- ltitle = title.lower()
- for article in _spArticles:
- if ltitle.startswith(article):
- lart = len(article)
- title = '%s, %s' % (title[lart:], title[:lart])
- if article[-1] == ' ': title = title[:-1]
- break
- ## XXX: an attempt using a dictionary lookup.
- ##for artSeparator in (' ', "'", '-'):
- ## article = _articlesDict.get(ltitle.split(artSeparator)[0])
- ## if article is not None:
- ## lart = len(article)
- ## # check titles like "una", "I'm Mad" and "L'abbacchio".
- ## if title[lart:] == '' or (artSeparator != ' ' and
- ## title[lart:][1] != artSeparator): continue
- ## title = '%s, %s' % (title[lart:], title[:lart])
- ## if artSeparator == ' ': title = title[1:]
- ## break
- return title
+ """Return the title in the canonic format 'Movie Title, The'.
+
+ >>> canonicalTitle('The Movie Title')
+ 'Movie Title, The'
+ """
+ try:
+ if _articlesDict.has_key(title.split(', ')[-1].lower()): return title
+ except IndexError: pass
+ ltitle = title.lower()
+ for article in _spArticles:
+ if ltitle.startswith(article):
+ lart = len(article)
+ title = '%s, %s' % (title[lart:], title[:lart])
+ if article[-1] == ' ': title = title[:-1]
+ break
+ ## XXX: an attempt using a dictionary lookup.
+ ##for artSeparator in (' ', "'", '-'):
+ ## article = _articlesDict.get(ltitle.split(artSeparator)[0])
+ ## if article is not None:
+ ## lart = len(article)
+ ## # check titles like "una", "I'm Mad" and "L'abbacchio".
+ ## if title[lart:] == '' or (artSeparator != ' ' and
+ ## title[lart:][1] != artSeparator): continue
+ ## title = '%s, %s' % (title[lart:], title[:lart])
+ ## if artSeparator == ' ': title = title[1:]
+ ## break
+ return title
def normalizeTitle(title):
- """Return the title in the normal "The Title" format.
+ """Return the title in the normal "The Title" format.
- >>> normalizeTitle('Movie Title, The')
- 'The Movie Title'
- """
- stitle = title.split(', ')
- if len(stitle) > 1 and _articlesDict.has_key(stitle[-1].lower()):
- sep = ' '
- if stitle[-1][-1] in ("'", '-'): sep = ''
- title = '%s%s%s' % (stitle[-1], sep, ', '.join(stitle[:-1]))
- return title
+ >>> normalizeTitle('Movie Title, The')
+ 'The Movie Title'
+ """
+ stitle = title.split(', ')
+ if len(stitle) > 1 and _articlesDict.has_key(stitle[-1].lower()):
+ sep = ' '
+ if stitle[-1][-1] in ("'", '-'): sep = ''
+ title = '%s%s%s' % (stitle[-1], sep, ', '.join(stitle[:-1]))
+ return title
def normalizeImdbId(imdbId):
- """Return 7 digit imdbId.
+ """Return 7 digit imdbId.
- >>> normalizeImdbId('http://www.imdb.com/title/tt0159206/')
- '0159206'
- >>> normalizeImdbId(159206)
- '0159206'
- >>> normalizeImdbId('tt0159206')
- '0159206'
- """
- if isinstance(imdbId, basestring):
- imdbId = re.sub('.*(\d{7}).*', '\\1', imdbId)
- elif isinstance(imdbId, int):
- imdbId = "%07d" % imdbId
- return imdbId
+ >>> normalizeImdbId('http://www.imdb.com/title/tt0159206/')
+ '0159206'
+ >>> normalizeImdbId(159206)
+ '0159206'
+ >>> normalizeImdbId('tt0159206')
+ '0159206'
+ """
+ if isinstance(imdbId, basestring):
+ imdbId = re.sub('.*(\d{7}).*', '\\1', imdbId)
+ elif isinstance(imdbId, int):
+ imdbId = "%07d" % imdbId
+ return imdbId
# Common suffixes in surnames.
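The two title helpers are inverses of each other; a brief sketch:

    from oxutils import normalize

    print normalize.canonicalTitle('The Movie Title')   # 'Movie Title, The'
    print normalize.normalizeTitle('Movie Title, The')  # 'The Movie Title'
    print normalize.normalizeImdbId('tt0159206')        # '0159206'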
diff --git a/oxutils/text.py b/oxutils/text.py
index a7fa7b5..768db8b 100644
--- a/oxutils/text.py
+++ b/oxutils/text.py
@@ -1,216 +1,216 @@
# -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
+# vi:si:et:sw=4:sts=4:ts=4
# GPL written 2008 by j@pad.ma
import re
def findRe(string, regexp):
- result = re.compile(regexp, re.DOTALL).findall(string)
- if result:
- return result[0].strip()
- return ''
+ result = re.compile(regexp, re.DOTALL).findall(string)
+ if result:
+ return result[0].strip()
+ return ''
def findString(string, string0='', string1 = ''):
- """Return the string between string0 and string1.
+ """Return the string between string0 and string1.
- If string0 or string1 is left out, begining or end of string is used.
+ If string0 or string1 is left out, beginning or end of string is used.
- >>> findString('i am not there', string1=' not there')
- 'i am'
+ >>> findString('i am not there', string1=' not there')
+ 'i am'
- >>> findString('i am not there', 'i am ', ' there')
- 'not'
+ >>> findString('i am not there', 'i am ', ' there')
+ 'not'
- >>> findString('i am not there', 'i am not t')
- 'here'
+ >>> findString('i am not there', 'i am not t')
+ 'here'
- """
- if string0:
- string0 = re.escape(string0)
- else:
- string0 = '^'
- if string1:
- string1 = re.escape(string1)
- else:
- string1 = '$'
- return findRegexp(string, string0 + '(.*?)' + string1)
+ """
+ if string0:
+ string0 = re.escape(string0)
+ else:
+ string0 = '^'
+ if string1:
+ string1 = re.escape(string1)
+ else:
+ string1 = '$'
+ return findRe(string, string0 + '(.*?)' + string1)
# Capitalizes the first letter of a string.
capfirst = lambda x: x and x[0].upper() + x[1:]
def removeSpecialCharacters(text):
- """
- Removes special characters inserted by Word.
- """
- text = text.replace(u'\u2013', '-')
- text = text.replace(u'\u2026O', "'")
- text = text.replace(u'\u2019', "'")
- text = text.replace(u'', "'")
- text = text.replace(u'', "'")
- text = text.replace(u'', "-")
- return text
+ """
+ Removes special characters inserted by Word.
+ """
+ text = text.replace(u'\u2013', '-')
+ text = text.replace(u'\u2026O', "'")
+ text = text.replace(u'\u2019', "'")
+ text = text.replace(u'', "'")
+ text = text.replace(u'', "'")
+ text = text.replace(u'', "-")
+ return text
def wrap(text, width):
- """
- A word-wrap function that preserves existing line breaks and most spaces in
- the text. Expects that existing line breaks are posix newlines (\n).
- See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
- """
- return reduce(lambda line, word, width=width: '%s%s%s' %
- (line,
- ' \n'[(len(line[line.rfind('\n')+1:])
- + len(word.split('\n',1)[0]
- ) >= width)],
- word),
- text.split(' ')
- )
+ """
+ A word-wrap function that preserves existing line breaks and most spaces in
+ the text. Expects that existing line breaks are posix newlines (\n).
+ See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
+ """
+ return reduce(lambda line, word, width=width: '%s%s%s' %
+ (line,
+ ' \n'[(len(line[line.rfind('\n')+1:])
+ + len(word.split('\n',1)[0]
+ ) >= width)],
+ word),
+ text.split(' ')
+ )
def truncateString(s, num):
- """Truncates a string after a certain number of chacters, but ends with a word
+ """Truncates a string after a certain number of chacters, but ends with a word
- >>> truncateString('Truncates a string after a certain number of chacters, but ends with a word', 23)
- 'Truncates a string...'
- >>> truncateString('Truncates a string', 23)
- 'Truncates a string'
+ >>> truncateString('Truncates a string after a certain number of characters, but ends with a word', 23)
+ 'Truncates a string...'
+ >>> truncateString('Truncates a string', 23)
+ 'Truncates a string'
- """
- length = int(num)
- if len(s) <= length:
- return s
- words = s.split()
- ts = ""
- while words and len(ts) + len(words[0]) < length:
- ts += " " + words.pop(0)
- if words:
- ts += "..."
- return ts.strip()
+ """
+ length = int(num)
+ if len(s) <= length:
+ return s
+ words = s.split()
+ ts = ""
+ while words and len(ts) + len(words[0]) < length:
+ ts += " " + words.pop(0)
+ if words:
+ ts += "..."
+ return ts.strip()
def trimString(string, num):
- """Truncates a string after a certain number of chacters, adding ... at -10 characters
+ """Truncates a string after a certain number of chacters, adding ... at -10 characters
- >>> trimString('Truncates a string after a certain number of chacters', 23)
- 'Truncates ...f chacters'
- >>> trimString('Truncates a string', 23)
- 'Truncates a string'
- """
- if len(string) > num:
- string = string[:num - 13] + '...' + string[-10:]
- return string
+ >>> trimString('Truncates a string after a certain number of characters', 23)
+ 'Truncates ...characters'
+ >>> trimString('Truncates a string', 23)
+ 'Truncates a string'
+ """
+ if len(string) > num:
+ string = string[:num - 13] + '...' + string[-10:]
+ return string
def truncateWords(s, num):
- "Truncates a string after a certain number of words."
- length = int(num)
- words = s.split()
- if len(words) > length:
- words = words[:length]
- if not words[-1].endswith('...'):
- words.append('...')
- return ' '.join(words)
+ "Truncates a string after a certain number of words."
+ length = int(num)
+ words = s.split()
+ if len(words) > length:
+ words = words[:length]
+ if not words[-1].endswith('...'):
+ words.append('...')
+ return ' '.join(words)
def getValidFilename(s):
- """
- Returns the given string converted to a string that can be used for a clean
- filename. Specifically, leading and trailing spaces are removed;
- all non-filename-safe characters are removed.
+ """
+ Returns the given string converted to a string that can be used for a clean
+ filename. Specifically, leading and trailing spaces are removed;
+ all non-filename-safe characters are removed.
- >>> getValidFilename("john's portrait in 2004.jpg")
- 'john_s_portrait_in_2004.jpg'
- """
- s = s.strip()
- s = s.replace(' ', '_')
- s = re.sub(r'[^-A-Za-z0-9_.\[\]\ ]', '_', s)
- s = s.replace('__', '_').replace('__', '_')
- return s
+ >>> getValidFilename("john's portrait in 2004.jpg")
+ 'john_s_portrait_in_2004.jpg'
+ """
+ s = s.strip()
+ s = s.replace(' ', '_')
+ s = re.sub(r'[^-A-Za-z0-9_.\[\]\ ]', '_', s)
+ s = s.replace('__', '_').replace('__', '_')
+ return s
def getTextList(list_, last_word='or'):
- """
- >>> getTextList(['a', 'b', 'c', 'd'])
- 'a, b, c or d'
- >>> getTextList(['a', 'b', 'c'], 'and')
- 'a, b and c'
- >>> getTextList(['a', 'b'], 'and')
- 'a and b'
- >>> getTextList(['a'])
- 'a'
- >>> getTextList([])
- ''
- """
- if len(list_) == 0: return ''
- if len(list_) == 1: return list_[0]
- return '%s %s %s' % (', '.join([str(i) for i in list_][:-1]), last_word, list_[-1])
+ """
+ >>> getTextList(['a', 'b', 'c', 'd'])
+ 'a, b, c or d'
+ >>> getTextList(['a', 'b', 'c'], 'and')
+ 'a, b and c'
+ >>> getTextList(['a', 'b'], 'and')
+ 'a and b'
+ >>> getTextList(['a'])
+ 'a'
+ >>> getTextList([])
+ ''
+ """
+ if len(list_) == 0: return ''
+ if len(list_) == 1: return list_[0]
+ return '%s %s %s' % (', '.join([str(i) for i in list_][:-1]), last_word, list_[-1])
def getListText(text, last_word='or'):
- """
- >>> getListText('a, b, c or d')
- ['a', 'b', 'c', 'd']
- >>> getListText('a, b and c', 'and')
- ['a', 'b', 'c']
- >>> getListText('a and b', 'and')
- ['a', 'b']
- >>> getListText('a')
- ['a']
- >>> getListText('')
- []
- """
- list_ = []
- if text:
- list_ = text.split(', ')
- if list_:
- i=len(list_)-1
- last = list_[i].split(last_word)
- if len(last) == 2:
- list_[i] = last[0].strip()
- list_.append(last[1].strip())
- return list_
+ """
+ >>> getListText('a, b, c or d')
+ ['a', 'b', 'c', 'd']
+ >>> getListText('a, b and c', 'and')
+ ['a', 'b', 'c']
+ >>> getListText('a and b', 'and')
+ ['a', 'b']
+ >>> getListText('a')
+ ['a']
+ >>> getListText('')
+ []
+ """
+ list_ = []
+ if text:
+ list_ = text.split(', ')
+ if list_:
+ i=len(list_)-1
+ last = list_[i].split(last_word)
+ if len(last) == 2:
+ list_[i] = last[0].strip()
+ list_.append(last[1].strip())
+ return list_
def normalizeNewlines(text):
- return re.sub(r'\r\n|\r|\n', '\n', text)
+ return re.sub(r'\r\n|\r|\n', '\n', text)
def recapitalize(text):
- "Recapitalizes text, placing caps after end-of-sentence punctuation."
-# capwords = ()
- text = text.lower()
- capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
- text = capsRE.sub(lambda x: x.group(1).upper(), text)
-# for capword in capwords:
-# capwordRE = re.compile(r'\b%s\b' % capword, re.I)
-# text = capwordRE.sub(capword, text)
- return text
+ "Recapitalizes text, placing caps after end-of-sentence punctuation."
+ #capwords = ()
+ text = text.lower()
+ capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
+ text = capsRE.sub(lambda x: x.group(1).upper(), text)
+ #for capword in capwords:
+ # capwordRE = re.compile(r'\b%s\b' % capword, re.I)
+ # text = capwordRE.sub(capword, text)
+ return text
def phone2numeric(phone):
- "Converts a phone number with letters into its numeric equivalent."
- letters = re.compile(r'[A-PR-Y]', re.I)
- char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3',
- 'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
- 'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
- 's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
- 'y': '9', 'x': '9'}.get(m.group(0).lower())
- return letters.sub(char2number, phone)
+ "Converts a phone number with letters into its numeric equivalent."
+ letters = re.compile(r'[A-PR-Y]', re.I)
+ char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3',
+ 'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
+ 'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
+ 's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
+ 'y': '9', 'x': '9'}.get(m.group(0).lower())
+ return letters.sub(char2number, phone)
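
Note: the character class [A-PR-Y] deliberately skips Q and Z, which classic
phone keypads did not carry. For example:

>>> phone2numeric('1-800-FLOWERS')
'1-800-3569377'
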
def compressString(s):
- import cStringIO, gzip
- zbuf = cStringIO.StringIO()
- zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
- zfile.write(s)
- zfile.close()
- return zbuf.getvalue()
+ import cStringIO, gzip
+ zbuf = cStringIO.StringIO()
+ zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
+ zfile.write(s)
+ zfile.close()
+ return zbuf.getvalue()
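
Note: the module offers no inverse for compressString; a minimal counterpart
for round-trip testing could look like this (decompressString is a
hypothetical name, not part of oxutils):

import cStringIO, gzip

def decompressString(s):
    # read the gzip stream back out of an in-memory buffer
    zbuf = cStringIO.StringIO(s)
    zfile = gzip.GzipFile(mode='rb', fileobj=zbuf)
    try:
        return zfile.read()
    finally:
        zfile.close()
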
smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
def smartSplit(text):
- """
- Generator that splits a string by spaces, leaving quoted phrases together.
- Supports both single and double quotes, and supports escaping quotes with
- backslashes. In the output, strings will keep their initial and trailing
- quote marks.
- >>> list(smartSplit('This is "a person\\'s" test.'))
- ['This', 'is', '"a person\\'s"', 'test.']
- """
- for bit in smart_split_re.finditer(text):
- bit = bit.group(0)
- if bit[0] == '"':
- yield '"' + bit[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"'
- elif bit[0] == "'":
- yield "'" + bit[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'"
- else:
- yield bit
+ """
+ Generator that splits a string by spaces, leaving quoted phrases together.
+ Supports both single and double quotes, and supports escaping quotes with
+ backslashes. In the output, strings will keep their initial and trailing
+ quote marks.
+ >>> list(smartSplit('This is "a person\\'s" test.'))
+ ['This', 'is', '"a person\\'s"', 'test.']
+ """
+ for bit in smart_split_re.finditer(text):
+ bit = bit.group(0)
+ if bit[0] == '"':
+ yield '"' + bit[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"'
+ elif bit[0] == "'":
+ yield "'" + bit[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'"
+ else:
+ yield bit
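
Note: smartSplit keeps the surrounding quote marks but unescapes the content
in between:

>>> list(smartSplit('say "hello world" twice'))
['say', '"hello world"', 'twice']
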
diff --git a/oxutils/torrent.py b/oxutils/torrent.py
index fe452fa..6f53d5b 100644
--- a/oxutils/torrent.py
+++ b/oxutils/torrent.py
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
+# vi:si:et:sw=4:sts=4:ts=4
# Written 2007 by j@mailb.org
from threading import Event
@@ -11,50 +12,50 @@ from BitTornado.bencode import bencode, bdecode
def createTorrent(file, url, params = {}, flag = Event(),
progress = lambda x: None, progress_percent = 1):
- "Creates a torrent for a given file, using url as tracker url"
- return make_meta_file(file, url, params, flag, progress, progress_percent)
+ "Creates a torrent for a given file, using url as tracker url"
+ return make_meta_file(file, url, params, flag, progress, progress_percent)
def getInfoHash(torrentFile):
- "Returns Torrent Info Hash from torrent file"
- metainfo_file = open(torrentFile, 'rb')
- metainfo = bdecode(metainfo_file.read())
- info = metainfo['info']
- return sha.sha(bencode(info)).hexdigest().upper()
+ "Returns Torrent Info Hash from torrent file"
+ metainfo_file = open(torrentFile, 'rb')
+ metainfo = bdecode(metainfo_file.read())
+ info = metainfo['info']
+ return sha.sha(bencode(info)).hexdigest().upper()
def getTorrentInfoFromFile(torrentFile):
- f = open(torrentFile, 'rb')
- data = f.read()
- f.close()
- tinfo = getTorrentInfo(data)
- tinfo['timestamp'] = stat(torrentFile).st_ctime
- return tinfo
+ f = open(torrentFile, 'rb')
+ data = f.read()
+ f.close()
+ tinfo = getTorrentInfo(data)
+ tinfo['timestamp'] = stat(torrentFile).st_ctime
+ return tinfo
def getTorrentInfo(data):
- "Returns Torrent Info from torrent file"
- tinfo = {}
- metainfo = bdecode(data)
- info = metainfo['info']
- piece_length = info['piece length']
- if info.has_key('length'):
- # let's assume we just have one file
- file_length = info['length']
- else:
- # let's assume we have a directory structure
- file_length = 0;
- for f in info['files']:
- file_length += f['length']
- for key in info:
- if key != 'pieces':
- tinfo[key] = info[key]
- for key in metainfo:
- if key != 'info':
- tinfo[key] = metainfo[key]
- tinfo['size'] = file_length
- tinfo['hash'] = sha.sha(bencode(info)).hexdigest()
- tinfo['announce'] = metainfo['announce']
- return tinfo
+ "Returns Torrent Info from torrent file"
+ tinfo = {}
+ metainfo = bdecode(data)
+ info = metainfo['info']
+ piece_length = info['piece length']
+ if info.has_key('length'):
+ # let's assume we just have one file
+ file_length = info['length']
+ else:
+ # let's assume we have a directory structure
+        file_length = 0
+ for f in info['files']:
+ file_length += f['length']
+ for key in info:
+ if key != 'pieces':
+ tinfo[key] = info[key]
+ for key in metainfo:
+ if key != 'info':
+ tinfo[key] = metainfo[key]
+ tinfo['size'] = file_length
+ tinfo['hash'] = sha.sha(bencode(info)).hexdigest()
+ tinfo['announce'] = metainfo['announce']
+ return tinfo
def getTorrentSize(torrentFile):
- "Returns Size of files in torrent file in bytes"
- return getTorrentInfo(torrentFile)['size']
+ "Returns Size of files in torrent file in bytes"
+ return getTorrentInfo(torrentFile)['size']
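
Note: a sketch of typical usage of the torrent helpers, assuming BitTornado is
installed; the file name and tracker URL are made up, and the .torrent path
relies on make_meta_file's default of appending '.torrent' to the input name:

from oxutils.torrent import createTorrent, getTorrentInfoFromFile

createTorrent('example.avi', 'http://tracker.example.org/announce')
info = getTorrentInfoFromFile('example.avi.torrent')
print info['size'], info['announce']
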
diff --git a/setup.py b/setup.py
index f9324af..f1728b2 100644
--- a/setup.py
+++ b/setup.py
@@ -1,30 +1,30 @@
#!/usr/bin/env python
-# vi:si:et:sw=2:sts=2:ts=2
+# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
from setuptools import setup, find_packages
setup(
-    name="oxutils",
-    version="0.1",
-    description="collection of utils used to work with python",
-    author="0x",
-    author_email="code@0xdb.org",
-    url="http://code.0xdb.org/oxutils",
-    download_url="http://code.0xdb.org/oxutils/download",
-    license="GPLv3",
-    packages=find_packages(),
-    zip_safe=False,
-    install_requires=[
-    'chardet',
-    ],
-    keywords = [
-    ],
-    classifiers = [
-    'Development Status :: 3 - Alpha',
-    'Operating System :: OS Independent',
-    'Programming Language :: Python',
-    'Topic :: Software Development :: Libraries :: Python Modules',
-    ],
-    )
+    name="oxutils",
+    version="0.1",
+ description="collection of utils used to work with python",
+ author="0x",
+ author_email="code@0xdb.org",
+ url="http://code.0xdb.org/oxutils",
+ download_url="http://code.0xdb.org/oxutils/download",
+ license="GPLv3",
+ packages=find_packages(),
+ zip_safe=False,
+ install_requires=[
+ 'chardet',
+ ],
+ keywords = [
+ ],
+ classifiers = [
+ 'Development Status :: 3 - Alpha',
+ 'Operating System :: OS Independent',
+ 'Programming Language :: Python',
+ 'Topic :: Software Development :: Libraries :: Python Modules',
+ ],
+)