vi:si:et:sw=4:sts=4:ts=4

j 2008-06-19 11:21:21 +02:00
parent dafe20aa04
commit 4a6e2702b4
11 changed files with 921 additions and 921 deletions

View file

@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
# vi:si:et:sw=4:sts=4:ts=4
# 2008
from hashes import *
@@ -11,7 +11,7 @@ import cache
# only works if BitTornado is installed
try:
    from torrent import *
except:
    pass
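
Because the torrent helpers depend on BitTornado, the guarded import above lets the rest of the package work without it. A runtime probe could look like this (a sketch, assuming this file is the package's oxutils/__init__.py; the package name comes from setup.py below):

# Sketch: probing for optional torrent support.
import oxutils
if hasattr(oxutils, 'createTorrent'):
    print 'torrent support available'
else:
    print 'BitTornado missing, torrent helpers disabled'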

View file

@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
# vi:si:et:sw=4:sts=4:ts=4
# 2008
import gzip
import StringIO
@@ -19,99 +19,99 @@ from net import DEFAULT_HEADERS, getEncoding

cache_timeout = 30*24*60*60  # default is 30 days

def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
    '''
    >>> status('http://google.com')
    200
    >>> status('http://google.com/mysearch')
    404
    '''
    headers = getHeaders(url, data, headers)
    return int(headers['status'])

def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
    '''
    >>> exists('http://google.com')
    True
    >>> exists('http://google.com/mysearch')
    False
    '''
    s = status(url, data, headers, timeout)
    if s >= 200 and s < 400:
        return True
    return False

def getHeaders(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
    url_cache_file = "%s.headers" % getUrlCacheFile(url, data, headers)
    url_headers = loadUrlCache(url_cache_file, timeout)
    if url_headers:
        url_headers = simplejson.loads(url_headers)
    else:
        url_headers = net.getHeaders(url, data, headers)
        saveUrlHeaders(url_cache_file, url_headers)
    return url_headers

def getUrl(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
    url_cache_file = getUrlCacheFile(url, data, headers)
    result = loadUrlCache(url_cache_file, timeout)
    if not result:
        try:
            url_headers, result = net.getUrl(url, data, headers, returnHeaders=True)
        except urllib2.HTTPError, e:
            e.headers['Status'] = "%s" % e.code
            url_headers = dict(e.headers)
            result = e.read()
            if url_headers.get('content-encoding', None) == 'gzip':
                result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
        saveUrlCache(url_cache_file, result, url_headers)
    return result

def getUrlUnicode(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, _getUrl=getUrl):
    data = _getUrl(url, data, headers, timeout)
    encoding = getEncoding(data)
    if not encoding:
        encoding = 'latin-1'
    return unicode(data, encoding)

def getCacheBase():
    'cache base is either ~/.ox/cache or can be set via the env variable oxCACHE'
    return os.environ.get('oxCACHE', os.path.expanduser('~/.ox/cache'))

def getUrlCacheFile(url, data=None, headers=DEFAULT_HEADERS):
    if data:
        url_hash = sha.sha(url + '?' + data).hexdigest()
    else:
        url_hash = sha.sha(url).hexdigest()
    domain = ".".join(urlparse.urlparse(url)[1].split('.')[-2:])
    return os.path.join(getCacheBase(), domain, url_hash[:2], url_hash[2:4], url_hash[4:6], url_hash)

def loadUrlCache(url_cache_file, timeout=cache_timeout):
    if timeout == 0:
        return None
    if os.path.exists(url_cache_file):
        ctime = os.stat(url_cache_file).st_ctime
        now = time.mktime(time.localtime())
        file_age = now - ctime
        if timeout < 0 or file_age < timeout:
            f = open(url_cache_file)
            data = f.read()
            f.close()
            return data
    return None

def saveUrlCache(url_cache_file, data, headers):
    folder = os.path.dirname(url_cache_file)
    if not os.path.exists(folder):
        os.makedirs(folder)
    f = open(url_cache_file, 'w')
    f.write(data)
    f.close()
    saveUrlHeaders("%s.headers" % url_cache_file, headers)

def saveUrlHeaders(url_cache_file, headers):
    folder = os.path.dirname(url_cache_file)
    if not os.path.exists(folder):
        os.makedirs(folder)
    f = open(url_cache_file, 'w')
    f.write(simplejson.dumps(headers))
    f.close()
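
For orientation (not part of the commit), a minimal sketch of how the caching layer above is used; the oxutils.cache import path is an assumption, since the diff page does not show file names:

# Usage sketch, assuming this module is oxutils/cache.py:
from oxutils import cache

data = cache.getUrlUnicode('http://example.com/')  # fetched once, then served from ~/.ox/cache
if cache.exists('http://example.com/feed'):        # 2xx/3xx status counts as existing
    print cache.getHeaders('http://example.com/feed')['status']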

View file

@@ -1,208 +1,207 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
# vi:si:et:sw=4:sts=4:ts=4
import math
import re

def to36(q):
    """
    Converts an integer to base 36 (a useful scheme for human-sayable IDs).

    >>> to36(35)
    'z'
    >>> to36(119292)
    '2k1o'
    >>> int(to36(939387374), 36)
    939387374
    >>> to36(0)
    '0'
    >>> to36(-393)
    Traceback (most recent call last):
        ...
    ValueError: must supply a positive integer
    """
    if q < 0: raise ValueError, "must supply a positive integer"
    letters = "0123456789abcdefghijklmnopqrstuvwxyz"
    converted = []
    while q != 0:
        q, r = divmod(q, 36)
        converted.insert(0, letters[r])
    return "".join(converted) or '0'

def from36(q):
    return int(q, 36)

def intValue(strValue, default=''):
    try:
        val = re.compile('(\d+)').findall(unicode(strValue).strip())[0]
    except:
        val = default
    return val

def test_intValue():
    assert intValue('abc23') == '23'
    assert intValue(' abc23') == '23'
    assert intValue(' abc') == ''

def floatValue(strValue, default=''):
    try:
        val = re.compile('([\d.]+)').findall(unicode(strValue).strip())[0]
    except:
        val = default
    return val

def test_floatValue():
    print "floatValue"
    assert floatValue('abc23.4') == '23.4'
    assert floatValue(' abc23.4') == '23.4'
    assert floatValue(' abc') == ''

def formatNumber(number, longName, shortName):
    """
    Return the number in a human-readable format (23 KB, 23.4 MB, 23.42 GB)

    >>> formatNumber(123, 'Byte', 'B')
    '123 Bytes'
    >>> formatNumber(1234, 'Byte', 'B')
    '1 KB'
    >>> formatNumber(1234567, 'Byte', 'B')
    '1.2 MB'
    >>> formatNumber(1234567890, 'Byte', 'B')
    '1.15 GB'
    >>> formatNumber(1234567890123456789, 'Byte', 'B')
    '1,096.5166 PB'
    """
    if number < 1024:
        return '%s %s%s' % (formatThousands(number), longName, number != 1 and 's' or '')
    prefix = ['K', 'M', 'G', 'T', 'P']
    for i in range(5):
        if number < math.pow(1024, i + 2) or i == 4:
            n = number / math.pow(1024, i + 1)
            return '%s %s%s' % (formatThousands('%.*f' % (i, n)), prefix[i], shortName)

def formatThousands(number, separator = ','):
    """
    Return the number with separators (1,000,000)

    >>> formatThousands(1)
    '1'
    >>> formatThousands(1000)
    '1,000'
    >>> formatThousands(1000000)
    '1,000,000'
    """
    string = str(number).split('.')
    l = []
    for i, character in enumerate(reversed(string[0])):
        if i and (not (i % 3)):
            l.insert(0, separator)
        l.insert(0, character)
    string[0] = ''.join(l)
    return '.'.join(string)

def formatBits(number):
    return formatNumber(number, 'bit', 'b')

def formatBytes(number):
    return formatNumber(number, 'byte', 'B')

def formatPixels(number):
    return formatNumber(number, 'pixel', 'px')

def plural(amount, unit, plural='s'):
    '''
    >>> plural(1, 'unit')
    '1 unit'
    >>> plural(2, 'unit')
    '2 units'
    '''
    if abs(amount) != 1:
        if plural == 's':
            unit = unit + plural
        else: unit = plural
    return "%s %s" % (formatThousands(amount), unit)

def ms2runtime(ms):
    '''
    >>> ms2runtime(5000)
    '5 seconds'
    >>> ms2runtime(500000)
    '8 minutes 20 seconds'
    >>> ms2runtime(50000000)
    '13 hours 53 minutes 20 seconds'
    >>> ms2runtime(50000000-20000)
    '13 hours 53 minutes'
    '''
    seconds = int(ms / 1000)
    years = 0
    days = 0
    hours = 0
    minutes = 0
    if seconds >= 60:
        minutes = int(seconds / 60)
        seconds = seconds % 60
    if minutes >= 60:
        hours = int(minutes / 60)
        minutes = minutes % 60
    if hours >= 24:
        days = int(hours / 24)
        hours = hours % 24
    if days >= 365:
        years = int(days / 365)
        days = days % 365
    runtimeString = (plural(years, 'year'), plural(days, 'day'),
        plural(hours, 'hour'), plural(minutes, 'minute'), plural(seconds, 'second'))
    runtimeString = filter(lambda x: not x.startswith('0'), runtimeString)
    return " ".join(runtimeString).strip()

def ms2playtime(ms):
    '''
    >>> ms2playtime(5000)
    '00:05'
    >>> ms2playtime(500000)
    '08:20'
    >>> ms2playtime(50000000)
    '13:53:20'
    '''
    it = int(ms / 1000)
    ms = ms - it*1000
    ss = it % 60
    mm = ((it-ss)/60) % 60
    hh = ((it-(mm*60)-ss)/3600) % 60
    if hh:
        playtime = "%02d:%02d:%02d" % (hh, mm, ss)
    else:
        playtime = "%02d:%02d" % (mm, ss)
    return playtime

def ms2time(ms):
    '''
    >>> ms2time(44592123)
    '12:23:12.123'
    '''
    it = int(ms / 1000)
    ms = ms - it*1000
    ss = it % 60
    mm = ((it-ss)/60) % 60
    hh = ((it-(mm*60)-ss)/3600) % 60
    return "%d:%02d:%02d.%03d" % (hh, mm, ss, ms)

def time2ms(timeString):
    '''
    >>> time2ms('12:23:12.123')
    44592123
    '''
    ms = 0.0
    p = timeString.split(':')
    for i in range(len(p)):
        ms = ms * 60 + float(p[i])
    return int(ms * 1000)

def shiftTime(offset, timeString):
    newTime = time2ms(timeString) + offset
    return ms2time(newTime)
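
A few illustrative calls to the formatting helpers above, consistent with their doctests (the oxutils.format import path is an assumption):

# Usage sketch for the number and time formatters:
from oxutils.format import formatBytes, ms2playtime, ms2time, time2ms, to36

print formatBytes(1234567)      # '1.2 MB'
print ms2playtime(500000)       # '08:20'
print ms2time(44592123)         # '12:23:12.123'
print time2ms('12:23:12.123')   # 44592123
print to36(119292)              # '2k1o'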

View file

@@ -1,17 +1,17 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
# vi:si:et:sw=4:sts=4:ts=4
# GPL written 2008 by j@pad.ma
import sha
import os

def sha1sum(filename):
    sha1 = sha.new()
    file = open(filename)
    buffer = file.read(4096)
    while buffer:
        sha1.update(buffer)
        buffer = file.read(4096)
    file.close()
    return sha1.hexdigest()
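
Reading in 4096-byte blocks keeps memory use flat even for very large files. A usage sketch (the hashes module name matches the `from hashes import *` in the package __init__ above; the oxutils package prefix is an assumption):

# Usage sketch:
from oxutils.hashes import sha1sum
print sha1sum('/path/to/some/file')  # 40-character hex digest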

View file

@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
# vi:si:et:sw=4:sts=4:ts=4
# GPL written 2008 by j@pad.ma
import re
import string
@@ -26,147 +26,147 @@ trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\
del x # Temporary variable

def escape(html):
    '''
    Returns the given HTML with ampersands, quotes and carets encoded

    >>> escape('html "test" & <brothers>')
    'html &quot;test&quot; &amp; &lt;brothers&gt;'
    '''
    if not isinstance(html, basestring):
        html = str(html)
    return html.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&#39;')

def linebreaks(value):
    '''
    Converts newlines into <p> and <br />
    '''
    value = re.sub(r'\r\n|\r|\n', '\n', value) # normalize newlines
    paras = re.split('\n{2,}', value)
    paras = ['<p>%s</p>' % p.strip().replace('\n', '<br />') for p in paras]
    return '\n\n'.join(paras)

def stripTags(value):
    """
    Returns the given HTML with all tags stripped

    >>> stripTags('some <h2>title</h2> <script>asdfasdf</script>')
    'some title asdfasdf'
    """
    return re.sub(r'<[^>]*?>', '', value)

def stripSpacesBetweenTags(value):
    "Returns the given HTML with spaces between tags normalized to a single space"
    return re.sub(r'>\s+<', '> <', value)

def stripEntities(value):
    "Returns the given HTML with all entities (&something;) stripped"
    return re.sub(r'&(?:\w+|#\d);', '', value)

def fixAmpersands(value):
    "Returns the given HTML with all unencoded ampersands encoded correctly"
    return unencoded_ampersands_re.sub('&amp;', value)

def urlize(text, trim_url_limit=None, nofollow=False):
    """
    Converts any URLs in text into clickable links. Works on http://, https:// and
    www. links. Links can have trailing punctuation (periods, commas, close-parens)
    and leading punctuation (opening parens) and it'll still do the right thing.

    If trim_url_limit is not None, the URLs in link text will be limited to
    trim_url_limit characters.

    If nofollow is True, the URLs in link text will get a rel="nofollow" attribute.
    """
    trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >= limit and '...' or '')) or x
    words = word_split_re.split(text)
    nofollow_attr = nofollow and ' rel="nofollow"' or ''
    for i, word in enumerate(words):
        match = punctuation_re.match(word)
        if match:
            lead, middle, trail = match.groups()
            if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
                    len(middle) > 0 and middle[0] in string.letters + string.digits and \
                    (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
                middle = '<a href="http://%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
            if middle.startswith('http://') or middle.startswith('https://'):
                middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
            if '@' in middle and not middle.startswith('www.') and not ':' in middle \
                    and simple_email_re.match(middle):
                middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
            if lead + middle + trail != word:
                words[i] = lead + middle + trail
    return ''.join(words)

def cleanHtml(text):
    """
    Cleans the given HTML. Specifically, it does the following:
        * Converts <b> and <i> to <strong> and <em>.
        * Encodes all ampersands correctly.
        * Removes all "target" attributes from <a> tags.
        * Removes extraneous HTML, such as presentational tags that open and
          immediately close and <br clear="all">.
        * Converts hard-coded bullets into HTML unordered lists.
        * Removes stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the
          bottom of the text.
    """
    from text import normalizeNewlines
    text = normalizeNewlines(text)
    text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text)
    text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text)
    text = fixAmpersands(text)
    # Remove all target="" attributes from <a> tags.
    text = link_target_attribute_re.sub('\\1', text)
    # Trim stupid HTML such as <br clear="all">.
    text = html_gunk_re.sub('', text)
    # Convert hard-coded bullets into HTML unordered lists.
    def replace_p_tags(match):
        s = match.group().replace('</p>', '</li>')
        for d in DOTS:
            s = s.replace('<p>%s' % d, '<li>')
        return '<ul>\n%s\n</ul>' % s
    text = hard_coded_bullets_re.sub(replace_p_tags, text)
    # Remove stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the bottom of the text.
    text = trailing_empty_content_re.sub('', text)
    return text

# This pattern matches a character entity reference (a decimal numeric
# reference, a hexadecimal numeric reference, or a named reference).
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')

def decodeHtml(html):
    """
    >>> decodeHtml('me &amp; you and &#36;&#38;%')
    u'me & you and $&%'
    """
    if type(html) != unicode:
        html = unicode(html)[:]
    if type(html) is unicode:
        uchr = unichr
    else:
        uchr = lambda value: value > 255 and unichr(value) or chr(value)
    def entitydecode(match, uchr=uchr):
        entity = match.group(1)
        if entity.startswith('#x'):
            return uchr(int(entity[2:], 16))
        elif entity.startswith('#'):
            return uchr(int(entity[1:]))
        elif entity in name2codepoint:
            return uchr(name2codepoint[entity])
        else:
            return match.group(0)
    return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ')

def highlight(text, query, hlClass="hl"):
    """
    >>> highlight('me &amp; you and &#36;&#38;%', 'and')
    'me &amp; you <span class="hl">and</span> &#36;&#38;%'
    """
    if query:
        text = text.replace('<br />', '|')
        query = re.escape(query).replace('\ ', '.')
        m = re.compile("(%s)" % query, re.IGNORECASE).findall(text)
        for i in m:
            text = re.sub("(%s)" % re.escape(i).replace('\ ', '.'), '<span class="%s">\\1</span>' % hlClass, text)
        text = text.replace('|', '<br />')
    return text
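
A short tour of the HTML helpers above, consistent with their doctests (the oxutils.html import path is an assumption):

# Usage sketch:
from oxutils.html import escape, stripTags, decodeHtml, highlight

print escape('html "test" & <brothers>')  # 'html &quot;test&quot; &amp; &lt;brothers&gt;'
print stripTags('some <h2>title</h2>')    # 'some title'
print decodeHtml('me &amp; you')          # u'me & you'
print highlight('foo and bar', 'and')     # 'foo <span class="hl">and</span> bar'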

View file

@@ -1,236 +1,236 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
# vi:si:et:sw=4:sts=4:ts=4

_iso639_languages = [
    ("Unknown", "", "", "und"),
    ("Afar", "", "aa", "aar"),
    ("Abkhazian", "", "ab", "abk"),
    ("Afrikaans", "", "af", "afr"),
    ("Akan", "", "ak", "aka"),
    ("Albanian", "", "sq", "sqi"),
    ("Amharic", "", "am", "amh"),
    ("Arabic", "", "ar", "ara"),
    ("Aragonese", "", "an", "arg"),
    ("Armenian", "", "hy", "hye"),
    ("Assamese", "", "as", "asm"),
    ("Avaric", "", "av", "ava"),
    ("Avestan", "", "ae", "ave"),
    ("Aymara", "", "ay", "aym"),
    ("Azerbaijani", "", "az", "aze"),
    ("Bashkir", "", "ba", "bak"),
    ("Bambara", "", "bm", "bam"),
    ("Basque", "", "eu", "eus"),
    ("Belarusian", "", "be", "bel"),
    ("Bengali", "", "bn", "ben"),
    ("Bihari", "", "bh", "bih"),
    ("Bislama", "", "bi", "bis"),
    ("Bosnian", "", "bs", "bos"),
    ("Breton", "", "br", "bre"),
    ("Bulgarian", "", "bg", "bul"),
    ("Burmese", "", "my", "mya"),
    ("Catalan", "", "ca", "cat"),
    ("Chamorro", "", "ch", "cha"),
    ("Chechen", "", "ce", "che"),
    ("Chinese", "", "zh", "zho"),
    ("Church Slavic", "", "cu", "chu"),
    ("Chuvash", "", "cv", "chv"),
    ("Cornish", "", "kw", "cor"),
    ("Corsican", "", "co", "cos"),
    ("Cree", "", "cr", "cre"),
    ("Czech", "", "cs", "ces"),
    ("Danish", "Dansk", "da", "dan"),
    ("Divehi", "", "dv", "div"),
    ("Dutch", "Nederlands", "nl", "nld"),
    ("Dzongkha", "", "dz", "dzo"),
    ("English", "English", "en", "eng"),
    ("Esperanto", "", "eo", "epo"),
    ("Estonian", "", "et", "est"),
    ("Ewe", "", "ee", "ewe"),
    ("Faroese", "", "fo", "fao"),
    ("Fijian", "", "fj", "fij"),
    ("Finnish", "Suomi", "fi", "fin"),
    ("French", "Francais", "fr", "fra"),
    ("Western Frisian", "", "fy", "fry"),
    ("Fulah", "", "ff", "ful"),
    ("Georgian", "", "ka", "kat"),
    ("German", "Deutsch", "de", "deu"),
    ("Gaelic (Scots)", "", "gd", "gla"),
    ("Irish", "", "ga", "gle"),
    ("Galician", "", "gl", "glg"),
    ("Manx", "", "gv", "glv"),
    ("Greek, Modern", "", "el", "ell"),
    ("Guarani", "", "gn", "grn"),
    ("Gujarati", "", "gu", "guj"),
    ("Haitian", "", "ht", "hat"),
    ("Hausa", "", "ha", "hau"),
    ("Hebrew", "", "he", "heb"),
    ("Herero", "", "hz", "her"),
    ("Hindi", "", "hi", "hin"),
    ("Hiri Motu", "", "ho", "hmo"),
    ("Hungarian", "Magyar", "hu", "hun"),
    ("Igbo", "", "ig", "ibo"),
    ("Icelandic", "Islenska", "is", "isl"),
    ("Ido", "", "io", "ido"),
    ("Sichuan Yi", "", "ii", "iii"),
    ("Inuktitut", "", "iu", "iku"),
    ("Interlingue", "", "ie", "ile"),
    ("Interlingua", "", "ia", "ina"),
    ("Indonesian", "", "id", "ind"),
    ("Inupiaq", "", "ik", "ipk"),
    ("Italian", "Italiano", "it", "ita"),
    ("Javanese", "", "jv", "jav"),
    ("Japanese", "", "ja", "jpn"),
    ("Kalaallisut (Greenlandic)", "", "kl", "kal"),
    ("Kannada", "", "kn", "kan"),
    ("Kashmiri", "", "ks", "kas"),
    ("Kanuri", "", "kr", "kau"),
    ("Kazakh", "", "kk", "kaz"),
    ("Central Khmer", "", "km", "khm"),
    ("Kikuyu", "", "ki", "kik"),
    ("Kinyarwanda", "", "rw", "kin"),
    ("Kirghiz", "", "ky", "kir"),
    ("Komi", "", "kv", "kom"),
    ("Kongo", "", "kg", "kon"),
    ("Korean", "", "ko", "kor"),
    ("Kuanyama", "", "kj", "kua"),
    ("Kurdish", "", "ku", "kur"),
    ("Lao", "", "lo", "lao"),
    ("Latin", "", "la", "lat"),
    ("Latvian", "", "lv", "lav"),
    ("Limburgan", "", "li", "lim"),
    ("Lingala", "", "ln", "lin"),
    ("Lithuanian", "", "lt", "lit"),
    ("Luxembourgish", "", "lb", "ltz"),
    ("Luba-Katanga", "", "lu", "lub"),
    ("Ganda", "", "lg", "lug"),
    ("Macedonian", "", "mk", "mkd"),
    ("Marshallese", "", "mh", "mah"),
    ("Malayalam", "", "ml", "mal"),
    ("Maori", "", "mi", "mri"),
    ("Marathi", "", "mr", "mar"),
    ("Malay", "", "ms", "msa"),
    ("Malagasy", "", "mg", "mlg"),
    ("Maltese", "", "mt", "mlt"),
    ("Moldavian", "", "mo", "mol"),
    ("Mongolian", "", "mn", "mon"),
    ("Nauru", "", "na", "nau"),
    ("Navajo", "", "nv", "nav"),
    ("Ndebele, South", "", "nr", "nbl"),
    ("Ndebele, North", "", "nd", "nde"),
    ("Ndonga", "", "ng", "ndo"),
    ("Nepali", "", "ne", "nep"),
    ("Norwegian Nynorsk", "", "nn", "nno"),
    ("Norwegian Bokmål", "", "nb", "nob"),
    ("Norwegian", "Norsk", "no", "nor"),
    ("Chichewa; Nyanja", "", "ny", "nya"),
    ("Occitan (post 1500); Provençal", "", "oc", "oci"),
    ("Ojibwa", "", "oj", "oji"),
    ("Oriya", "", "or", "ori"),
    ("Oromo", "", "om", "orm"),
    ("Ossetian; Ossetic", "", "os", "oss"),
    ("Panjabi", "", "pa", "pan"),
    ("Persian", "", "fa", "fas"),
    ("Pali", "", "pi", "pli"),
    ("Polish", "", "pl", "pol"),
    ("Portuguese", "Portugues", "pt", "por"),
    ("Pushto", "", "ps", "pus"),
    ("Quechua", "", "qu", "que"),
    ("Romansh", "", "rm", "roh"),
    ("Romanian", "", "ro", "ron"),
    ("Rundi", "", "rn", "run"),
    ("Russian", "", "ru", "rus"),
    ("Sango", "", "sg", "sag"),
    ("Sanskrit", "", "sa", "san"),
    ("Serbian", "", "sr", "srp"),
    ("Croatian", "Hrvatski", "hr", "hrv"),
    ("Sinhala", "", "si", "sin"),
    ("Slovak", "", "sk", "slk"),
    ("Slovenian", "", "sl", "slv"),
    ("Northern Sami", "", "se", "sme"),
    ("Samoan", "", "sm", "smo"),
    ("Shona", "", "sn", "sna"),
    ("Sindhi", "", "sd", "snd"),
    ("Somali", "", "so", "som"),
    ("Sotho, Southern", "", "st", "sot"),
    ("Spanish", "Espanol", "es", "spa"),
    ("Sardinian", "", "sc", "srd"),
    ("Swati", "", "ss", "ssw"),
    ("Sundanese", "", "su", "sun"),
    ("Swahili", "", "sw", "swa"),
    ("Swedish", "Svenska", "sv", "swe"),
    ("Tahitian", "", "ty", "tah"),
    ("Tamil", "", "ta", "tam"),
    ("Tatar", "", "tt", "tat"),
    ("Telugu", "", "te", "tel"),
    ("Tajik", "", "tg", "tgk"),
    ("Tagalog", "", "tl", "tgl"),
    ("Thai", "", "th", "tha"),
    ("Tibetan", "", "bo", "bod"),
    ("Tigrinya", "", "ti", "tir"),
    ("Tonga (Tonga Islands)", "", "to", "ton"),
    ("Tswana", "", "tn", "tsn"),
    ("Tsonga", "", "ts", "tso"),
    ("Turkmen", "", "tk", "tuk"),
    ("Turkish", "", "tr", "tur"),
    ("Twi", "", "tw", "twi"),
    ("Uighur", "", "ug", "uig"),
    ("Ukrainian", "", "uk", "ukr"),
    ("Urdu", "", "ur", "urd"),
    ("Uzbek", "", "uz", "uzb"),
    ("Venda", "", "ve", "ven"),
    ("Vietnamese", "", "vi", "vie"),
    ("Volapük", "", "vo", "vol"),
    ("Welsh", "", "cy", "cym"),
    ("Walloon", "", "wa", "wln"),
    ("Wolof", "", "wo", "wol"),
    ("Xhosa", "", "xh", "xho"),
    ("Yiddish", "", "yi", "yid"),
    ("Yoruba", "", "yo", "yor"),
    ("Zhuang", "", "za", "zha"),
    ("Zulu", "", "zu", "zul"),
]

def codeToLang(code):
    code = code.lower()
    if len(code) == 2:
        for l in _iso639_languages:
            if l[2] == code:
                return l[0]
    elif len(code) == 3:
        for l in _iso639_languages:
            if l[3] == code:
                return l[0]
    return None

def langTo3Code(lang):
    lang = englishName(lang)
    if lang:
        lang = lang.lower()
        for l in _iso639_languages:
            if l[0].lower() == lang:
                return l[3]
    return None

def langTo2Code(lang):
    lang = englishName(lang)
    if lang:
        lang = lang.lower()
        for l in _iso639_languages:
            if l[0].lower() == lang:
                return l[2]
    return None

def langCode2To3(code):
    return langTo3Code(codeToLang(code))

def langCode3To2(code):
    return langTo2Code(codeToLang(code))

def englishName(lang):
    lang = lang.lower()
    for l in _iso639_languages:
        if l[1].lower() == lang:
            return l[0]
    return None
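
Note that langTo3Code and langTo2Code resolve their argument through englishName, which matches the native-name column of the table, so they expect names like 'Deutsch' rather than 'German'. A sketch (the oxutils.lang module name is an assumption):

# Usage sketch:
from oxutils.lang import codeToLang, langTo3Code

print codeToLang('de')        # 'German'
print codeToLang('deu')       # 'German'
print langTo3Code('Deutsch')  # 'deu'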

View file

@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
# vi:si:et:sw=4:sts=4:ts=4
import gzip
import StringIO
import urllib
@@ -10,64 +10,64 @@ from chardet.universaldetector import UniversalDetector

# Default headers for HTTP requests.
DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9) Gecko/2008061015 Firefox/3.0',
    'Accept-Encoding': 'gzip'
}

def status(url, data=None, headers=DEFAULT_HEADERS):
    try:
        f = openUrl(url, data, headers)
        s = f.code
    except urllib2.HTTPError, e:
        s = e.code
    return s

def exists(url, data=None, headers=DEFAULT_HEADERS):
    s = status(url, data, headers)
    if s >= 200 and s < 400:
        return True
    return False

def getHeaders(url, data=None, headers=DEFAULT_HEADERS):
    try:
        f = openUrl(url, data, headers)
        f.headers['Status'] = "%s" % f.code
        headers = f.headers
        f.close()
    except urllib2.HTTPError, e:
        e.headers['Status'] = "%s" % e.code
        headers = e.headers
    return dict(headers)

def openUrl(url, data=None, headers=DEFAULT_HEADERS):
    url = url.replace(' ', '%20')
    req = urllib2.Request(url, data, headers)
    return urllib2.urlopen(req)

def getUrl(url, data=None, headers=DEFAULT_HEADERS, returnHeaders=False):
    f = openUrl(url, data, headers)
    data = f.read()
    f.close()
    if f.headers.get('content-encoding', None) == 'gzip':
        data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
    if returnHeaders:
        f.headers['Status'] = "%s" % f.code
        return dict(f.headers), data
    return data

def getUrlUnicode(url):
    data = getUrl(url)
    encoding = getEncoding(data)
    if not encoding:
        encoding = 'latin-1'
    return unicode(data, encoding)

def getEncoding(data):
    detector = UniversalDetector()
    for line in data.split('\n'):
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    return detector.result['encoding']
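
These are the uncached counterparts of the cache module above; a sketch of typical use (net is the module name referenced from cache.py; the oxutils package prefix is an assumption):

# Usage sketch:
from oxutils import net

if net.exists('http://example.com/'):
    page = net.getUrlUnicode('http://example.com/')  # gzip-decoded, charset detected via chardet
    print len(page)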

View file

@@ -1,79 +1,79 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
# vi:si:et:sw=4:sts=4:ts=4
import re

_articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el',
        "l'", 'il', 'das', 'les', 'o', 'ein', 'i', 'un', 'los', 'de',
        'an', 'una', 'las', 'eine', 'den', 'gli', 'het', 'os', 'lo',
        'az', 'det', 'ha-', 'een', 'ang', 'oi', 'ta', 'al-', 'dem',
        'mga', 'uno', "un'", 'ett', u'\xcf', 'eines', u'\xc7', 'els',
        u'\xd4\xef', u'\xcf\xe9')

# Articles in a dictionary.
_articlesDict = dict([(x, x) for x in _articles])

_spArticles = []
for article in _articles:
    if article[-1] not in ("'", '-'): article += ' '
    _spArticles.append(article)

def canonicalTitle(title):
    """Return the title in the canonic format 'Movie Title, The'.

    >>> canonicalTitle('The Movie Title')
    'Movie Title, The'
    """
    try:
        if _articlesDict.has_key(title.split(', ')[-1].lower()): return title
    except IndexError: pass
    ltitle = title.lower()
    for article in _spArticles:
        if ltitle.startswith(article):
            lart = len(article)
            title = '%s, %s' % (title[lart:], title[:lart])
            if article[-1] == ' ': title = title[:-1]
            break
    ## XXX: an attempt using a dictionary lookup.
    ##for artSeparator in (' ', "'", '-'):
    ##    article = _articlesDict.get(ltitle.split(artSeparator)[0])
    ##    if article is not None:
    ##        lart = len(article)
    ##        # check titles like "una", "I'm Mad" and "L'abbacchio".
    ##        if title[lart:] == '' or (artSeparator != ' ' and
    ##                title[lart:][1] != artSeparator): continue
    ##        title = '%s, %s' % (title[lart:], title[:lart])
    ##        if artSeparator == ' ': title = title[1:]
    ##        break
    return title

def normalizeTitle(title):
    """Return the title in the normal "The Title" format.

    >>> normalizeTitle('Movie Title, The')
    'The Movie Title'
    """
    stitle = title.split(', ')
    if len(stitle) > 1 and _articlesDict.has_key(stitle[-1].lower()):
        sep = ' '
        if stitle[-1][-1] in ("'", '-'): sep = ''
        title = '%s%s%s' % (stitle[-1], sep, ', '.join(stitle[:-1]))
    return title

def normalizeImdbId(imdbId):
    """Return 7 digit imdbId.

    >>> normalizeImdbId('http://www.imdb.com/title/tt0159206/')
    '0159206'
    >>> normalizeImdbId(159206)
    '0159206'
    >>> normalizeImdbId('tt0159206')
    '0159206'
    """
    if isinstance(imdbId, basestring):
        imdbId = re.sub('.*(\d{7}).*', '\\1', imdbId)
    elif isinstance(imdbId, int):
        imdbId = "%07d" % imdbId
    return imdbId

# Common suffixes in surnames.
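
A sketch of the title helpers above, consistent with their doctests (the module name is an assumption, since the diff page does not show file names):

# Usage sketch:
from oxutils.normalize import canonicalTitle, normalizeTitle, normalizeImdbId

print canonicalTitle('The Movie Title')   # 'Movie Title, The'
print normalizeTitle('Movie Title, The')  # 'The Movie Title'
print normalizeImdbId('tt0159206')        # '0159206'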

View file

@@ -1,216 +1,216 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
# vi:si:et:sw=4:sts=4:ts=4
# GPL written 2008 by j@pad.ma
import re

def findRe(string, regexp):
    result = re.compile(regexp, re.DOTALL).findall(string)
    if result:
        return result[0].strip()
    return ''

def findString(string, string0='', string1 = ''):
    """Return the string between string0 and string1.

    If string0 or string1 is left out, beginning or end of string is used.

    >>> findString('i am not there', string1=' not there')
    'i am'
    >>> findString('i am not there', 'i am ', ' there')
    'not'
    >>> findString('i am not there', 'i am not t')
    'here'
    """
    if string0:
        string0 = re.escape(string0)
    else:
        string0 = '^'
    if string1:
        string1 = re.escape(string1)
    else:
        string1 = '$'
    return findRe(string, string0 + '(.*?)' + string1)

# Capitalizes the first letter of a string.
capfirst = lambda x: x and x[0].upper() + x[1:]

def removeSpecialCharacters(text):
    """
    Removes special characters inserted by Word.
    """
    text = text.replace(u'\u2013', '-')
    text = text.replace(u'\u2026O', "'")
    text = text.replace(u'\u2019', "'")
    text = text.replace(u'', "'")
    text = text.replace(u'', "'")
    text = text.replace(u'', "-")
    return text

def wrap(text, width):
    """
    A word-wrap function that preserves existing line breaks and most spaces in
    the text. Expects that existing line breaks are posix newlines (\n).
    See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
    """
    return reduce(lambda line, word, width=width: '%s%s%s' %
        (line,
            ' \n'[(len(line[line.rfind('\n')+1:])
                + len(word.split('\n',1)[0]
                ) >= width)],
            word),
        text.split(' ')
    )

def truncateString(s, num):
    """Truncates a string after a certain number of characters, but ends with a word

    >>> truncateString('Truncates a string after a certain number of chacters, but ends with a word', 23)
    'Truncates a string...'
    >>> truncateString('Truncates a string', 23)
    'Truncates a string'
    """
    length = int(num)
    if len(s) <= length:
        return s
    words = s.split()
    ts = ""
    while words and len(ts) + len(words[0]) < length:
        ts += " " + words.pop(0)
    if words:
        ts += "..."
    return ts.strip()

def trimString(string, num):
    """Truncates a string after a certain number of characters, adding ... at -10 characters

    >>> trimString('Truncates a string after a certain number of chacters', 23)
    'Truncates ...f chacters'
    >>> trimString('Truncates a string', 23)
    'Truncates a string'
    """
    if len(string) > num:
        string = string[:num - 13] + '...' + string[-10:]
    return string

def truncateWords(s, num):
    "Truncates a string after a certain number of words."
    length = int(num)
    words = s.split()
    if len(words) > length:
        words = words[:length]
        if not words[-1].endswith('...'):
            words.append('...')
    return ' '.join(words)

def getValidFilename(s):
    """
    Returns the given string converted to a string that can be used for a clean
    filename. Specifically, leading and trailing spaces are removed;
    all non-filename-safe characters are removed.

    >>> getValidFilename("john's portrait in 2004.jpg")
    'john_s_portrait_in_2004.jpg'
    """
    s = s.strip()
    s = s.replace(' ', '_')
    s = re.sub(r'[^-A-Za-z0-9_.\[\]\ ]', '_', s)
    s = s.replace('__', '_').replace('__', '_')
    return s

def getTextList(list_, last_word='or'):
    """
    >>> getTextList(['a', 'b', 'c', 'd'])
    'a, b, c or d'
    >>> getTextList(['a', 'b', 'c'], 'and')
    'a, b and c'
    >>> getTextList(['a', 'b'], 'and')
    'a and b'
    >>> getTextList(['a'])
    'a'
    >>> getTextList([])
    ''
    """
    if len(list_) == 0: return ''
    if len(list_) == 1: return list_[0]
    return '%s %s %s' % (', '.join([str(i) for i in list_][:-1]), last_word, list_[-1])

def getListText(text, last_word='or'):
    """
    >>> getListText('a, b, c or d')
    ['a', 'b', 'c', 'd']
    >>> getListText('a, b and c', 'and')
    ['a', 'b', 'c']
    >>> getListText('a and b', 'and')
    ['a', 'b']
    >>> getListText('a')
    ['a']
    >>> getListText('')
    []
    """
    list_ = []
    if text:
        list_ = text.split(', ')
        if list_:
            i = len(list_)-1
            last = list_[i].split(last_word)
            if len(last) == 2:
                list_[i] = last[0].strip()
                list_.append(last[1].strip())
    return list_

def normalizeNewlines(text):
    return re.sub(r'\r\n|\r|\n', '\n', text)

def recapitalize(text):
    "Recapitalizes text, placing caps after end-of-sentence punctuation."
    #capwords = ()
    text = text.lower()
    capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
    text = capsRE.sub(lambda x: x.group(1).upper(), text)
    #for capword in capwords:
    #    capwordRE = re.compile(r'\b%s\b' % capword, re.I)
    #    text = capwordRE.sub(capword, text)
    return text

def phone2numeric(phone):
    "Converts a phone number with letters into its numeric equivalent."
    letters = re.compile(r'[A-PR-Y]', re.I)
    char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3',
         'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
         'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
         's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
         'y': '9', 'x': '9'}.get(m.group(0).lower())
    return letters.sub(char2number, phone)

def compressString(s):
    import cStringIO, gzip
    zbuf = cStringIO.StringIO()
    zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
    zfile.write(s)
    zfile.close()
    return zbuf.getvalue()

smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
def smartSplit(text):
    """
    Generator that splits a string by spaces, leaving quoted phrases together.
    Supports both single and double quotes, and supports escaping quotes with
    backslashes. In the output, strings will keep their initial and trailing
    quote marks.

    >>> list(smartSplit('This is "a person\\'s" test.'))
    ['This', 'is', '"a person\\'s"', 'test.']
    """
    for bit in smart_split_re.finditer(text):
        bit = bit.group(0)
        if bit[0] == '"':
            yield '"' + bit[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"'
        elif bit[0] == "'":
            yield "'" + bit[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'"
        else:
            yield bit
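
A sketch of the text helpers above (the oxutils.text module name is an assumption):

# Usage sketch:
from oxutils.text import findString, truncateString, smartSplit

print findString('i am not there', 'i am ', ' there')          # 'not'
print truncateString('a long sentence that will not fit', 15)  # 'a long...'
print list(smartSplit('This is "a person\'s" test.'))          # ['This', 'is', '"a person\'s"', 'test.']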

View file

@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# Written 2007 by j@mailb.org
from threading import Event
@@ -11,50 +12,50 @@ from BitTornado.bencode import bencode, bdecode

def createTorrent(file, url, params = {}, flag = Event(),
        progress = lambda x: None, progress_percent = 1):
    "Creates a torrent for a given file, using url as tracker url"
    return make_meta_file(file, url, params, flag, progress, progress_percent)

def getInfoHash(torrentFile):
    "Returns Torrent Info Hash from torrent file"
    metainfo_file = open(torrentFile, 'rb')
    metainfo = bdecode(metainfo_file.read())
    info = metainfo['info']
    return sha.sha(bencode(info)).hexdigest().upper()

def getTorrentInfoFromFile(torrentFile):
    f = open(torrentFile, 'rb')
    data = f.read()
    f.close()
    tinfo = getTorrentInfo(data)
    tinfo['timestamp'] = stat(torrentFile).st_ctime
    return tinfo

def getTorrentInfo(data):
    "Returns Torrent Info from torrent file"
    tinfo = {}
    metainfo = bdecode(data)
    info = metainfo['info']
    piece_length = info['piece length']
    if info.has_key('length'):
        # let's assume we just have one file
        file_length = info['length']
    else:
        # let's assume we have a directory structure
        file_length = 0
        for f in info['files']:
            file_length += f['length']
    for key in info:
        if key != 'pieces':
            tinfo[key] = info[key]
    for key in metainfo:
        if key != 'info':
            tinfo[key] = metainfo[key]
    tinfo['size'] = file_length
    tinfo['hash'] = sha.sha(bencode(info)).hexdigest()
    tinfo['announce'] = metainfo['announce']
    return tinfo

def getTorrentSize(torrentFile):
    "Returns Size of files in torrent file in bytes"
    return getTorrentInfoFromFile(torrentFile)['size']
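
A sketch of the torrent helpers above; it assumes BitTornado's make_meta_file writes its output next to the input file as movie.avi.torrent, which is that library's default but is not shown in this diff:

# Usage sketch (requires BitTornado; oxutils.torrent module name is an assumption):
from oxutils.torrent import createTorrent, getInfoHash, getTorrentInfoFromFile

createTorrent('movie.avi', 'http://tracker.example.com/announce')
print getInfoHash('movie.avi.torrent')  # uppercase hex info hash
info = getTorrentInfoFromFile('movie.avi.torrent')
print info['size'], info['announce']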

View file

@@ -1,30 +1,30 @@
#!/usr/bin/env python
# vi:si:et:sw=2:sts=2:ts=2
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
from setuptools import setup, find_packages

setup(
    name="oxutils",
    version="0.1",
    description="collection of utils used to work with python",
    author="0x",
    author_email="code@0xdb.org",
    url="http://code.0xdb.org/oxutils",
    download_url="http://code.0xdb.org/oxutils/download",
    license="GPLv3",
    packages=find_packages(),
    zip_safe=False,
    install_requires=[
        'chardet',
    ],
    keywords = [
    ],
    classifiers = [
        'Development Status :: 3 - Alpha',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Topic :: Software Development :: Libraries :: Python Modules',
    ],
)