install as ox

This commit is contained in:
j 2009-10-11 14:53:50 +02:00
commit d14f13faaf
12 changed files with 2 additions and 1 deletion

View file

@@ -1,18 +0,0 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
from file import *
from format import *
from html import *
from iso import *
from text import *
import cache
import net
#only works if BitTornado is installed
try:
    from torrent import *
except ImportError:
    pass
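
A minimal usage sketch of the package surface assembled above (assuming the package is importable as ox, per the commit message; the sample values reuse doctests from the modules below):
import ox
print ox.formatBytes(1234567)              # '1.2 MB', from the format module
print ox.stripTags('some <h2>title</h2>')  # 'some title', from the html module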

View file

@@ -1,320 +0,0 @@
# Written by Petru Paler, Uoti Urpala, Ross Cohen and John Hoffman
# see LICENSE.txt for license information
from types import IntType, LongType, StringType, ListType, TupleType, DictType
try:
    from types import BooleanType
except ImportError:
    BooleanType = None
try:
    from types import UnicodeType
except ImportError:
    UnicodeType = None
from cStringIO import StringIO
def decode_int(x, f):
    f += 1
    newf = x.index('e', f)
    try:
        n = int(x[f:newf])
    except (OverflowError, ValueError):
        n = long(x[f:newf])
    if x[f] == '-':
        if x[f + 1] == '0':
            raise ValueError
    elif x[f] == '0' and newf != f+1:
        raise ValueError
    return (n, newf+1)
def decode_string(x, f):
    colon = x.index(':', f)
    try:
        n = int(x[f:colon])
    except (OverflowError, ValueError):
        n = long(x[f:colon])
    if x[f] == '0' and colon != f+1:
        raise ValueError
    colon += 1
    return (x[colon:colon+n], colon+n)
def decode_unicode(x, f):
    s, f = decode_string(x, f+1)
    return (s.decode('UTF-8'), f)
def decode_list(x, f):
    r, f = [], f+1
    while x[f] != 'e':
        v, f = decode_func[x[f]](x, f)
        r.append(v)
    return (r, f + 1)
def decode_dict(x, f):
    r, f = {}, f+1
    lastkey = None
    while x[f] != 'e':
        k, f = decode_string(x, f)
        #why is this needed
        #if lastkey >= k:
        #    raise ValueError
        lastkey = k
        r[k], f = decode_func[x[f]](x, f)
    return (r, f + 1)
decode_func = {}
decode_func['l'] = decode_list
decode_func['d'] = decode_dict
decode_func['i'] = decode_int
decode_func['0'] = decode_string
decode_func['1'] = decode_string
decode_func['2'] = decode_string
decode_func['3'] = decode_string
decode_func['4'] = decode_string
decode_func['5'] = decode_string
decode_func['6'] = decode_string
decode_func['7'] = decode_string
decode_func['8'] = decode_string
decode_func['9'] = decode_string
#decode_func['u'] = decode_unicode
def bdecode(x, sloppy=1):
    try:
        r, l = decode_func[x[0]](x, 0)
    #except (IndexError, KeyError):
    except (IndexError, KeyError, ValueError):
        raise ValueError, "bad bencoded data"
    if not sloppy and l != len(x):
        raise ValueError, "bad bencoded data"
    return r
def test_bdecode():
    try:
        bdecode('0:0:')
        assert 0
    except ValueError:
        pass
    try:
        bdecode('ie')
        assert 0
    except ValueError:
        pass
    try:
        bdecode('i341foo382e')
        assert 0
    except ValueError:
        pass
    assert bdecode('i4e') == 4L
    assert bdecode('i0e') == 0L
    assert bdecode('i123456789e') == 123456789L
    assert bdecode('i-10e') == -10L
    try:
        bdecode('i-0e')
        assert 0
    except ValueError:
        pass
    try:
        bdecode('i123')
        assert 0
    except ValueError:
        pass
    try:
        bdecode('')
        assert 0
    except ValueError:
        pass
    try:
        bdecode('i6easd')
        assert 0
    except ValueError:
        pass
    try:
        bdecode('35208734823ljdahflajhdf')
        assert 0
    except ValueError:
        pass
    try:
        bdecode('2:abfdjslhfld')
        assert 0
    except ValueError:
        pass
    assert bdecode('0:') == ''
    assert bdecode('3:abc') == 'abc'
    assert bdecode('10:1234567890') == '1234567890'
    try:
        bdecode('02:xy')
        assert 0
    except ValueError:
        pass
    try:
        bdecode('l')
        assert 0
    except ValueError:
        pass
    assert bdecode('le') == []
    try:
        bdecode('leanfdldjfh')
        assert 0
    except ValueError:
        pass
    assert bdecode('l0:0:0:e') == ['', '', '']
    try:
        bdecode('relwjhrlewjh')
        assert 0
    except ValueError:
        pass
    assert bdecode('li1ei2ei3ee') == [1, 2, 3]
    assert bdecode('l3:asd2:xye') == ['asd', 'xy']
    assert bdecode('ll5:Alice3:Bobeli2ei3eee') == [['Alice', 'Bob'], [2, 3]]
    try:
        bdecode('d')
        assert 0
    except ValueError:
        pass
    try:
        bdecode('defoobar')
        assert 0
    except ValueError:
        pass
    assert bdecode('de') == {}
    assert bdecode('d3:agei25e4:eyes4:bluee') == {'age': 25, 'eyes': 'blue'}
    assert bdecode('d8:spam.mp3d6:author5:Alice6:lengthi100000eee') == {'spam.mp3': {'author': 'Alice', 'length': 100000}}
    try:
        bdecode('d3:fooe')
        assert 0
    except ValueError:
        pass
    try:
        bdecode('di1e0:e')
        assert 0
    except ValueError:
        pass
    try:
        bdecode('d1:b0:1:a0:e')
        assert 0
    except ValueError:
        pass
    try:
        bdecode('d1:a0:1:a0:e')
        assert 0
    except ValueError:
        pass
    try:
        bdecode('i03e')
        assert 0
    except ValueError:
        pass
    try:
        bdecode('l01:ae')
        assert 0
    except ValueError:
        pass
    try:
        bdecode('9999:x')
        assert 0
    except ValueError:
        pass
    try:
        bdecode('l0:')
        assert 0
    except ValueError:
        pass
    try:
        bdecode('d0:0:')
        assert 0
    except ValueError:
        pass
    try:
        bdecode('d0:')
        assert 0
    except ValueError:
        pass
bencached_marker = []
class Bencached:
    def __init__(self, s):
        self.marker = bencached_marker
        self.bencoded = s
BencachedType = type(Bencached('')) # insufficient, but good as a filter
def encode_bencached(x, r):
    assert x.marker == bencached_marker
    r.append(x.bencoded)
def encode_int(x, r):
    r.extend(('i', str(x), 'e'))
def encode_bool(x, r):
    encode_int(int(x), r)
def encode_string(x, r):
    r.extend((str(len(x)), ':', x))
def encode_unicode(x, r):
    #r.append('u')
    encode_string(x.encode('UTF-8'), r)
def encode_list(x, r):
    r.append('l')
    for e in x:
        encode_func[type(e)](e, r)
    r.append('e')
def encode_dict(x, r):
    r.append('d')
    ilist = x.items()
    ilist.sort()
    for k, v in ilist:
        r.extend((str(len(k)), ':', k))
        encode_func[type(v)](v, r)
    r.append('e')
encode_func = {}
encode_func[BencachedType] = encode_bencached
encode_func[IntType] = encode_int
encode_func[LongType] = encode_int
encode_func[StringType] = encode_string
encode_func[ListType] = encode_list
encode_func[TupleType] = encode_list
encode_func[DictType] = encode_dict
if BooleanType:
    encode_func[BooleanType] = encode_bool
if UnicodeType:
    encode_func[UnicodeType] = encode_unicode
def bencode(x):
    r = []
    try:
        encode_func[type(x)](x, r)
    except:
        print "*** error *** could not encode type %s (value: %s)" % (type(x), x)
        assert 0
    return ''.join(r)
def test_bencode():
    assert bencode(4) == 'i4e'
    assert bencode(0) == 'i0e'
    assert bencode(-10) == 'i-10e'
    assert bencode(12345678901234567890L) == 'i12345678901234567890e'
    assert bencode('') == '0:'
    assert bencode('abc') == '3:abc'
    assert bencode('1234567890') == '10:1234567890'
    assert bencode([]) == 'le'
    assert bencode([1, 2, 3]) == 'li1ei2ei3ee'
    assert bencode([['Alice', 'Bob'], [2, 3]]) == 'll5:Alice3:Bobeli2ei3eee'
    assert bencode({}) == 'de'
    assert bencode({'age': 25, 'eyes': 'blue'}) == 'd3:agei25e4:eyes4:bluee'
    assert bencode({'spam.mp3': {'author': 'Alice', 'length': 100000}}) == 'd8:spam.mp3d6:author5:Alice6:lengthi100000eee'
    try:
        bencode({1: 'foo'})
        assert 0
    except AssertionError:
        pass
try:
    import psyco
    psyco.bind(bdecode)
    psyco.bind(bencode)
except ImportError:
    pass
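
A round-trip sketch for the encoder and decoder above (the module name bencode matches the from bencode import ... line in the torrent module below; the dict is made up):
from bencode import bencode, bdecode
meta = {'announce': 'http://tracker.example.com/announce', 'info': {'length': 100000}}
data = bencode(meta)          # a bencoded string, 'd8:announce...'
assert bdecode(data) == meta  # strings, ints, lists and dicts survive the round trip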

View file

@@ -1,211 +0,0 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import gzip
import hashlib
import os
import StringIO
import time
import urlparse
import urllib2
import sqlite3
import chardet
import simplejson
import net
from net import DEFAULT_HEADERS, getEncoding
cache_timeout = 30*24*60*60 # default is 30 days
def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
    '''
    >>> status('http://google.com')
    200
    >>> status('http://google.com/mysearch')
    404
    '''
    headers = getHeaders(url, data, headers)
    return int(headers['status'])
def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
    '''
    >>> exists('http://google.com')
    True
    >>> exists('http://google.com/mysearch')
    False
    '''
    s = status(url, data, headers, timeout)
    if s >= 200 and s < 400:
        return True
    return False
def getHeaders(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
    url_headers = _getUrlCache(url, data, headers, timeout, "headers")
    if url_headers:
        url_headers = simplejson.loads(url_headers)
    else:
        url_headers = net.getHeaders(url, data, headers)
        _saveUrlCache(url, data, -1, url_headers)
    return url_headers
class InvalidResult(Exception):
    """Raised when getUrl's valid function rejects the result."""
    def __init__(self, result, headers):
        self.result = result
        self.headers = headers
def getUrl(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None):
    '''
    url     - url to load
    data    - possible post data
    headers - headers to send with request
    timeout - get from cache if cached value is not older than the given seconds,
              -1 to get from cache regardless of age
    valid   - function to check if the result is ok; it is passed result and headers;
              if it fails, InvalidResult is raised - deal with it in your code
    '''
    #FIXME: send last-modified / etag from cache and only update if needed
    if isinstance(url, unicode):
        url = url.encode('utf-8')
    result = _getUrlCache(url, data, headers, timeout)
    if not result:
        #print "get data", url
        try:
            url_headers, result = net.getUrl(url, data, headers, returnHeaders=True)
        except urllib2.HTTPError, e:
            e.headers['Status'] = "%s" % e.code
            url_headers = dict(e.headers)
            result = e.read()
            if url_headers.get('content-encoding', None) == 'gzip':
                result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
        if not valid or valid(result, url_headers):
            _saveUrlCache(url, data, result, url_headers)
        else:
            raise InvalidResult(result, url_headers)
    return result
def getUrlUnicode(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, _getUrl=getUrl, valid=None):
    data = _getUrl(url, data, headers, timeout, valid)
    encoding = getEncoding(data)
    if not encoding:
        encoding = 'latin-1'
    return unicode(data, encoding)
def _getCacheBase():
    'cache base is either ~/.ox/cache or the path set via the env variable oxCACHE'
    return os.environ.get('oxCACHE', os.path.expanduser('~/.ox/cache'))
def _getCacheDB():
    return os.path.join(_getCacheBase(), "cache.sqlite")
def _connectDb():
    conn = sqlite3.connect(_getCacheDB(), timeout=10)
    conn.text_factory = str
    return conn
def _createDb(c):
    # Create table and indexes
    c.execute('''CREATE TABLE IF NOT EXISTS cache (url_hash varchar(42) unique, domain text, url text,
                 post_data text, headers text, created int, data blob, only_headers int)''')
    c.execute('''CREATE INDEX IF NOT EXISTS cache_domain ON cache (domain)''')
    c.execute('''CREATE INDEX IF NOT EXISTS cache_url ON cache (url)''')
    c.execute('''CREATE INDEX IF NOT EXISTS cache_url_hash ON cache (url_hash)''')
def _getUrlCache(url, data, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
    r = None
    if timeout == 0:
        return r
    if data:
        url_hash = hashlib.sha1(url + '?' + data).hexdigest()
    else:
        url_hash = hashlib.sha1(url).hexdigest()
    conn = _connectDb()
    c = conn.cursor()
    _createDb(c)
    sql = 'SELECT %s FROM cache WHERE url_hash=?' % value
    if timeout > 0:
        now = time.mktime(time.localtime())
        t = (url_hash, now-timeout)
        sql += ' AND created > ?'
    else:
        t = (url_hash, )
    if value != "headers":
        sql += ' AND only_headers != 1 '
    c.execute(sql, t)
    for row in c:
        r = row[0]
        if value == 'data':
            r = str(r)
        break
    c.close()
    conn.close()
    return r
def _saveUrlCache(url, post_data, data, headers):
    if post_data:
        url_hash = hashlib.sha1(url + '?' + post_data).hexdigest()
    else:
        url_hash = hashlib.sha1(url).hexdigest()
    domain = ".".join(urlparse.urlparse(url)[1].split('.')[-2:])
    conn = _connectDb()
    c = conn.cursor()
    # Create table if not exists
    _createDb(c)
    # Insert a row of data
    if not post_data: post_data = ""
    only_headers = 0
    if data == -1:
        only_headers = 1
        data = ""
    created = time.mktime(time.localtime())
    t = (url_hash, domain, url, post_data, simplejson.dumps(headers), created, sqlite3.Binary(data), only_headers)
    c.execute(u"""INSERT OR REPLACE INTO cache values (?, ?, ?, ?, ?, ?, ?, ?)""", t)
    # Save (commit) the changes and clean up
    conn.commit()
    c.close()
    conn.close()
def migrate_to_db():
    import re
    import os
    import sqlite3
    import glob
    conn = _connectDb()
    c = conn.cursor()
    _createDb(c)
    files = glob.glob(_getCacheBase() + "/*/*/*/*/*")
    _files = filter(lambda x: not x.endswith(".headers"), files)
    for f in _files:
        info = re.compile("%s/(.*?)/../../../(.*)" % _getCacheBase()).findall(f)
        domain = url = info[0][0]
        url_hash = info[0][1]
        post_data = ""
        created = os.stat(f).st_ctime
        fd = open(f, "r")
        data = fd.read()
        fd.close()
        fd = open(f + ".headers", "r")
        headers = fd.read()
        fd.close()
        t = (url_hash, domain, url, post_data, headers, created, sqlite3.Binary(data), 0)
        c.execute(u"""INSERT OR REPLACE INTO cache values (?, ?, ?, ?, ?, ?, ?, ?)""", t)
    conn.commit()
    c.close()
    conn.close()
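
A usage sketch for the read-through cache above (module name cache as imported in the package __init__; example.com is a placeholder):
import cache
page = cache.getUrlUnicode('http://example.com/')          # fetched once, then served from ~/.ox/cache
fresh = cache.getUrl('http://example.com/', timeout=3600)  # re-fetch if the cached copy is older than an hour
print cache.status('http://example.com/')                  # header lookups are cached too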

View file

@@ -1,65 +0,0 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
from __future__ import division
import os
import hashlib
import sys
import struct
import subprocess
import simplejson
def sha1sum(filename):
    sha1 = hashlib.sha1()
    f = open(filename, 'rb')
    buffer = f.read(4096)
    while buffer:
        sha1.update(buffer)
        buffer = f.read(4096)
    f.close()
    return sha1.hexdigest()
'''
os hash - http://trac.opensubtitles.org/projects/opensubtitles/wiki/HashSourceCodes
plus modification for files < 64k, buffer is filled with file data and padded with 0
'''
def oshash(filename):
    try:
        longlongformat = 'q'  # long long
        bytesize = struct.calcsize(longlongformat)
        f = open(filename, "rb")
        filesize = os.path.getsize(filename)
        hash = filesize
        if filesize < 65536:
            for x in range(int(filesize/bytesize)):
                buffer = f.read(bytesize)
                (l_value,) = struct.unpack(longlongformat, buffer)
                hash += l_value
                hash = hash & 0xFFFFFFFFFFFFFFFF #to remain as 64bit number
        else:
            for x in range(int(65536/bytesize)):
                buffer = f.read(bytesize)
                (l_value,) = struct.unpack(longlongformat, buffer)
                hash += l_value
                hash = hash & 0xFFFFFFFFFFFFFFFF #to remain as 64bit number
            f.seek(max(0, filesize-65536), 0)
            for x in range(int(65536/bytesize)):
                buffer = f.read(bytesize)
                (l_value,) = struct.unpack(longlongformat, buffer)
                hash += l_value
                hash = hash & 0xFFFFFFFFFFFFFFFF
        f.close()
        returnedhash = "%016x" % hash
        return returnedhash
    except IOError:
        return "IOError"
def avinfo(filename):
    p = subprocess.Popen(['ffmpeg2theora', '--info', filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    info, error = p.communicate()
    return simplejson.loads(info)
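
A quick sketch of the three helpers above (the filename is hypothetical; avinfo needs the ffmpeg2theora binary on the path):
print sha1sum('movie.avi')  # hex sha1 of the file contents
print oshash('movie.avi')   # 16 hex digits, the opensubtitles-style hash
info = avinfo('movie.avi')  # dict decoded from the ffmpeg2theora --info JSON output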

View file

@@ -1,270 +0,0 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import math
import re
def to36(q):
    """
    Converts an integer to base 36 (a useful scheme for human-sayable IDs
    like 'fuck' (739172), 'shit' (1329077) or 'hitler' (1059538851)).
    >>> to36(35)
    'z'
    >>> to36(119292)
    '2k1o'
    >>> int(to36(939387374), 36)
    939387374
    >>> to36(0)
    '0'
    >>> to36(-393)
    Traceback (most recent call last):
        ...
    ValueError: must supply a positive integer
    """
    if q < 0: raise ValueError, "must supply a positive integer"
    letters = "0123456789abcdefghijklmnopqrstuvwxyz"
    converted = []
    while q != 0:
        q, r = divmod(q, 36)
        converted.insert(0, letters[r])
    return "".join(converted) or '0'
def from36(q):
    return int(q, 36)
def intValue(strValue, default=u''):
    """
    >>> intValue('abc23')
    u'23'
    >>> intValue(' abc23')
    u'23'
    >>> intValue('ab')
    u''
    """
    try:
        val = re.compile('(\d+)').findall(unicode(strValue).strip())[0]
    except:
        val = default
    return val
def floatValue(strValue, default=u''):
    """
    >>> floatValue('abc23.4')
    u'23.4'
    >>> floatValue(' abc23.4')
    u'23.4'
    >>> floatValue('ab')
    u''
    """
    try:
        val = re.compile('([\d.]+)').findall(unicode(strValue).strip())[0]
    except:
        val = default
    return val
def formatNumber(number, longName, shortName):
    """
    Return the number in a human-readable format (23 KB, 23.4 MB, 23.42 GB)
    >>> formatNumber(123, 'Byte', 'B')
    '123 Bytes'
    >>> formatNumber(1234, 'Byte', 'B')
    '1 KB'
    >>> formatNumber(1234567, 'Byte', 'B')
    '1.2 MB'
    >>> formatNumber(1234567890, 'Byte', 'B')
    '1.15 GB'
    >>> formatNumber(1234567890123456789, 'Byte', 'B')
    '1,096.5166 PB'
    >>> formatNumber(-1234567890123456789, 'Byte', 'B')
    '-1,096.5166 PB'
    """
    if abs(number) < 1024:
        return '%s %s%s' % (formatThousands(number), longName, number != 1 and 's' or '')
    prefix = ['K', 'M', 'G', 'T', 'P']
    for i in range(5):
        if abs(number) < math.pow(1024, i + 2) or i == 4:
            n = number / math.pow(1024, i + 1)
            return '%s %s%s' % (formatThousands('%.*f' % (i, n)), prefix[i], shortName)
def formatThousands(number, separator=','):
    """
    Return the number with separators (1,000,000)
    >>> formatThousands(1)
    '1'
    >>> formatThousands(1000)
    '1,000'
    >>> formatThousands(1000000)
    '1,000,000'
    """
    string = str(number).split('.')
    l = []
    for i, character in enumerate(reversed(string[0])):
        if i and (not (i % 3)):
            l.insert(0, separator)
        l.insert(0, character)
    string[0] = ''.join(l)
    return '.'.join(string)
def formatBits(number):
    return formatNumber(number, 'bit', 'b')
def formatBytes(number):
    return formatNumber(number, 'byte', 'B')
def formatPixels(number):
    return formatNumber(number, 'pixel', 'px')
def formatCurrency(amount, currency="$"):
    if amount:
        temp = "%.2f" % amount
        profile = re.compile(r"(\d)(\d\d\d[.,])")
        while 1:
            temp, count = re.subn(profile, r"\1,\2", temp)
            if not count:
                break
        if temp.startswith('-'):
            return "-" + currency + temp[1:-3]
        return currency + temp[:-3]
    else:
        return ""
def plural(amount, unit, plural='s'):
    '''
    >>> plural(1, 'unit')
    '1 unit'
    >>> plural(2, 'unit')
    '2 units'
    '''
    if abs(amount) != 1:
        if plural == 's':
            unit = unit + plural
        else: unit = plural
    return "%s %s" % (formatThousands(amount), unit)
def formatDuration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
    '''
    verbosity
        0: D:HH:MM:SS
        1: Dd Hh Mm Ss
        2: D days H hours M minutes S seconds
    years
        True:  366 days are 1 year 1 day
        False: 366 days are 366 days
    hours
        True:  30 seconds are 00:00:30
        False: 30 seconds are 00:30
    milliseconds
        True:  always display milliseconds
        False: never display milliseconds
    >>> formatDuration(1000 * 60 * 60 * 24 * 366)
    '1:001:00:00:00.000'
    >>> formatDuration(1000 * 60 * 60 * 24 * 366, years=False)
    '366:00:00:00.000'
    >>> formatDuration(1000 * 60 * 60 * 24 * 365 + 2003, verbosity=2)
    '1 year 2 seconds 3 milliseconds'
    >>> formatDuration(1000 * 30, hours=False, milliseconds=False)
    '00:30'
    '''
    if not ms and ms != 0:
        return ''
    if years:
        y = int(ms / 31536000000)
        d = int(ms % 31536000000 / 86400000)
    else:
        d = int(ms / 86400000)
    h = int(ms % 86400000 / 3600000)
    m = int(ms % 3600000 / 60000)
    s = int(ms % 60000 / 1000)
    ms = ms % 1000
    if verbosity == 0:
        if years and y:
            duration = "%d:%03d:%02d:%02d:%02d" % (y, d, h, m, s)
        elif d:
            duration = "%d:%02d:%02d:%02d" % (d, h, m, s)
        elif hours or h:
            duration = "%02d:%02d:%02d" % (h, m, s)
        else:
            duration = "%02d:%02d" % (m, s)
        if milliseconds:
            duration += ".%03d" % ms
    else:
        if verbosity == 1:
            durations = ["%sd" % d, "%sh" % h, "%sm" % m, "%ss" % s]
            if years:
                durations.insert(0, "%sy" % y)
            if milliseconds:
                durations.append("%sms" % ms)
        else:
            durations = [plural(d, 'day'), plural(h, 'hour'),
                         plural(m, 'minute'), plural(s, 'second')]
            if years:
                durations.insert(0, plural(y, 'year'))
            if milliseconds:
                durations.append(plural(ms, 'millisecond'))
        durations = filter(lambda x: not x.startswith('0'), durations)
        duration = ' '.join(durations)
    return duration
def ms2runtime(ms, shortenLong=False):
    # deprecated - use formatDuration
    '''
    >>> ms2runtime(5000)
    '5 seconds'
    >>> ms2runtime(500000)
    '8 minutes 20 seconds'
    >>> ms2runtime(50000000)
    '13 hours 53 minutes 20 seconds'
    >>> ms2runtime(50000000-20000)
    '13 hours 53 minutes'
    '''
    if shortenLong and ms > 1000 * 60 * 60 * 24 * 464:
        return formatDuration(ms, verbosity=1, milliseconds=False)
    return formatDuration(ms, verbosity=2, milliseconds=False)
def ms2playtime(ms, hours=False):
    # deprecated - use formatDuration
    '''
    >>> ms2playtime(5000)
    '00:05'
    >>> ms2playtime(500000)
    '08:20'
    >>> ms2playtime(50000000)
    '13:53:20'
    '''
    return formatDuration(ms, hours=False, years=False, milliseconds=False)
def ms2time(ms):
    # deprecated - use formatDuration
    '''
    >>> ms2time(44592123)
    '12:23:12.123'
    '''
    return formatDuration(ms, years=False)
def time2ms(timeString):
    '''
    >>> time2ms('12:23:12.123')
    44592123
    '''
    ms = 0.0
    p = timeString.split(':')
    for i in range(len(p)):
        ms = ms * 60 + float(p[i])
    return int(ms * 1000)
def shiftTime(offset, timeString):
    newTime = time2ms(timeString) + offset
    return ms2time(newTime)
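
A few calls mirroring the doctests above:
print formatBytes(1234567)      # '1.2 MB'
print formatDuration(44592123)  # '12:23:12.123'
print to36(119292)              # '2k1o'
print plural(2, 'unit')         # '2 units'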

View file

@@ -1,172 +0,0 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import re
import string
from htmlentitydefs import name2codepoint
# Configuration for urlize() function
LEADING_PUNCTUATION = ['(', '<', '&lt;']
TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;', "'", '"']
# list of possible strings used for bullets in bulleted lists
DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']
unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
word_split_re = re.compile(r'(\s+)')
punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % \
('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]),
'|'.join([re.escape(x) for x in TRAILING_PUNCTUATION])))
simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$')
link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
del x # Temporary variable
def escape(html):
    '''
    Returns the given HTML with ampersands, quotes and carets encoded
    >>> escape('html "test" & <brothers>')
    'html &quot;test&quot; &amp; &lt;brothers&gt;'
    '''
    if not isinstance(html, basestring):
        html = str(html)
    return html.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&#39;')
def linebreaks(value):
    '''
    Converts newlines into <p> and <br />
    '''
    value = re.sub(r'\r\n|\r|\n', '\n', value) # normalize newlines
    paras = re.split('\n{2,}', value)
    paras = ['<p>%s</p>' % p.strip().replace('\n', '<br />') for p in paras]
    return '\n\n'.join(paras)
def stripTags(value):
    """
    Returns the given HTML with all tags stripped
    >>> stripTags('some <h2>title</h2> <script>asdfasdf</script>')
    'some title asdfasdf'
    """
    return re.sub(r'<[^>]*?>', '', value)
def stripSpacesBetweenTags(value):
    "Returns the given HTML with spaces between tags normalized to a single space"
    return re.sub(r'>\s+<', '> <', value)
def stripEntities(value):
    "Returns the given HTML with all entities (&something;) stripped"
    return re.sub(r'&(?:\w+|#\d);', '', value)
def fixAmpersands(value):
    "Returns the given HTML with all unencoded ampersands encoded correctly"
    return unencoded_ampersands_re.sub('&amp;', value)
def urlize(text, trim_url_limit=None, nofollow=False):
    """
    Converts any URLs in text into clickable links. Works on http://, https:// and
    www. links. Links can have trailing punctuation (periods, commas, close-parens)
    and leading punctuation (opening parens) and it'll still do the right thing.
    If trim_url_limit is not None, the URLs in link text will be limited to
    trim_url_limit characters.
    If nofollow is True, the URLs in link text will get a rel="nofollow" attribute.
    """
    trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >= limit and '...' or '')) or x
    words = word_split_re.split(text)
    nofollow_attr = nofollow and ' rel="nofollow"' or ''
    for i, word in enumerate(words):
        match = punctuation_re.match(word)
        if match:
            lead, middle, trail = match.groups()
            if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
                    len(middle) > 0 and middle[0] in string.letters + string.digits and \
                    (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
                middle = '<a href="http://%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
            if middle.startswith('http://') or middle.startswith('https://'):
                middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
            if '@' in middle and not middle.startswith('www.') and not ':' in middle \
                    and simple_email_re.match(middle):
                middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
            if lead + middle + trail != word:
                words[i] = lead + middle + trail
    return ''.join(words)
def cleanHtml(text):
    """
    Cleans the given HTML. Specifically, it does the following:
        * Converts <b> and <i> to <strong> and <em>.
        * Encodes all ampersands correctly.
        * Removes all "target" attributes from <a> tags.
        * Removes extraneous HTML, such as presentational tags that open and
          immediately close and <br clear="all">.
        * Converts hard-coded bullets into HTML unordered lists.
        * Removes stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the
          bottom of the text.
    """
    from text import normalizeNewlines
    text = normalizeNewlines(text)
    text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text)
    text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text)
    text = fixAmpersands(text)
    # Remove all target="" attributes from <a> tags.
    text = link_target_attribute_re.sub('\\1', text)
    # Trim stupid HTML such as <br clear="all">.
    text = html_gunk_re.sub('', text)
    # Convert hard-coded bullets into HTML unordered lists.
    def replace_p_tags(match):
        s = match.group().replace('</p>', '</li>')
        for d in DOTS:
            s = s.replace('<p>%s' % d, '<li>')
        return '<ul>\n%s\n</ul>' % s
    text = hard_coded_bullets_re.sub(replace_p_tags, text)
    # Remove stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the bottom of the text.
    text = trailing_empty_content_re.sub('', text)
    return text
# This pattern matches a character entity reference (a decimal numeric
# reference, a hexadecimal numeric reference, or a named reference).
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
def decodeHtml(html):
    """
    >>> decodeHtml('me &amp; you and &#36;&#38;%')
    u'me & you and $&%'
    """
    if type(html) != unicode:
        html = unicode(html)[:]
    if type(html) is unicode:
        uchr = unichr
    else:
        uchr = lambda value: value > 255 and unichr(value) or chr(value)
    def entitydecode(match, uchr=uchr):
        entity = match.group(1)
        if entity.startswith('#x'):
            return uchr(int(entity[2:], 16))
        elif entity.startswith('#'):
            return uchr(int(entity[1:]))
        elif entity in name2codepoint:
            return uchr(name2codepoint[entity])
        else:
            return match.group(0)
    return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ')
def highlight(text, query, hlClass="hl"):
    """
    >>> highlight('me &amp; you and &#36;&#38;%', 'and')
    'me &amp; you <span class="hl">and</span> &#36;&#38;%'
    """
    if query:
        text = text.replace('<br />', '|')
        query = re.escape(query).replace('\ ', '.')
        m = re.compile("(%s)" % query, re.IGNORECASE).findall(text)
        for i in m:
            text = re.sub("(%s)" % re.escape(i).replace('\ ', '.'), '<span class="%s">\\1</span>' % hlClass, text)
        text = text.replace('|', '<br />')
    return text
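
Mirroring the doctests above, a short sketch:
print escape('html "test" & <brothers>')         # 'html &quot;test&quot; &amp; &lt;brothers&gt;'
print decodeHtml('me &amp; you and &#36;&#38;%')  # u'me & you and $&%'
print urlize('more at www.example.com.', nofollow=True)  # links the www. address, keeps the trailing dot outside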

View file

@@ -1,243 +0,0 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
_iso639_languages = [
("Unknown", "", "", "und"),
("Afar", "", "aa", "aar"),
("Abkhazian", "", "ab", "abk"),
("Afrikaans", "", "af", "afr"),
("Akan", "", "ak", "aka"),
("Albanian", "", "sq", "sqi"),
("Amharic", "", "am", "amh"),
("Arabic", "", "ar", "ara"),
("Aragonese", "", "an", "arg"),
("Armenian", "", "hy", "hye"),
("Assamese", "", "as", "asm"),
("Avaric", "", "av", "ava"),
("Avestan", "", "ae", "ave"),
("Aymara", "", "ay", "aym"),
("Azerbaijani", "", "az", "aze"),
("Bashkir", "", "ba", "bak"),
("Bambara", "", "bm", "bam"),
("Basque", "", "eu", "eus"),
("Belarusian", "", "be", "bel"),
("Bengali", "", "bn", "ben"),
("Bihari", "", "bh", "bih"),
("Bislama", "", "bi", "bis"),
("Bosnian", "", "bs", "bos"),
("Breton", "", "br", "bre"),
("Bulgarian", "", "bg", "bul"),
("Burmese", "", "my", "mya"),
("Catalan", "", "ca", "cat"),
("Chamorro", "", "ch", "cha"),
("Chechen", "", "ce", "che"),
("Chinese", "", "zh", "zho"),
("Church Slavic", "", "cu", "chu"),
("Chuvash", "", "cv", "chv"),
("Cornish", "", "kw", "cor"),
("Corsican", "", "co", "cos"),
("Cree", "", "cr", "cre"),
("Czech", "", "cs", "ces"),
("Danish", "Dansk", "da", "dan"),
("Divehi", "", "dv", "div"),
("Dutch", "Nederlands", "nl", "nld"),
("Dzongkha", "", "dz", "dzo"),
("English", "English", "en", "eng"),
("Esperanto", "", "eo", "epo"),
("Estonian", "", "et", "est"),
("Ewe", "", "ee", "ewe"),
("Faroese", "", "fo", "fao"),
("Fijian", "", "fj", "fij"),
("Finnish", "Suomi", "fi", "fin"),
("French", "Francais", "fr", "fra"),
("Western Frisian", "", "fy", "fry"),
("Fulah", "", "ff", "ful"),
("Georgian", "", "ka", "kat"),
("German", "Deutsch", "de", "deu"),
("Gaelic (Scots)", "", "gd", "gla"),
("Irish", "", "ga", "gle"),
("Galician", "", "gl", "glg"),
("Manx", "", "gv", "glv"),
("Greek, Modern", "", "el", "ell"),
("Guarani", "", "gn", "grn"),
("Gujarati", "", "gu", "guj"),
("Haitian", "", "ht", "hat"),
("Hausa", "", "ha", "hau"),
("Hebrew", "", "he", "heb"),
("Herero", "", "hz", "her"),
("Hindi", "", "hi", "hin"),
("Hiri Motu", "", "ho", "hmo"),
("Hungarian", "Magyar", "hu", "hun"),
("Igbo", "", "ig", "ibo"),
("Icelandic", "Islenska", "is", "isl"),
("Ido", "", "io", "ido"),
("Sichuan Yi", "", "ii", "iii"),
("Inuktitut", "", "iu", "iku"),
("Interlingue", "", "ie", "ile"),
("Interlingua", "", "ia", "ina"),
("Indonesian", "", "id", "ind"),
("Inupiaq", "", "ik", "ipk"),
("Italian", "Italiano", "it", "ita"),
("Javanese", "", "jv", "jav"),
("Japanese", "", "ja", "jpn"),
("Kalaallisut (Greenlandic)", "", "kl", "kal"),
("Kannada", "", "kn", "kan"),
("Kashmiri", "", "ks", "kas"),
("Kanuri", "", "kr", "kau"),
("Kazakh", "", "kk", "kaz"),
("Central Khmer", "", "km", "khm"),
("Kikuyu", "", "ki", "kik"),
("Kinyarwanda", "", "rw", "kin"),
("Kirghiz", "", "ky", "kir"),
("Komi", "", "kv", "kom"),
("Kongo", "", "kg", "kon"),
("Korean", "", "ko", "kor"),
("Kuanyama", "", "kj", "kua"),
("Kurdish", "", "ku", "kur"),
("Lao", "", "lo", "lao"),
("Latin", "", "la", "lat"),
("Latvian", "", "lv", "lav"),
("Limburgan", "", "li", "lim"),
("Lingala", "", "ln", "lin"),
("Lithuanian", "", "lt", "lit"),
("Luxembourgish", "", "lb", "ltz"),
("Luba-Katanga", "", "lu", "lub"),
("Ganda", "", "lg", "lug"),
("Macedonian", "", "mk", "mkd"),
("Marshallese", "", "mh", "mah"),
("Malayalam", "", "ml", "mal"),
("Maori", "", "mi", "mri"),
("Marathi", "", "mr", "mar"),
("Malay", "", "ms", "msa"),
("Malagasy", "", "mg", "mlg"),
("Maltese", "", "mt", "mlt"),
("Moldavian", "", "mo", "mol"),
("Mongolian", "", "mn", "mon"),
("Nauru", "", "na", "nau"),
("Navajo", "", "nv", "nav"),
("Ndebele, South", "", "nr", "nbl"),
("Ndebele, North", "", "nd", "nde"),
("Ndonga", "", "ng", "ndo"),
("Nepali", "", "ne", "nep"),
("Norwegian Nynorsk", "", "nn", "nno"),
("Norwegian Bokmål", "", "nb", "nob"),
("Norwegian", "Norsk", "no", "nor"),
("Chichewa; Nyanja", "", "ny", "nya"),
("Occitan (post 1500); Provençal", "", "oc", "oci"),
("Ojibwa", "", "oj", "oji"),
("Oriya", "", "or", "ori"),
("Oromo", "", "om", "orm"),
("Ossetian; Ossetic", "", "os", "oss"),
("Panjabi", "", "pa", "pan"),
("Persian", "", "fa", "fas"),
("Pali", "", "pi", "pli"),
("Polish", "", "pl", "pol"),
("Portuguese", "Portugues", "pt", "por"),
("Pushto", "", "ps", "pus"),
("Quechua", "", "qu", "que"),
("Romansh", "", "rm", "roh"),
("Romanian", "", "ro", "ron"),
("Rundi", "", "rn", "run"),
("Russian", "", "ru", "rus"),
("Sango", "", "sg", "sag"),
("Sanskrit", "", "sa", "san"),
("Serbian", "", "sr", "srp"),
("Croatian", "Hrvatski", "hr", "hrv"),
("Sinhala", "", "si", "sin"),
("Slovak", "", "sk", "slk"),
("Slovenian", "", "sl", "slv"),
("Northern Sami", "", "se", "sme"),
("Samoan", "", "sm", "smo"),
("Shona", "", "sn", "sna"),
("Sindhi", "", "sd", "snd"),
("Somali", "", "so", "som"),
("Sotho, Southern", "", "st", "sot"),
("Spanish", "Espanol", "es", "spa"),
("Sardinian", "", "sc", "srd"),
("Swati", "", "ss", "ssw"),
("Sundanese", "", "su", "sun"),
("Swahili", "", "sw", "swa"),
("Swedish", "Svenska", "sv", "swe"),
("Tahitian", "", "ty", "tah"),
("Tamil", "", "ta", "tam"),
("Tatar", "", "tt", "tat"),
("Telugu", "", "te", "tel"),
("Tajik", "", "tg", "tgk"),
("Tagalog", "", "tl", "tgl"),
("Thai", "", "th", "tha"),
("Tibetan", "", "bo", "bod"),
("Tigrinya", "", "ti", "tir"),
("Tonga (Tonga Islands)", "", "to", "ton"),
("Tswana", "", "tn", "tsn"),
("Tsonga", "", "ts", "tso"),
("Turkmen", "", "tk", "tuk"),
("Turkish", "", "tr", "tur"),
("Twi", "", "tw", "twi"),
("Uighur", "", "ug", "uig"),
("Ukrainian", "", "uk", "ukr"),
("Urdu", "", "ur", "urd"),
("Uzbek", "", "uz", "uzb"),
("Venda", "", "ve", "ven"),
("Vietnamese", "", "vi", "vie"),
("Volapük", "", "vo", "vol"),
("Welsh", "", "cy", "cym"),
("Walloon", "", "wa", "wln"),
("Wolof", "", "wo", "wol"),
("Xhosa", "", "xh", "xho"),
("Yiddish", "", "yi", "yid"),
("Yoruba", "", "yo", "yor"),
("Zhuang", "", "za", "zha"),
("Zulu", "", "zu", "zul"),
]
def codeToLang(code):
    code = code.lower()
    if len(code) == 2:
        for l in _iso639_languages:
            if l[2] == code:
                return l[0]
    elif len(code) == 3:
        for l in _iso639_languages:
            if l[3] == code:
                return l[0]
    return None
def langTo3Code(lang):
    lang = langEnglishName(lang)
    if lang:
        lang = lang.lower()
        for l in _iso639_languages:
            if l[0].lower() == lang:
                return l[3]
    return None
def langTo2Code(lang):
    lang = langEnglishName(lang)
    if lang:
        lang = lang.lower()
        for l in _iso639_languages:
            if l[0].lower() == lang:
                return l[2]
    return None
def langCode2To3(code):
    return langTo3Code(codeToLang(code))
def langCode3To2(code):
    return langTo2Code(codeToLang(code))
def langEnglishName(lang):
    lang = lang.lower()
    for l in _iso639_languages:
        if l[1].lower() == lang:
            return l[0]
    return None
def languages2Letter():
    languages = []
    for l in _iso639_languages:
        if l[2]:
            languages.append(l[2])
    return languages
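
A sketch of the lookup helpers above (values grounded in the table):
print codeToLang('de')          # 'German'
print codeToLang('deu')         # 'German'
print langTo2Code('Deutsch')    # 'de'  (langTo*Code expect the native name, column two)
print langEnglishName('Suomi')  # 'Finnish'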

View file

@@ -1,89 +0,0 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import os
import gzip
import StringIO
import urllib
import urllib2
from chardet.universaldetector import UniversalDetector
# Default headers for HTTP requests.
DEFAULT_HEADERS = {
'User-Agent': 'Mozilla/5.0 (X11; U; Linux i386; en-US; rv:1.9.1.1) Gecko/20090716 Firefox/3.5',
'Accept-Encoding': 'gzip'
}
def status(url, data=None, headers=DEFAULT_HEADERS):
    try:
        f = openUrl(url, data, headers)
        s = f.code
    except urllib2.HTTPError, e:
        s = e.code
    return s
def exists(url, data=None, headers=DEFAULT_HEADERS):
    s = status(url, data, headers)
    if s >= 200 and s < 400:
        return True
    return False
def getHeaders(url, data=None, headers=DEFAULT_HEADERS):
    try:
        f = openUrl(url, data, headers)
        f.headers['Status'] = "%s" % f.code
        headers = f.headers
        f.close()
    except urllib2.HTTPError, e:
        e.headers['Status'] = "%s" % e.code
        headers = e.headers
    return dict(headers)
def openUrl(url, data=None, headers=DEFAULT_HEADERS):
    url = url.replace(' ', '%20')
    req = urllib2.Request(url, data, headers)
    return urllib2.urlopen(req)
def getUrl(url, data=None, headers=DEFAULT_HEADERS, returnHeaders=False):
    f = openUrl(url, data, headers)
    data = f.read()
    f.close()
    if f.headers.get('content-encoding', None) == 'gzip':
        data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
    if returnHeaders:
        f.headers['Status'] = "%s" % f.code
        return dict(f.headers), data
    return data
def getUrlUnicode(url):
    data = getUrl(url)
    encoding = getEncoding(data)
    if not encoding:
        encoding = 'latin-1'
    return unicode(data, encoding)
def getEncoding(data):
    if 'content="text/html; charset=utf-8"' in data:
        return 'utf-8'
    elif 'content="text/html; charset=iso-8859-1"' in data:
        return 'iso-8859-1'
    detector = UniversalDetector()
    for line in data.split('\n'):
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    return detector.result['encoding']
def saveUrl(url, filename, overwrite=False):
    if not os.path.exists(filename) or overwrite:
        dirname = os.path.dirname(filename)
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        data = getUrl(url)
        f = open(filename, 'w')
        f.write(data)
        f.close()
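
A minimal sketch of the plain (uncached) helpers above; example.com is a placeholder:
import net
if net.exists('http://example.com/'):
    html = net.getUrlUnicode('http://example.com/')  # gzip-decoded, charset-detected
net.saveUrl('http://example.com/favicon.ico', '/tmp/ox-demo/favicon.ico')  # creates directories as needed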

View file

@@ -1,161 +0,0 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import re
_articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el',
"l'", 'il', 'das', 'les', 'o', 'ein', 'i', 'un', 'los', 'de',
'an', 'una', 'las', 'eine', 'den', 'gli', 'het', 'os', 'lo',
'az', 'det', 'ha-', 'een', 'ang', 'oi', 'ta', 'al-', 'dem',
'mga', 'uno', "un'", 'ett', u'\xcf', 'eines', u'\xc7', 'els',
u'\xd4\xef', u'\xcf\xe9')
# Articles in a dictionary.
_articlesDict = dict([(x, x) for x in _articles])
_spArticles = []
for article in _articles:
    if article[-1] not in ("'", '-'): article += ' '
    _spArticles.append(article)
def canonicalTitle(title):
    """Return the title in the canonic format 'Movie Title, The'.
    >>> canonicalTitle('The Movie Title')
    'Movie Title, The'
    """
    try:
        if _articlesDict.has_key(title.split(', ')[-1].lower()): return title
    except IndexError: pass
    ltitle = title.lower()
    for article in _spArticles:
        if ltitle.startswith(article):
            lart = len(article)
            title = '%s, %s' % (title[lart:], title[:lart])
            if article[-1] == ' ': title = title[:-1]
            break
    ## XXX: an attempt using a dictionary lookup.
    ##for artSeparator in (' ', "'", '-'):
    ##    article = _articlesDict.get(ltitle.split(artSeparator)[0])
    ##    if article is not None:
    ##        lart = len(article)
    ##        # check titles like "una", "I'm Mad" and "L'abbacchio".
    ##        if title[lart:] == '' or (artSeparator != ' ' and
    ##                title[lart:][1] != artSeparator): continue
    ##        title = '%s, %s' % (title[lart:], title[:lart])
    ##        if artSeparator == ' ': title = title[1:]
    ##        break
    return title
def normalizeTitle(title):
    """Return the title in the normal "The Title" format.
    >>> normalizeTitle('Movie Title, The')
    'The Movie Title'
    """
    stitle = title.split(', ')
    if len(stitle) > 1 and _articlesDict.has_key(stitle[-1].lower()):
        sep = ' '
        if stitle[-1][-1] in ("'", '-'): sep = ''
        title = '%s%s%s' % (stitle[-1], sep, ', '.join(stitle[:-1]))
    return title
def normalizeImdbId(imdbId):
    """Return 7 digit imdbId.
    >>> normalizeImdbId('http://www.imdb.com/title/tt0159206/')
    '0159206'
    >>> normalizeImdbId(159206)
    '0159206'
    >>> normalizeImdbId('tt0159206')
    '0159206'
    """
    if isinstance(imdbId, basestring):
        imdbId = re.sub('.*(\d{7}).*', '\\1', imdbId)
    elif isinstance(imdbId, int):
        imdbId = "%07d" % imdbId
    return imdbId
# Common suffixes in surnames.
_sname_suffixes = ('de', 'la', 'der', 'den', 'del', 'y', 'da', 'van',
'e', 'von', 'vom', 'the', 'di', 'du', 'el', 'al')
def canonicalName(name):
    """Return the given name in canonical "Surname, Name" format.
    It assumes that name is in the 'Name Surname' format.
    >>> canonicalName('Jean Luc Godard')
    'Godard, Jean Luc'
    >>> canonicalName('Ivan Ivanov-Vano')
    'Ivanov-Vano, Ivan'
    >>> canonicalName('Gus Van Sant')
    'Van Sant, Gus'
    >>> canonicalName('Brian De Palma')
    'De Palma, Brian'
    """
    # XXX: some statistics (over 1852406 names):
    #      - just a surname:                 51921
    #      - single surname, single name:  1792759
    #      - composed surname, composed name: 7726
    #      - composed surname, single name:  55623
    #        (2: 49259, 3: 5502, 4: 551)
    #      - single surname, composed name: 186604
    #        (2: 178315, 3: 6573, 4: 1219, 5: 352)
    # Don't convert names already in the canonical format.
    if name.find(', ') != -1: return name
    sname = name.split(' ')
    snl = len(sname)
    if snl == 2:
        # Just a name and a surname: how boring...
        name = '%s, %s' % (sname[1], sname[0])
    elif snl > 2:
        lsname = [x.lower() for x in sname]
        if snl == 3: _indexes = (0, snl-2)
        else: _indexes = (0, snl-2, snl-3)
        # Check for common surname prefixes at the beginning and near the end.
        for index in _indexes:
            if lsname[index] not in _sname_suffixes: continue
            try:
                # Build the surname.
                surn = '%s %s' % (sname[index], sname[index+1])
                del sname[index]
                del sname[index]
                try:
                    # Handle the "Jr." after the name.
                    if lsname[index+2].startswith('jr'):
                        surn += ' %s' % sname[index]
                        del sname[index]
                except (IndexError, ValueError):
                    pass
                name = '%s, %s' % (surn, ' '.join(sname))
                break
            except ValueError:
                continue
        else:
            name = '%s, %s' % (sname[-1], ' '.join(sname[:-1]))
    return name
def normalizeName(name):
    """Return a name in the normal "Name Surname" format.
    >>> normalizeName('Godard, Jean Luc')
    'Jean Luc Godard'
    >>> normalizeName('Ivanov-Vano, Ivan')
    'Ivan Ivanov-Vano'
    >>> normalizeName('Van Sant, Gus')
    'Gus Van Sant'
    >>> normalizeName('De Palma, Brian')
    'Brian De Palma'
    """
    sname = name.split(', ')
    if len(sname) == 2:
        name = '%s %s' % (sname[1], sname[0])
    return name
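
The doctests above condensed into one sketch:
print canonicalTitle('The Movie Title')   # 'Movie Title, The'
print normalizeTitle('Movie Title, The')  # 'The Movie Title'
print canonicalName('Brian De Palma')     # 'De Palma, Brian'
print normalizeImdbId('tt0159206')        # '0159206'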

View file

@@ -1,270 +0,0 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import math
import re
def findRe(string, regexp):
    result = re.compile(regexp, re.DOTALL).findall(string)
    if result:
        return result[0].strip()
    return ''
def findString(string, string0='', string1=''):
    """Return the string between string0 and string1.
    If string0 or string1 is left out, the beginning or end of the string is used.
    >>> findString('i am not there', string1=' not there')
    'i am'
    >>> findString('i am not there', 'i am ', ' there')
    'not'
    >>> findString('i am not there', 'i am not t')
    'here'
    """
    if string0:
        string0 = re.escape(string0)
    else:
        string0 = '^'
    if string1:
        string1 = re.escape(string1)
    else:
        string1 = '$'
    return findRe(string, string0 + '(.*?)' + string1)
def removeSpecialCharacters(text):
    """
    Removes special characters inserted by Word.
    """
    text = text.replace(u'\u2013', '-')
    text = text.replace(u'\u2026O', "'")
    text = text.replace(u'\u2019', "'")
    # the next three literals were lost in transit; assumed to be the cp1252
    # smart quotes and en dash as read in latin-1
    text = text.replace(u'\x91', "'")
    text = text.replace(u'\x92', "'")
    text = text.replace(u'\x96', "-")
    return text
def wrap(text, width):
    """
    A word-wrap function that preserves existing line breaks and most spaces in
    the text. Expects that existing line breaks are posix newlines (\n).
    See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
    """
    return reduce(lambda line, word, width=width: '%s%s%s' %
                  (line,
                   ' \n'[(len(line[line.rfind('\n')+1:])
                          + len(word.split('\n', 1)[0]
                                ) >= width)],
                   word),
                  text.split(' ')
                  )
def wrapString(string, length=80, separator='\n', balance=False):
    '''
    >>> wrapString(u"Anticonstitutionellement, Paris s'eveille", 16)
    u"Anticonstitution\\nellement, Paris \\ns'eveille"
    >>> wrapString(u'All you can eat', 12, '\\n', True)
    u'All you \\ncan eat'
    '''
    words = string.split(' ')
    if balance:
        # balance lines: test if same number of lines
        # can be achieved with a shorter line length
        lines = wrapString(string, length, separator, False).split(separator)
        if len(lines) > 1:
            while length > max(map(lambda x: len(x), words)):
                length -= 1
                if len(wrapString(string, length, separator, False).split(separator)) > len(lines):
                    length += 1
                    break
    lines = ['']
    for word in words:
        if len(lines[len(lines) - 1] + word + u' ') <= length + 1:
            # word fits in current line
            lines[len(lines) - 1] += word + u' '
        else:
            if len(word) <= length:
                # word fits in next line
                lines.append(word + u' ')
            else:
                # word is longer than line
                position = length - len(lines[len(lines) - 1])
                lines[len(lines) - 1] += word[0:position]
                for i in range(position, len(word), length):
                    lines.append(word[i:i+length])
                lines[len(lines) - 1] += u' '
    return separator.join(lines).strip()
def truncateString(string, length, padding='...', position='right'):
    # >>> truncateString('anticonstitutionellement', 16, '...', 'left')
    # '...utionellement'
    # >>> truncateString('anticonstitutionellement', 16, '...', 'center')
    # 'anticon...lement'
    # >>> truncateString('anticonstitutionellement', 16, '...', 'right')
    # 'anticonstitut...'
    stringLength = len(string)
    paddingLength = len(padding)
    if stringLength > length:
        if position == 'left':
            string = '%s%s' % (padding, string[stringLength + paddingLength - length:])
        elif position == 'center':
            left = int(math.ceil(float(length - paddingLength) / 2))
            right = int(stringLength - math.floor(float(length - paddingLength) / 2))
            string = '%s%s%s' % (string[:left], padding, string[right:])
        elif position == 'right':
            string = '%s%s' % (string[:length - paddingLength], padding)
    return string
def truncateWords(s, num):
    """Truncates a string after a certain number of characters, but ends with a word
    >>> truncateWords('Truncates a string after a certain number of characters, but ends with a word', 23)
    'Truncates a string...'
    >>> truncateWords('Truncates a string', 23)
    'Truncates a string'
    """
    length = int(num)
    if len(s) <= length:
        return s
    words = s.split()
    ts = ""
    while words and len(ts) + len(words[0]) < length:
        ts += " " + words.pop(0)
    if words:
        ts += "..."
    return ts.strip()
def trimString(string, num):
    """Truncates a string after a certain number of characters, adding ... at -10 characters
    >>> trimString('Truncates a string after a certain number of characters', 23)
    'Truncates ...characters'
    >>> trimString('Truncates a string', 23)
    'Truncates a string'
    """
    if len(string) > num:
        string = string[:num - 13] + '...' + string[-10:]
    return string
def truncateWords(s, num):
    # NOTE: this redefinition shadows the character-based truncateWords above.
    "Truncates a string after a certain number of words."
    length = int(num)
    words = s.split()
    if len(words) > length:
        words = words[:length]
        if not words[-1].endswith('...'):
            words.append('...')
    return ' '.join(words)
def getValidFilename(s):
    """
    Returns the given string converted to a string that can be used for a clean
    filename. Specifically, leading and trailing spaces are removed;
    all non-filename-safe characters are removed.
    >>> getValidFilename("john's portrait in 2004.jpg")
    'john_s_portrait_in_2004.jpg'
    """
    s = s.strip()
    s = s.replace(' ', '_')
    s = re.sub(r'[^-A-Za-z0-9_.\[\]\ ]', '_', s)
    s = s.replace('__', '_').replace('__', '_')
    return s
def getTextList(list_, last_word='or'):
    """
    >>> getTextList([u'a', u'b', u'c', u'd'])
    u'a, b, c or d'
    >>> getTextList([u'a', u'b', u'c'], 'and')
    u'a, b and c'
    >>> getTextList([u'a', u'b'], 'and')
    u'a and b'
    >>> getTextList([u'a'])
    u'a'
    >>> getTextList([])
    ''
    """
    if len(list_) == 0: return ''
    if len(list_) == 1: return list_[0]
    return u'%s %s %s' % (u', '.join([unicode(i) for i in list_][:-1]), last_word, list_[-1])
def getListText(text, last_word='or'):
    """
    >>> getListText(u'a, b, c or d')
    [u'a', u'b', u'c', u'd']
    >>> getListText(u'a, b and c', u'and')
    [u'a', u'b', u'c']
    >>> getListText(u'a and b', u'and')
    [u'a', u'b']
    >>> getListText(u'a')
    [u'a']
    >>> getListText(u'')
    []
    """
    list_ = []
    if text:
        list_ = text.split(u', ')
        if list_:
            i = len(list_)-1
            last = list_[i].split(last_word)
            if len(last) == 2:
                list_[i] = last[0].strip()
                list_.append(last[1].strip())
    return list_
def normalizeNewlines(text):
    return re.sub(r'\r\n|\r|\n', '\n', text)
def recapitalize(text):
    "Recapitalizes text, placing caps after end-of-sentence punctuation."
    #capwords = ()
    text = text.lower()
    capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
    text = capsRE.sub(lambda x: x.group(1).upper(), text)
    #for capword in capwords:
    #    capwordRE = re.compile(r'\b%s\b' % capword, re.I)
    #    text = capwordRE.sub(capword, text)
    return text
def phone2numeric(phone):
    "Converts a phone number with letters into its numeric equivalent."
    letters = re.compile(r'[A-PR-Y]', re.I)
    char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3',
        'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
        'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
        's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
        'y': '9', 'x': '9'}.get(m.group(0).lower())
    return letters.sub(char2number, phone)
def compressString(s):
    import cStringIO, gzip
    zbuf = cStringIO.StringIO()
    zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
    zfile.write(s)
    zfile.close()
    return zbuf.getvalue()
smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
def smartSplit(text):
    """
    Generator that splits a string by spaces, leaving quoted phrases together.
    Supports both single and double quotes, and supports escaping quotes with
    backslashes. In the output, strings will keep their initial and trailing
    quote marks.
    >>> list(smartSplit('This is "a person\\'s" test.'))
    ['This', 'is', '"a person\\'s"', 'test.']
    """
    for bit in smart_split_re.finditer(text):
        bit = bit.group(0)
        if bit[0] == '"':
            yield '"' + bit[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"'
        elif bit[0] == "'":
            yield "'" + bit[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'"
        else:
            yield bit
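
A few of the helpers above in one sketch (values taken from the doctests):
print findRe('<b>bold</b>', '<b>(.*?)</b>')           # 'bold'
print wrapString(u'All you can eat', 12, '\n', True)  # u'All you \ncan eat'
print truncateString('anticonstitutionellement', 16)  # 'anticonstitut...'
print getTextList([u'a', u'b', u'c'], 'and')          # u'a, b and c'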

View file

@@ -1,74 +0,0 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2007
from threading import Event
import hashlib
from os import stat
import os
from BitTornado.BT1.makemetafile import make_meta_file
from bencode import bencode, bdecode
def createTorrent(file, url, params = {}, flag = Event(),
                  progress = lambda x: None, progress_percent = 1):
    "Creates a torrent for a given file, using url as tracker url"
    return make_meta_file(file, url, params, flag, progress, progress_percent)
def getInfoHash(torrentFile):
    "Returns Torrent Info Hash from torrent file"
    metainfo_file = open(torrentFile, 'rb')
    metainfo = bdecode(metainfo_file.read())
    info = metainfo['info']
    return hashlib.sha1(bencode(info)).hexdigest().upper()
def getTorrentInfoFromFile(torrentFile):
    f = open(torrentFile, 'rb')
    data = f.read()
    f.close()
    tinfo = getTorrentInfo(data)
    tinfo['timestamp'] = stat(torrentFile).st_ctime
    return tinfo
def getTorrentInfo(data):
    "Returns Torrent Info from torrent file"
    tinfo = {}
    metainfo = bdecode(data)
    info = metainfo['info']
    piece_length = info['piece length']
    if info.has_key('length'):
        # let's assume we just have one file
        file_length = info['length']
    else:
        # let's assume we have a directory structure
        file_length = 0
        for f in info['files']:
            file_length += f['length']
    for key in info:
        if key != 'pieces':
            tinfo[key] = info[key]
    for key in metainfo:
        if key != 'info':
            tinfo[key] = metainfo[key]
    tinfo['size'] = file_length
    tinfo['hash'] = hashlib.sha1(bencode(info)).hexdigest()
    tinfo['announce'] = metainfo['announce']
    return tinfo
def getFiles(data):
    files = []
    info = getTorrentInfo(data)
    if 'files' in info:
        for f in info['files']:
            path = [info['name'], ]
            path.extend(f['path'])
            files.append(os.path.join(*path))
    else:
        files.append(info['name'])
    return files
def getTorrentSize(torrentFile):
    "Returns Size of files in torrent file in bytes"
    # getTorrentInfo() expects raw torrent data, so read the file first
    return getTorrentInfoFromFile(torrentFile)['size']
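
A usage sketch, assuming BitTornado is installed and movie.avi is a hypothetical input; make_meta_file is assumed to write movie.avi.torrent next to the source file (its default, unless params sets a target):
createTorrent('movie.avi', 'http://tracker.example.com/announce')
print getInfoHash('movie.avi.torrent')   # uppercase hex sha1 of the bencoded info dict
print getTorrentSize('movie.avi.torrent')  # total payload size in bytes
print getFiles(open('movie.avi.torrent', 'rb').read())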