rename, use namespaces

This commit is contained in:
j 2010-07-08 00:34:04 +02:00
commit 0d354d2574
15 changed files with 7 additions and 7 deletions

17
ox/__init__.py Normal file

@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
__version__ = '1.0.0'
from file import *
from format import *
from html import *
from iso import *
from text import *
from form import *
import cache
import net
from torrent import *

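With the rename, everything is addressed through the single ox namespace instead of the old flat module layout. A minimal sketch of the intended import style (example.com is a placeholder; the expected values follow from the doctests in the files below):

import ox

ox.formatBytes(1234567)                    # '1.2 MB'  (re-exported from ox.format)
ox.stripTags('a <b>title</b>')             # 'a title' (re-exported from ox.html)
ox.cache.readUrl('http://example.com/')    # cached HTTP access, see ox/cache.py
ox.net.readUrl('http://example.com/')      # direct HTTP access, see ox/net.py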
224
ox/cache.py Normal file

@@ -0,0 +1,224 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import gzip
import hashlib
import os
import StringIO
import time
import urlparse
import urllib2
import sqlite3
import chardet
import simplejson
import net
from net import DEFAULT_HEADERS, getEncoding
cache_timeout = 30*24*60*60 # default is 30 days
def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
'''
>>> status('http://google.com')
200
>>> status('http://google.com/mysearch')
404
'''
    headers = getHeaders(url, data, headers, timeout)
return int(headers['status'])
def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
'''
>>> exists('http://google.com')
True
>>> exists('http://google.com/mysearch')
False
'''
s = status(url, data, headers, timeout)
if s >= 200 and s < 400:
return True
return False
def getHeaders(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
url_headers = _readUrlCache(url, data, headers, timeout, "headers")
if url_headers:
url_headers = simplejson.loads(url_headers)
else:
url_headers = net.getHeaders(url, data, headers)
_saveUrlCache(url, data, -1, url_headers)
return url_headers
class InvalidResult(Exception):
"""Base class for exceptions in this module."""
def __init__(self, result, headers):
self.result = result
self.headers = headers
def readUrl(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None):
'''
url - url to load
data - possible post data
headers - headers to send with request
    timeout - serve from cache if the cached entry is not older than the given
              number of seconds; 0 always refetches, -1 always serves from cache
    valid - function to check if the result is ok, it is passed result and headers;
            if this check fails, InvalidResult is raised - handle it in your code
'''
#FIXME: send last-modified / etag from cache and only update if needed
if isinstance(url, unicode):
url = url.encode('utf-8')
result = _readUrlCache(url, data, headers, timeout)
if not result:
#print "get data", url
try:
url_headers, result = net.readUrl(url, data, headers, returnHeaders=True)
except urllib2.HTTPError, e:
e.headers['Status'] = "%s" % e.code
url_headers = dict(e.headers)
result = e.read()
if url_headers.get('content-encoding', None) == 'gzip':
result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
if not valid or valid(result, url_headers):
_saveUrlCache(url, data, result, url_headers)
else:
raise InvalidResult(result, url_headers)
return result
def readUrlUnicode(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, _readUrl=readUrl, valid=None):
data = _readUrl(url, data, headers, timeout, valid)
encoding = getEncoding(data)
if not encoding:
encoding = 'latin-1'
return unicode(data, encoding)
def saveUrl(url, filename, overwrite=False):
if not os.path.exists(filename) or overwrite:
dirname = os.path.dirname(filename)
if not os.path.exists(dirname):
os.makedirs(dirname)
data = readUrl(url)
f = open(filename, 'w')
f.write(data)
f.close()
def _getCacheBase():
    'cache base is either ~/.ox/cache or can be set via the env variable oxCACHE'
return os.environ.get('oxCACHE', os.path.expanduser('~/.ox/cache'))
def _getCacheDB():
path = _getCacheBase()
if not os.path.exists(path):
os.makedirs(path)
return os.path.join(path, "cache.sqlite")
def _connectDb():
conn = sqlite3.connect(_getCacheDB(), timeout=10)
conn.text_factory = str
return conn
def _createDb(c):
# Create table and indexes
c.execute('''CREATE TABLE IF NOT EXISTS cache (url_hash varchar(42) unique, domain text, url text,
post_data text, headers text, created int, data blob, only_headers int)''')
c.execute('''CREATE INDEX IF NOT EXISTS cache_domain ON cache (domain)''')
c.execute('''CREATE INDEX IF NOT EXISTS cache_url ON cache (url)''')
c.execute('''CREATE INDEX IF NOT EXISTS cache_url_hash ON cache (url_hash)''')
def _readUrlCache(url, data, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
r = None
if timeout == 0:
return r
if data:
url_hash = hashlib.sha1(url + '?' + data).hexdigest()
else:
url_hash = hashlib.sha1(url).hexdigest()
conn = _connectDb()
c = conn.cursor()
_createDb(c)
sql = 'SELECT %s FROM cache WHERE url_hash=?' % value
if timeout > 0:
now = time.mktime(time.localtime())
t = (url_hash, now-timeout)
sql += ' AND created > ?'
else:
t = (url_hash, )
if value != "headers":
sql += ' AND only_headers != 1 '
c.execute(sql, t)
for row in c:
r = row[0]
if value == 'data':
r = str(r)
break
c.close()
conn.close()
return r
def _saveUrlCache(url, post_data, data, headers):
if post_data:
url_hash = hashlib.sha1(url + '?' + post_data).hexdigest()
else:
url_hash = hashlib.sha1(url).hexdigest()
domain = ".".join(urlparse.urlparse(url)[1].split('.')[-2:])
conn = _connectDb()
c = conn.cursor()
# Create table if not exists
_createDb(c)
# Insert a row of data
if not post_data: post_data=""
only_headers = 0
if data == -1:
only_headers = 1
data = ""
created = time.mktime(time.localtime())
t = (url_hash, domain, url, post_data, simplejson.dumps(headers), created, sqlite3.Binary(data), only_headers)
c.execute(u"""INSERT OR REPLACE INTO cache values (?, ?, ?, ?, ?, ?, ?, ?)""", t)
# Save (commit) the changes and clean up
conn.commit()
c.close()
conn.close()
def migrate_to_db():
import re
import os
import sqlite3
import glob
conn = _connectDb()
c = conn.cursor()
_createDb(c)
files = glob.glob(_getCacheBase() + "/*/*/*/*/*")
_files = filter(lambda x: not x.endswith(".headers"), files)
for f in _files:
info = re.compile("%s/(.*?)/../../../(.*)" % _getCacheBase()).findall(f)
domain = url = info[0][0]
url_hash = info[0][1]
post_data = ""
created = os.stat(f).st_ctime
fd = open(f, "r")
data = fd.read()
fd.close()
fd = open(f + ".headers", "r")
headers = fd.read()
fd.close()
t = (url_hash, domain, url, post_data, headers, created, sqlite3.Binary(data), 0)
c.execute(u"""INSERT OR REPLACE INTO cache values (?, ?, ?, ?, ?, ?, ?, ?)""", t)
conn.commit()
c.close()
conn.close()

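A sketch of how the cache layer above is meant to be driven (the URL is a placeholder). Responses are stored in ~/.ox/cache/cache.sqlite, or under $oxCACHE if that is set:

import ox.cache

url = 'http://example.com/'
data = ox.cache.readUrl(url)               # fetched over HTTP, then stored
data = ox.cache.readUrl(url)               # served from sqlite (30-day default)
data = ox.cache.readUrl(url, timeout=0)    # timeout=0 skips the cache and refetches
data = ox.cache.readUrl(url, timeout=-1)   # -1 serves cached data regardless of age
if ox.cache.exists(url):
    print ox.cache.getHeaders(url)['status']   # e.g. '200'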
67
ox/file.py Normal file

@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
from __future__ import division
import os
import hashlib
import sys
import struct
import subprocess
import simplejson
__all__ = ['sha1sum', 'oshash', 'avinfo']
def sha1sum(filename):
sha1 = hashlib.sha1()
    file = open(filename, 'rb')
buffer=file.read(4096)
while buffer:
sha1.update(buffer)
buffer=file.read(4096)
file.close()
return sha1.hexdigest()
'''
os hash - http://trac.opensubtitles.org/projects/opensubtitles/wiki/HashSourceCodes
plus modification for files < 64k, buffer is filled with file data and padded with 0
'''
def oshash(filename):
try:
longlongformat = 'q' # long long
bytesize = struct.calcsize(longlongformat)
f = open(filename, "rb")
filesize = os.path.getsize(filename)
hash = filesize
if filesize < 65536:
for x in range(int(filesize/bytesize)):
buffer = f.read(bytesize)
(l_value,)= struct.unpack(longlongformat, buffer)
hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF #to remain as 64bit number
else:
for x in range(int(65536/bytesize)):
buffer = f.read(bytesize)
(l_value,)= struct.unpack(longlongformat, buffer)
hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF #to remain as 64bit number
f.seek(max(0,filesize-65536),0)
for x in range(int(65536/bytesize)):
buffer = f.read(bytesize)
(l_value,)= struct.unpack(longlongformat, buffer)
hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF
f.close()
returnedhash = "%016x" % hash
return returnedhash
    except IOError:
        return "IOError"
def avinfo(filename):
if os.path.getsize(filename):
p = subprocess.Popen(['ffmpeg2theora', '--info', filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
info, error = p.communicate()
return simplejson.loads(info)
return {'path': filename, 'size': 0}

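A sketch of the three helpers above ('movie.avi' is a placeholder; avinfo shells out to ffmpeg2theora, which has to be installed):

import ox

path = 'movie.avi'
ox.oshash(path)    # 16-digit hex string: file size plus first/last 64k checksum
ox.sha1sum(path)   # sha1 hexdigest of the whole file
ox.avinfo(path)    # dict parsed from the 'ffmpeg2theora --info' JSON output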
73
ox/form.py Normal file

@@ -0,0 +1,73 @@
import itertools
import mimetools
import mimetypes
__all__ = ['MultiPartForm']
class MultiPartForm(object):
"""Accumulate the data to be used when posting a form."""
def __init__(self):
self.form_fields = []
self.files = []
self.boundary = mimetools.choose_boundary()
return
def get_content_type(self):
return 'multipart/form-data; boundary=%s' % self.boundary
def add_field(self, name, value):
"""Add a simple field to the form data."""
self.form_fields.append((name, value))
return
def add_file(self, fieldname, filename, fileHandle, mimetype=None):
"""Add a file to be uploaded."""
if hasattr(fileHandle, 'read'):
body = fileHandle.read()
else:
body = fileHandle
if mimetype is None:
mimetype = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
self.files.append((fieldname, filename, mimetype, body))
return
def __str__(self):
"""Return a string representing the form data, including attached files."""
# Build a list of lists, each containing "lines" of the
# request. Each part is separated by a boundary string.
# Once the list is built, return a string where each
# line is separated by '\r\n'.
parts = []
part_boundary = '--' + self.boundary
# Add the form fields
parts.extend(
[ part_boundary,
'Content-Disposition: form-data; name="%s"' % name,
'',
value,
]
for name, value in self.form_fields
)
# Add the files to upload
parts.extend(
[ part_boundary,
'Content-Disposition: file; name="%s"; filename="%s"' % \
(field_name, filename),
'Content-Type: %s' % content_type,
'',
body,
]
for field_name, filename, content_type, body in self.files
)
# Flatten the list and add closing boundary marker,
# then return CR+LF separated data
flattened = list(itertools.chain(*parts))
flattened.append('--' + self.boundary + '--')
flattened.append('')
return '\r\n'.join(flattened)

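MultiPartForm only builds the request body; posting is left to the caller. A sketch pairing it with urllib2 (the upload URL is a placeholder):

import urllib2
from ox.form import MultiPartForm

form = MultiPartForm()
form.add_field('title', 'test upload')
form.add_file('file', 'test.txt', 'file contents as a string')

body = str(form)
request = urllib2.Request('http://example.com/upload')
request.add_header('Content-Type', form.get_content_type())
request.add_header('Content-Length', str(len(body)))
request.add_data(body)
#response = urllib2.urlopen(request)   # would perform the POST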
350
ox/format.py Normal file

@@ -0,0 +1,350 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import math
import re
def to32(q):
"""
Converts an integer to base 32
We exclude 4 of the 26 letters: I L O U.
http://www.crockford.com/wrmg/base32.html
>>> to32(35)
'13'
>>> to32(119292)
'3MgV'
>>> to32(939387374)
'wZwTgD'
>>> to32(0)
'0'
>>> to32(-393)
Traceback (most recent call last):
...
ValueError: must supply a positive integer
"""
if q < 0: raise ValueError, "must supply a positive integer"
letters = "0123456789ACBEDGFHKJMNQPSRTWVYXZ"
converted = []
upper = True
while q != 0:
q, r = divmod(q, 32)
l = letters[r]
if upper:
upper = False
else:
l = l.lower()
upper = True
converted.insert(0, l)
return "".join(converted) or '0'
def from32(q):
_32map = {
'0': 0,
'1': 1,
'2': 2,
'3': 3,
'4': 4,
'5': 5,
'6': 6,
'7': 7,
'8': 8,
'9': 9,
'A': 10,
'B': 11,
'C': 12,
'D': 13,
'E': 14,
'F': 15,
'G': 16,
'H': 17,
'J': 18,
'K': 19,
'M': 20,
'N': 21,
'P': 22,
'Q': 23,
'R': 24,
'S': 25,
'T': 26,
'V': 27,
'W': 28,
'X': 29,
'Y': 30,
'Z': 31,
'O': 0,
'I': 1,
'L': 1,
}
base32 = '0123456789ABCDEFGHIJKLMNOPQRSTUV'
q = q.replace('-','')
q = ''.join([base32[_32map[i.upper()]] for i in q])
return int(q, 32)
def to36(q):
"""
Converts an integer to base 36 (a useful scheme for human-sayable IDs
like 'fuck' (739172), 'shit' (1329077) or 'hitler' (1059538851)).
>>> to36(35)
'z'
>>> to36(119292)
'2k1o'
>>> int(to36(939387374), 36)
939387374
>>> to36(0)
'0'
>>> to36(-393)
Traceback (most recent call last):
...
ValueError: must supply a positive integer
"""
if q < 0: raise ValueError, "must supply a positive integer"
letters = "0123456789abcdefghijklmnopqrstuvwxyz"
converted = []
while q != 0:
q, r = divmod(q, 36)
converted.insert(0, letters[r])
return "".join(converted) or '0'
def from36(q):
return int(q, 36)
def intValue(strValue, default=u''):
"""
>>> intValue('abc23')
u'23'
>>> intValue(' abc23')
u'23'
>>> intValue('ab')
u''
"""
try:
val = re.compile('(\d+)').findall(unicode(strValue).strip())[0]
except:
val = default
return val
def floatValue(strValue, default=u''):
"""
>>> floatValue('abc23.4')
u'23.4'
>>> floatValue(' abc23.4')
u'23.4'
>>> floatValue('ab')
u''
"""
try:
val = re.compile('([\d.]+)').findall(unicode(strValue).strip())[0]
except:
val = default
return val
def formatNumber(number, longName, shortName):
"""
Return the number in a human-readable format (23 KB, 23.4 MB, 23.42 GB)
>>> formatNumber(123, 'Byte', 'B')
'123 Bytes'
>>> formatNumber(1234, 'Byte', 'B')
'1 KB'
>>> formatNumber(1234567, 'Byte', 'B')
'1.2 MB'
>>> formatNumber(1234567890, 'Byte', 'B')
'1.15 GB'
>>> formatNumber(1234567890123456789, 'Byte', 'B')
'1,096.5166 PB'
>>> formatNumber(-1234567890123456789, 'Byte', 'B')
'-1,096.5166 PB'
"""
if abs(number) < 1024:
return '%s %s%s' % (formatThousands(number), longName, number != 1 and 's' or '')
prefix = ['K', 'M', 'G', 'T', 'P']
for i in range(5):
if abs(number) < math.pow(1024, i + 2) or i == 4:
n = number / math.pow(1024, i + 1)
return '%s %s%s' % (formatThousands('%.*f' % (i, n)), prefix[i], shortName)
def formatThousands(number, separator = ','):
"""
Return the number with separators (1,000,000)
>>> formatThousands(1)
'1'
>>> formatThousands(1000)
'1,000'
>>> formatThousands(1000000)
'1,000,000'
"""
string = str(number).split('.')
l = []
for i, character in enumerate(reversed(string[0])):
if i and (not (i % 3)):
l.insert(0, separator)
l.insert(0, character)
string[0] = ''.join(l)
return '.'.join(string)
def formatBits(number):
return formatNumber(number, 'bit', 'b')
def formatBytes(number):
return formatNumber(number, 'byte', 'B')
def formatPixels(number):
return formatNumber(number, 'pixel', 'px')
def formatCurrency(amount, currency="$"):
if amount:
temp = "%.2f" % amount
profile=re.compile(r"(\d)(\d\d\d[.,])")
while 1:
temp, count = re.subn(profile,r"\1,\2",temp)
if not count:
break
if temp.startswith('-'):
return "-"+ currency + temp[1:-3]
return currency + temp[:-3]
else:
return ""
def plural(amount, unit, plural='s'):
'''
>>> plural(1, 'unit')
'1 unit'
>>> plural(2, 'unit')
'2 units'
'''
if abs(amount) != 1:
if plural == 's':
unit = unit + plural
else: unit = plural
return "%s %s" % (formatThousands(amount), unit)
def formatDuration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
'''
verbosity
0: D:HH:MM:SS
1: Dd Hh Mm Ss
2: D days H hours M minutes S seconds
years
True: 366 days are 1 year 1 day
False: 366 days are 366 days
hours
True: 30 seconds are 00:00:30
False: 30 seconds are 00:30
milliseconds
True: always display milliseconds
False: never display milliseconds
>>> formatDuration(1000 * 60 * 60 * 24 * 366)
'1:001:00:00:00.000'
>>> formatDuration(1000 * 60 * 60 * 24 * 366, years=False)
'366:00:00:00.000'
>>> formatDuration(1000 * 60 * 60 * 24 * 365 + 2003, verbosity=2)
'1 year 2 seconds 3 milliseconds'
>>> formatDuration(1000 * 30, hours=False, milliseconds=False)
'00:30'
'''
if not ms and ms != 0:
return ''
if years:
y = int(ms / 31536000000)
d = int(ms % 31536000000 / 86400000)
else:
d = int(ms / 86400000)
h = int(ms % 86400000 / 3600000)
m = int(ms % 3600000 / 60000)
s = int(ms % 60000 / 1000)
ms = ms % 1000
if verbosity == 0:
if years and y:
duration = "%d:%03d:%02d:%02d:%02d" % (y, d, h, m, s)
elif d:
duration = "%d:%02d:%02d:%02d" % (d, h, m, s)
elif hours or h:
duration = "%02d:%02d:%02d" % (h, m, s)
else:
duration = "%02d:%02d" % (m, s)
if milliseconds:
duration += ".%03d" % ms
else:
if verbosity == 1:
durations = ["%sd" % d, "%sh" % h, "%sm" % m, "%ss" % s]
if years:
durations.insert(0, "%sy" % y)
if milliseconds:
durations.append("%sms" % ms)
else:
durations = [plural(d, 'day'), plural(h,'hour'),
plural(m, 'minute'), plural(s, 'second')]
if years:
durations.insert(0, plural(y, 'year'))
if milliseconds:
durations.append(plural(ms, 'millisecond'))
durations = filter(lambda x: not x.startswith('0'), durations)
duration = ' '.join(durations)
return duration
def ms2runtime(ms, shortenLong=False):
# deprecated - use formatDuration
'''
>>> ms2runtime(5000)
'5 seconds'
>>> ms2runtime(500000)
'8 minutes 20 seconds'
>>> ms2runtime(50000000)
'13 hours 53 minutes 20 seconds'
>>> ms2runtime(50000000-20000)
'13 hours 53 minutes'
'''
if shortenLong and ms > 1000 * 60 * 60 * 24 * 464:
return formatDuration(ms, verbosity=1, milliseconds=False)
return formatDuration(ms, verbosity=2, milliseconds=False)
def ms2playtime(ms, hours=False):
# deprecated - use formatDuration
'''
>>> ms2playtime(5000)
'00:05'
>>> ms2playtime(500000)
'08:20'
>>> ms2playtime(50000000)
'13:53:20'
'''
return formatDuration(ms, hours=False, years=False, milliseconds=False)
def ms2time(ms):
# deprecated - use formatDuration
'''
>>> ms2time(44592123)
'12:23:12.123'
'''
return formatDuration(ms, years=False)
def time2ms(timeString):
'''
>>> time2ms('12:23:12.123')
44592123
'''
ms = 0.0
p = timeString.split(':')
for i in range(len(p)):
_p = p[i]
if _p.endswith('.'): _p =_p[:-1]
ms = ms * 60 + float(_p)
return int(ms * 1000)
def shiftTime(offset, timeString):
newTime = time2ms(timeString) + offset
return ms2time(newTime)

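The formatters above are pure functions; a few calls with their return values, taken from or consistent with the doctests:

import ox

ox.to36(119292)                            # '2k1o'
ox.from36('2k1o')                          # 119292
ox.formatNumber(1234567890, 'Byte', 'B')   # '1.15 GB'
ox.formatDuration(90 * 1000)               # '00:01:30.000'
ox.formatDuration(90 * 1000, verbosity=2, milliseconds=False)   # '1 minute 30 seconds'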
172
ox/html.py Normal file

@@ -0,0 +1,172 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import re
import string
from htmlentitydefs import name2codepoint
# Configuration for urlize() function
LEADING_PUNCTUATION = ['(', '<', '&lt;']
TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;', "'", '"']
# list of possible strings used for bullets in bulleted lists
DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']
unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
word_split_re = re.compile(r'(\s+)')
punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % \
('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]),
'|'.join([re.escape(x) for x in TRAILING_PUNCTUATION])))
simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$')
link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
del x # Temporary variable
def escape(html):
'''
Returns the given HTML with ampersands, quotes and carets encoded
>>> escape('html "test" & <brothers>')
'html &quot;test&quot; &amp; &lt;brothers&gt;'
'''
if not isinstance(html, basestring):
html = str(html)
return html.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&#39;')
def linebreaks(value):
'''
Converts newlines into <p> and <br />
'''
value = re.sub(r'\r\n|\r|\n', '\n', value) # normalize newlines
paras = re.split('\n{2,}', value)
paras = ['<p>%s</p>' % p.strip().replace('\n', '<br />') for p in paras]
return '\n\n'.join(paras)
def stripTags(value):
"""
Returns the given HTML with all tags stripped
>>> stripTags('some <h2>title</h2> <script>asdfasdf</script>')
'some title asdfasdf'
"""
return re.sub(r'<[^>]*?>', '', value)
def stripSpacesBetweenTags(value):
"Returns the given HTML with spaces between tags normalized to a single space"
return re.sub(r'>\s+<', '> <', value)
def stripEntities(value):
"Returns the given HTML with all entities (&something;) stripped"
return re.sub(r'&(?:\w+|#\d);', '', value)
def fixAmpersands(value):
"Returns the given HTML with all unencoded ampersands encoded correctly"
return unencoded_ampersands_re.sub('&amp;', value)
def urlize(text, trim_url_limit=None, nofollow=False):
"""
Converts any URLs in text into clickable links. Works on http://, https:// and
www. links. Links can have trailing punctuation (periods, commas, close-parens)
and leading punctuation (opening parens) and it'll still do the right thing.
If trim_url_limit is not None, the URLs in link text will be limited to
trim_url_limit characters.
If nofollow is True, the URLs in link text will get a rel="nofollow" attribute.
"""
trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >=limit and '...' or '')) or x
words = word_split_re.split(text)
nofollow_attr = nofollow and ' rel="nofollow"' or ''
for i, word in enumerate(words):
match = punctuation_re.match(word)
if match:
lead, middle, trail = match.groups()
if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
len(middle) > 0 and middle[0] in string.letters + string.digits and \
(middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
middle = '<a href="http://%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
if middle.startswith('http://') or middle.startswith('https://'):
middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
if '@' in middle and not middle.startswith('www.') and not ':' in middle \
and simple_email_re.match(middle):
middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
if lead + middle + trail != word:
words[i] = lead + middle + trail
return ''.join(words)
def cleanHtml(text):
"""
Cleans the given HTML. Specifically, it does the following:
* Converts <b> and <i> to <strong> and <em>.
* Encodes all ampersands correctly.
* Removes all "target" attributes from <a> tags.
* Removes extraneous HTML, such as presentational tags that open and
immediately close and <br clear="all">.
* Converts hard-coded bullets into HTML unordered lists.
* Removes stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the
bottom of the text.
"""
from text import normalizeNewlines
text = normalizeNewlines(text)
text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text)
text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text)
text = fixAmpersands(text)
# Remove all target="" attributes from <a> tags.
text = link_target_attribute_re.sub('\\1', text)
# Trim stupid HTML such as <br clear="all">.
text = html_gunk_re.sub('', text)
# Convert hard-coded bullets into HTML unordered lists.
def replace_p_tags(match):
s = match.group().replace('</p>', '</li>')
for d in DOTS:
s = s.replace('<p>%s' % d, '<li>')
return '<ul>\n%s\n</ul>' % s
text = hard_coded_bullets_re.sub(replace_p_tags, text)
# Remove stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the bottom of the text.
text = trailing_empty_content_re.sub('', text)
return text
# This pattern matches a character entity reference (a decimal numeric
# references, a hexadecimal numeric reference, or a named reference).
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
def decodeHtml(html):
"""
>>> decodeHtml('me &amp; you and &#36;&#38;%')
u'me & you and $&%'
"""
if type(html) != unicode:
html = unicode(html)[:]
if type(html) is unicode:
uchr = unichr
else:
uchr = lambda value: value > 255 and unichr(value) or chr(value)
def entitydecode(match, uchr=uchr):
entity = match.group(1)
if entity.startswith('#x'):
return uchr(int(entity[2:], 16))
elif entity.startswith('#'):
return uchr(int(entity[1:]))
elif entity in name2codepoint:
return uchr(name2codepoint[entity])
else:
return match.group(0)
return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ')
def highlight(text, query, hlClass="hl"):
"""
>>> highlight('me &amp; you and &#36;&#38;%', 'and')
'me &amp; you <span class="hl">and</span> &#36;&#38;%'
"""
if query:
text = text.replace('<br />', '|')
query = re.escape(query).replace('\ ', '.')
m = re.compile("(%s)" % query, re.IGNORECASE).findall(text)
for i in m:
text = re.sub("(%s)" % re.escape(i).replace('\ ', '.'), '<span class="%s">\\1</span>' % hlClass, text)
text = text.replace('|', '<br />')
return text

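These helpers mirror Django's text utilities; a few calls consistent with the doctests above:

import ox

ox.escape('html "test" & <brothers>')
# 'html &quot;test&quot; &amp; &lt;brothers&gt;'
ox.decodeHtml('me &amp; you and &#36;&#38;%')
# u'me & you and $&%'
ox.urlize('see www.example.org for details', nofollow=True)
# 'see <a href="http://www.example.org" rel="nofollow">www.example.org</a> for details'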
243
ox/iso.py Normal file

@@ -0,0 +1,243 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
_iso639_languages = [
("Unknown", "", "", "und"),
("Afar", "", "aa", "aar"),
("Abkhazian", "", "ab", "abk"),
("Afrikaans", "", "af", "afr"),
("Akan", "", "ak", "aka"),
("Albanian", "", "sq", "sqi"),
("Amharic", "", "am", "amh"),
("Arabic", "", "ar", "ara"),
("Aragonese", "", "an", "arg"),
("Armenian", "", "hy", "hye"),
("Assamese", "", "as", "asm"),
("Avaric", "", "av", "ava"),
("Avestan", "", "ae", "ave"),
("Aymara", "", "ay", "aym"),
("Azerbaijani", "", "az", "aze"),
("Bashkir", "", "ba", "bak"),
("Bambara", "", "bm", "bam"),
("Basque", "", "eu", "eus"),
("Belarusian", "", "be", "bel"),
("Bengali", "", "bn", "ben"),
("Bihari", "", "bh", "bih"),
("Bislama", "", "bi", "bis"),
("Bosnian", "", "bs", "bos"),
("Breton", "", "br", "bre"),
("Bulgarian", "", "bg", "bul"),
("Burmese", "", "my", "mya"),
("Catalan", "", "ca", "cat"),
("Chamorro", "", "ch", "cha"),
("Chechen", "", "ce", "che"),
("Chinese", "", "zh", "zho"),
("Church Slavic", "", "cu", "chu"),
("Chuvash", "", "cv", "chv"),
("Cornish", "", "kw", "cor"),
("Corsican", "", "co", "cos"),
("Cree", "", "cr", "cre"),
("Czech", "", "cs", "ces"),
("Danish", "Dansk", "da", "dan"),
("Divehi", "", "dv", "div"),
("Dutch", "Nederlands", "nl", "nld"),
("Dzongkha", "", "dz", "dzo"),
("English", "English", "en", "eng"),
("Esperanto", "", "eo", "epo"),
("Estonian", "", "et", "est"),
("Ewe", "", "ee", "ewe"),
("Faroese", "", "fo", "fao"),
("Fijian", "", "fj", "fij"),
("Finnish", "Suomi", "fi", "fin"),
("French", "Francais", "fr", "fra"),
("Western Frisian", "", "fy", "fry"),
("Fulah", "", "ff", "ful"),
("Georgian", "", "ka", "kat"),
("German", "Deutsch", "de", "deu"),
("Gaelic (Scots)", "", "gd", "gla"),
("Irish", "", "ga", "gle"),
("Galician", "", "gl", "glg"),
("Manx", "", "gv", "glv"),
("Greek, Modern", "", "el", "ell"),
("Guarani", "", "gn", "grn"),
("Gujarati", "", "gu", "guj"),
("Haitian", "", "ht", "hat"),
("Hausa", "", "ha", "hau"),
("Hebrew", "", "he", "heb"),
("Herero", "", "hz", "her"),
("Hindi", "", "hi", "hin"),
("Hiri Motu", "", "ho", "hmo"),
("Hungarian", "Magyar", "hu", "hun"),
("Igbo", "", "ig", "ibo"),
("Icelandic", "Islenska", "is", "isl"),
("Ido", "", "io", "ido"),
("Sichuan Yi", "", "ii", "iii"),
("Inuktitut", "", "iu", "iku"),
("Interlingue", "", "ie", "ile"),
("Interlingua", "", "ia", "ina"),
("Indonesian", "", "id", "ind"),
("Inupiaq", "", "ik", "ipk"),
("Italian", "Italiano", "it", "ita"),
("Javanese", "", "jv", "jav"),
("Japanese", "", "ja", "jpn"),
("Kalaallisut (Greenlandic)", "", "kl", "kal"),
("Kannada", "", "kn", "kan"),
("Kashmiri", "", "ks", "kas"),
("Kanuri", "", "kr", "kau"),
("Kazakh", "", "kk", "kaz"),
("Central Khmer", "", "km", "khm"),
("Kikuyu", "", "ki", "kik"),
("Kinyarwanda", "", "rw", "kin"),
("Kirghiz", "", "ky", "kir"),
("Komi", "", "kv", "kom"),
("Kongo", "", "kg", "kon"),
("Korean", "", "ko", "kor"),
("Kuanyama", "", "kj", "kua"),
("Kurdish", "", "ku", "kur"),
("Lao", "", "lo", "lao"),
("Latin", "", "la", "lat"),
("Latvian", "", "lv", "lav"),
("Limburgan", "", "li", "lim"),
("Lingala", "", "ln", "lin"),
("Lithuanian", "", "lt", "lit"),
("Luxembourgish", "", "lb", "ltz"),
("Luba-Katanga", "", "lu", "lub"),
("Ganda", "", "lg", "lug"),
("Macedonian", "", "mk", "mkd"),
("Marshallese", "", "mh", "mah"),
("Malayalam", "", "ml", "mal"),
("Maori", "", "mi", "mri"),
("Marathi", "", "mr", "mar"),
("Malay", "", "ms", "msa"),
("Malagasy", "", "mg", "mlg"),
("Maltese", "", "mt", "mlt"),
("Moldavian", "", "mo", "mol"),
("Mongolian", "", "mn", "mon"),
("Nauru", "", "na", "nau"),
("Navajo", "", "nv", "nav"),
("Ndebele, South", "", "nr", "nbl"),
("Ndebele, North", "", "nd", "nde"),
("Ndonga", "", "ng", "ndo"),
("Nepali", "", "ne", "nep"),
("Norwegian Nynorsk", "", "nn", "nno"),
("Norwegian Bokmål", "", "nb", "nob"),
("Norwegian", "Norsk", "no", "nor"),
("Chichewa; Nyanja", "", "ny", "nya"),
("Occitan (post 1500); Provençal", "", "oc", "oci"),
("Ojibwa", "", "oj", "oji"),
("Oriya", "", "or", "ori"),
("Oromo", "", "om", "orm"),
("Ossetian; Ossetic", "", "os", "oss"),
("Panjabi", "", "pa", "pan"),
("Persian", "", "fa", "fas"),
("Pali", "", "pi", "pli"),
("Polish", "", "pl", "pol"),
("Portuguese", "Portugues", "pt", "por"),
("Pushto", "", "ps", "pus"),
("Quechua", "", "qu", "que"),
("Romansh", "", "rm", "roh"),
("Romanian", "", "ro", "ron"),
("Rundi", "", "rn", "run"),
("Russian", "", "ru", "rus"),
("Sango", "", "sg", "sag"),
("Sanskrit", "", "sa", "san"),
("Serbian", "", "sr", "srp"),
("Croatian", "Hrvatski", "hr", "hrv"),
("Sinhala", "", "si", "sin"),
("Slovak", "", "sk", "slk"),
("Slovenian", "", "sl", "slv"),
("Northern Sami", "", "se", "sme"),
("Samoan", "", "sm", "smo"),
("Shona", "", "sn", "sna"),
("Sindhi", "", "sd", "snd"),
("Somali", "", "so", "som"),
("Sotho, Southern", "", "st", "sot"),
("Spanish", "Espanol", "es", "spa"),
("Sardinian", "", "sc", "srd"),
("Swati", "", "ss", "ssw"),
("Sundanese", "", "su", "sun"),
("Swahili", "", "sw", "swa"),
("Swedish", "Svenska", "sv", "swe"),
("Tahitian", "", "ty", "tah"),
("Tamil", "", "ta", "tam"),
("Tatar", "", "tt", "tat"),
("Telugu", "", "te", "tel"),
("Tajik", "", "tg", "tgk"),
("Tagalog", "", "tl", "tgl"),
("Thai", "", "th", "tha"),
("Tibetan", "", "bo", "bod"),
("Tigrinya", "", "ti", "tir"),
("Tonga (Tonga Islands)", "", "to", "ton"),
("Tswana", "", "tn", "tsn"),
("Tsonga", "", "ts", "tso"),
("Turkmen", "", "tk", "tuk"),
("Turkish", "", "tr", "tur"),
("Twi", "", "tw", "twi"),
("Uighur", "", "ug", "uig"),
("Ukrainian", "", "uk", "ukr"),
("Urdu", "", "ur", "urd"),
("Uzbek", "", "uz", "uzb"),
("Venda", "", "ve", "ven"),
("Vietnamese", "", "vi", "vie"),
("Volapük", "", "vo", "vol"),
("Welsh", "", "cy", "cym"),
("Walloon", "", "wa", "wln"),
("Wolof", "", "wo", "wol"),
("Xhosa", "", "xh", "xho"),
("Yiddish", "", "yi", "yid"),
("Yoruba", "", "yo", "yor"),
("Zhuang", "", "za", "zha"),
("Zulu", "", "zu", "zul"),
]
def codeToLang(code):
code = code.lower()
if len(code) == 2:
for l in _iso639_languages:
if l[2] == code:
return l[0]
elif len(code) == 3:
for l in _iso639_languages:
if l[3] == code:
return l[0]
return None
def langTo3Code(lang):
    # accept the English name as well as the native name ('German' or 'Deutsch')
    lang = langEnglishName(lang) or lang
    lang = lang.lower()
    for l in _iso639_languages:
        if l[0].lower() == lang:
            return l[3]
    return None
def langTo2Code(lang):
    # accept the English name as well as the native name ('German' or 'Deutsch')
    lang = langEnglishName(lang) or lang
    lang = lang.lower()
    for l in _iso639_languages:
        if l[0].lower() == lang:
            return l[2]
    return None
def langCode2To3(code):
    return langTo3Code(codeToLang(code))
def langCode3To2(code):
    return langTo2Code(codeToLang(code))
def langEnglishName(lang):
lang = lang.lower()
for l in _iso639_languages:
if l[1].lower() == lang:
return l[0]
return None
def languages2Letter():
languages = []
for l in _iso639_languages:
if l[2]:
languages.append(l[2])
return languages

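Lookups against the language table above:

import ox

ox.codeToLang('de')          # 'German'
ox.codeToLang('deu')         # 'German'
ox.langTo2Code('Deutsch')    # 'de'  (native name)
ox.langTo3Code('German')     # 'deu' (English name, via the fallback noted above)
len(ox.languages2Letter())   # number of languages that have a 2-letter code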
89
ox/net.py Normal file

@@ -0,0 +1,89 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import os
import gzip
import StringIO
import urllib
import urllib2
from chardet.universaldetector import UniversalDetector
# Default headers for HTTP requests.
DEFAULT_HEADERS = {
'User-Agent': 'Mozilla/5.0 (X11; U; Linux i386; en-US; rv:1.9.1.1) Gecko/20090716 Firefox/3.5',
'Accept-Encoding': 'gzip'
}
def status(url, data=None, headers=DEFAULT_HEADERS):
try:
f = openUrl(url, data, headers)
s = f.code
except urllib2.HTTPError, e:
s = e.code
return s
def exists(url, data=None, headers=DEFAULT_HEADERS):
s = status(url, data, headers)
if s >= 200 and s < 400:
return True
return False
def getHeaders(url, data=None, headers=DEFAULT_HEADERS):
try:
f = openUrl(url, data, headers)
f.headers['Status'] = "%s" % f.code
headers = f.headers
f.close()
except urllib2.HTTPError, e:
e.headers['Status'] = "%s" % e.code
headers = e.headers
return dict(headers)
def openUrl(url, data=None, headers=DEFAULT_HEADERS):
url = url.replace(' ', '%20')
req = urllib2.Request(url, data, headers)
return urllib2.urlopen(req)
def readUrl(url, data=None, headers=DEFAULT_HEADERS, returnHeaders=False):
f = openUrl(url, data, headers)
data = f.read()
f.close()
if f.headers.get('content-encoding', None) == 'gzip':
data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
if returnHeaders:
f.headers['Status'] = "%s" % f.code
return dict(f.headers), data
return data
def readUrlUnicode(url):
data = readUrl(url)
encoding = getEncoding(data)
if not encoding:
encoding = 'latin-1'
return unicode(data, encoding)
def getEncoding(data):
if 'content="text/html; charset=utf-8"' in data:
return 'utf-8'
elif 'content="text/html; charset=iso-8859-1"' in data:
return 'iso-8859-1'
detector = UniversalDetector()
for line in data.split('\n'):
detector.feed(line)
if detector.done:
break
detector.close()
return detector.result['encoding']
def saveUrl(url, filename, overwrite=False):
if not os.path.exists(filename) or overwrite:
dirname = os.path.dirname(filename)
if not os.path.exists(dirname):
os.makedirs(dirname)
data = readUrl(url)
f = open(filename, 'w')
f.write(data)
f.close()

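ox.net is the uncached counterpart of ox.cache, with the same call shapes (the URL is a placeholder):

import ox.net

url = 'http://example.com/'
if ox.net.exists(url):                      # status in the 2xx/3xx range
    headers = ox.net.getHeaders(url)        # dict, includes a 'status' key
    page = ox.net.readUrlUnicode(url)       # decoded via chardet if needed
ox.net.saveUrl(url, '/tmp/example.html')    # fetch and write to disk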
163
ox/normalize.py Normal file

@@ -0,0 +1,163 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import re
_articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el',
"l'", 'il', 'das', 'les', 'o', 'ein', 'i', 'un', 'los', 'de',
'an', 'una', 'las', 'eine', 'den', 'gli', 'het', 'os', 'lo',
'az', 'det', 'ha-', 'een', 'ang', 'oi', 'ta', 'al-', 'dem',
'mga', 'uno', "un'", 'ett', u'\xcf', 'eines', u'\xc7', 'els',
u'\xd4\xef', u'\xcf\xe9')
# Articles in a dictionary.
_articlesDict = dict([(x, x) for x in _articles])
_spArticles = []
for article in _articles:
if article[-1] not in ("'", '-'): article += ' '
_spArticles.append(article)
def canonicalTitle(title):
"""Return the title in the canonic format 'Movie Title, The'.
>>> canonicalTitle('The Movie Title')
'Movie Title, The'
"""
try:
if _articlesDict.has_key(title.split(', ')[-1].lower()): return title
except IndexError: pass
ltitle = title.lower()
for article in _spArticles:
if ltitle.startswith(article):
lart = len(article)
title = '%s, %s' % (title[lart:], title[:lart])
if article[-1] == ' ': title = title[:-1]
break
## XXX: an attempt using a dictionary lookup.
##for artSeparator in (' ', "'", '-'):
## article = _articlesDict.get(ltitle.split(artSeparator)[0])
## if article is not None:
## lart = len(article)
## # check titles like "una", "I'm Mad" and "L'abbacchio".
## if title[lart:] == '' or (artSeparator != ' ' and
## title[lart:][1] != artSeparator): continue
## title = '%s, %s' % (title[lart:], title[:lart])
## if artSeparator == ' ': title = title[1:]
## break
return title
def normalizeTitle(title):
"""Return the title in the normal "The Title" format.
>>> normalizeTitle('Movie Title, The')
'The Movie Title'
"""
stitle = title.split(', ')
if len(stitle) > 1 and _articlesDict.has_key(stitle[-1].lower()):
sep = ' '
if stitle[-1][-1] in ("'", '-'): sep = ''
title = '%s%s%s' % (stitle[-1], sep, ', '.join(stitle[:-1]))
return title
def normalizeImdbId(imdbId):
"""Return 7 digit imdbId.
>>> normalizeImdbId('http://www.imdb.com/title/tt0159206/')
'0159206'
>>> normalizeImdbId(159206)
'0159206'
>>> normalizeImdbId('tt0159206')
'0159206'
"""
if isinstance(imdbId, basestring):
imdbId = re.sub('.*(\d{7}).*', '\\1', imdbId)
elif isinstance(imdbId, int):
imdbId = "%07d" % imdbId
return imdbId
# Common suffixes in surnames.
_sname_suffixes = ('de', 'la', 'der', 'den', 'del', 'y', 'da', 'van',
'e', 'von', 'vom', 'the', 'di', 'du', 'el', 'al')
def canonicalName(name):
"""Return the given name in canonical "Surname, Name" format.
It assumes that name is in the 'Name Surname' format.
>>> canonicalName('Jean Luc Godard')
'Godard, Jean Luc'
>>> canonicalName('Ivan Ivanov-Vano')
'Ivanov-Vano, Ivan'
>>> canonicalName('Gus Van Sant')
'Van Sant, Gus'
>>> canonicalName('Brian De Palma')
'De Palma, Brian'
"""
# XXX: some statistics (over 1852406 names):
# - just a surname: 51921
# - single surname, single name: 1792759
# - composed surname, composed name: 7726
# - composed surname, single name: 55623
# (2: 49259, 3: 5502, 4: 551)
# - single surname, composed name: 186604
# (2: 178315, 3: 6573, 4: 1219, 5: 352)
# Don't convert names already in the canonical format.
if name in ('Unknown Director', ):
return name
if name.find(', ') != -1: return name
sname = name.split(' ')
snl = len(sname)
if snl == 2:
# Just a name and a surname: how boring...
name = '%s, %s' % (sname[1], sname[0])
elif snl > 2:
lsname = [x.lower() for x in sname]
if snl == 3: _indexes = (0, snl-2)
else: _indexes = (0, snl-2, snl-3)
# Check for common surname prefixes at the beginning and near the end.
for index in _indexes:
if lsname[index] not in _sname_suffixes: continue
try:
# Build the surname.
surn = '%s %s' % (sname[index], sname[index+1])
del sname[index]
del sname[index]
try:
# Handle the "Jr." after the name.
if lsname[index+2].startswith('jr'):
surn += ' %s' % sname[index]
del sname[index]
except (IndexError, ValueError):
pass
name = '%s, %s' % (surn, ' '.join(sname))
break
except ValueError:
continue
else:
name = '%s, %s' % (sname[-1], ' '.join(sname[:-1]))
return name
def normalizeName(name):
"""Return a name in the normal "Name Surname" format.
>>> normalizeName('Godard, Jean Luc')
'Jean Luc Godard'
>>> normalizeName('Ivanov-Vano, Ivan')
'Ivan Ivanov-Vano'
>>> normalizeName('Van Sant, Gus')
'Gus Van Sant'
>>> normalizeName('De Palma, Brian')
'Brian De Palma'
"""
sname = name.split(', ')
if len(sname) == 2:
name = '%s %s' % (sname[1], sname[0])
return name

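normalize is not pulled into the top-level namespace by __init__.py above, so it is imported explicitly; values as in the doctests:

from ox import normalize

normalize.canonicalTitle('The Movie Title')    # 'Movie Title, The'
normalize.normalizeTitle('Movie Title, The')   # 'The Movie Title'
normalize.canonicalName('Brian De Palma')      # 'De Palma, Brian'
normalize.normalizeImdbId('tt0159206')         # '0159206'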
270
ox/text.py Normal file

@@ -0,0 +1,270 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import math
import re
def findRe(string, regexp):
result = re.compile(regexp, re.DOTALL).findall(string)
if result:
return result[0].strip()
return ''
def findString(string, string0='', string1 = ''):
"""Return the string between string0 and string1.
    If string0 or string1 is left out, the beginning or end of the string is used.
>>> findString('i am not there', string1=' not there')
'i am'
>>> findString('i am not there', 'i am ', ' there')
'not'
>>> findString('i am not there', 'i am not t')
'here'
"""
if string0:
string0 = re.escape(string0)
else:
string0 = '^'
if string1:
string1 = re.escape(string1)
else:
string1 = '$'
return findRe(string, string0 + '(.*?)' + string1)
def removeSpecialCharacters(text):
"""
Removes special characters inserted by Word.
"""
    text = text.replace(u'\u2013', '-')
    text = text.replace(u'\u2026', '...')
    text = text.replace(u'\u2019', "'")
    # the next three characters are assumed to be the cp1252 smart quotes and
    # en dash inserted by Word; the literal bytes did not survive rendering
    text = text.replace(u'\x91', "'")
    text = text.replace(u'\x92', "'")
    text = text.replace(u'\x96', '-')
return text
def wrap(text, width):
"""
A word-wrap function that preserves existing line breaks and most spaces in
the text. Expects that existing line breaks are posix newlines (\n).
See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
"""
return reduce(lambda line, word, width=width: '%s%s%s' %
(line,
' \n'[(len(line[line.rfind('\n')+1:])
+ len(word.split('\n',1)[0]
) >= width)],
word),
text.split(' ')
)
def wrapString(string, length=80, separator='\n', balance=False):
'''
>>> wrapString(u"Anticonstitutionellement, Paris s'eveille", 16)
u"Anticonstitution\\nellement, Paris \\ns'eveille"
>>> wrapString(u'All you can eat', 12, '\\n', True)
u'All you \\ncan eat'
'''
words = string.split(' ')
if balance:
# balance lines: test if same number of lines
# can be achieved with a shorter line length
lines = wrapString(string, length, separator, False).split(separator)
if len(lines) > 1:
while length > max(map(lambda x : len(x), words)):
length -= 1
if len(wrapString(string, length, separator, False).split(separator)) > len(lines):
length += 1
break
lines = ['']
for word in words:
if len(lines[len(lines) - 1] + word + u' ') <= length + 1:
# word fits in current line
lines[len(lines) - 1] += word + u' ';
else:
if len(word) <= length:
# word fits in next line
lines.append(word + u' ')
else:
# word is longer than line
position = length - len(lines[len(lines) - 1])
lines[len(lines) - 1] += word[0:position]
for i in range(position, len(word), length):
lines.append(word[i:i+length]);
lines[len(lines) - 1] += u' '
return separator.join(lines).strip()
def truncateString(string, length, padding='...', position='right'):
# >>> truncateString('anticonstitutionellement', 16, '...', 'left')
# '...utionellement'
# >>> truncateString('anticonstitutionellement', 16, '...', 'center')
# 'anticon...lement'
# >>> truncateString('anticonstitutionellement', 16, '...', 'right')
# 'anticonstitut...'
stringLength = len(string);
paddingLength = len(padding)
if stringLength > length:
if position == 'left':
string = '%s%s' % (padding, string[stringLength + paddingLength - length:])
elif position == 'center':
left = int(math.ceil(float(length - paddingLength) / 2))
right = int(stringLength - math.floor(float(length - paddingLength) / 2))
string = '%s%s%s' % (string[:left], padding, string[right:])
elif position == 'right':
string = '%s%s' % (string[:length - paddingLength], padding)
return string;
def truncateWords(s, num):
"""Truncates a string after a certain number of chacters, but ends with a word
>>> truncateString('Truncates a string after a certain number of chacters, but ends with a word', 23)
'Truncates a string...'
>>> truncateString('Truncates a string', 23)
'Truncates a string'
"""
length = int(num)
if len(s) <= length:
return s
words = s.split()
ts = ""
while words and len(ts) + len(words[0]) < length:
ts += " " + words.pop(0)
if words:
ts += "..."
return ts.strip()
def trimString(string, num):
"""Truncates a string after a certain number of chacters, adding ... at -10 characters
>>> trimString('Truncates a string after a certain number of chacters', 23)
'Truncates ...f chacters'
>>> trimString('Truncates a string', 23)
'Truncates a string'
"""
if len(string) > num:
string = string[:num - 13] + '...' + string[-10:]
return string
def truncateWords(s, num):
"Truncates a string after a certain number of words."
length = int(num)
words = s.split()
if len(words) > length:
words = words[:length]
if not words[-1].endswith('...'):
words.append('...')
return ' '.join(words)
def getValidFilename(s):
"""
Returns the given string converted to a string that can be used for a clean
filename. Specifically, leading and trailing spaces are removed;
all non-filename-safe characters are removed.
>>> getValidFilename("john's portrait in 2004.jpg")
'john_s_portrait_in_2004.jpg'
"""
s = s.strip()
s = s.replace(' ', '_')
s = re.sub(r'[^-A-Za-z0-9_.\[\]\ ]', '_', s)
s = s.replace('__', '_').replace('__', '_')
return s
def getTextList(list_, last_word='or'):
"""
>>> getTextList([u'a', u'b', u'c', u'd'])
u'a, b, c or d'
>>> getTextList([u'a', u'b', u'c'], 'and')
u'a, b and c'
>>> getTextList([u'a', u'b'], 'and')
u'a and b'
>>> getTextList([u'a'])
u'a'
>>> getTextList([])
''
"""
if len(list_) == 0: return ''
if len(list_) == 1: return list_[0]
return u'%s %s %s' % (u', '.join([unicode(i) for i in list_][:-1]), last_word, list_[-1])
def getListText(text, last_word='or'):
"""
>>> getListText(u'a, b, c or d')
[u'a', u'b', u'c', u'd']
>>> getListText(u'a, b and c', u'and')
[u'a', u'b', u'c']
>>> getListText(u'a and b', u'and')
[u'a', u'b']
>>> getListText(u'a')
[u'a']
>>> getListText(u'')
[]
"""
list_ = []
if text:
list_ = text.split(u', ')
if list_:
i=len(list_)-1
last = list_[i].split(last_word)
if len(last) == 2:
list_[i] = last[0].strip()
list_.append(last[1].strip())
return list_
def normalizeNewlines(text):
return re.sub(r'\r\n|\r|\n', '\n', text)
def recapitalize(text):
"Recapitalizes text, placing caps after end-of-sentence punctuation."
#capwords = ()
text = text.lower()
capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
text = capsRE.sub(lambda x: x.group(1).upper(), text)
#for capword in capwords:
# capwordRE = re.compile(r'\b%s\b' % capword, re.I)
# text = capwordRE.sub(capword, text)
return text
def phone2numeric(phone):
"Converts a phone number with letters into its numeric equivalent."
letters = re.compile(r'[A-PR-Y]', re.I)
char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3',
'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
'y': '9', 'x': '9'}.get(m.group(0).lower())
return letters.sub(char2number, phone)
def compressString(s):
import cStringIO, gzip
zbuf = cStringIO.StringIO()
zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
zfile.write(s)
zfile.close()
return zbuf.getvalue()
smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
def smartSplit(text):
"""
Generator that splits a string by spaces, leaving quoted phrases together.
Supports both single and double quotes, and supports escaping quotes with
backslashes. In the output, strings will keep their initial and trailing
quote marks.
>>> list(smartSplit('This is "a person\\'s" test.'))
['This', 'is', '"a person\\'s"', 'test.']
"""
for bit in smart_split_re.finditer(text):
bit = bit.group(0)
if bit[0] == '"':
yield '"' + bit[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"'
elif bit[0] == "'":
yield "'" + bit[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'"
else:
yield bit

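A few of the string helpers above, with values from the doctests:

import ox

ox.wrapString(u'All you can eat', 12, u'\n', True)   # u'All you \ncan eat'
ox.truncateString('anticonstitutionellement', 16)    # 'anticonstitut...'
ox.getTextList([u'a', u'b', u'c'], 'and')            # u'a, b and c'
list(ox.smartSplit('a "b c" d'))                     # ['a', '"b c"', 'd']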
74
ox/torrent/__init__.py Normal file

@@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2007-2009
from threading import Event
import hashlib
import os
from bencode import bencode, bdecode
__all__ = ['createTorrent', 'getInfoHash', 'getTorrentInfoFromFile', 'getTorrentInfo', 'getFiles', 'getTorrentSize']
def createTorrent(file, url, params = {}, flag = Event(),
progress = lambda x: None, progress_percent = 1):
"Creates a torrent for a given file, using url as tracker url"
from makemetafile import make_meta_file
return make_meta_file(file, url, params, flag, progress, progress_percent)
def getInfoHash(torrentFile):
"Returns Torrent Info Hash from torrent file"
metainfo_file = open(torrentFile, 'rb')
metainfo = bdecode(metainfo_file.read())
info = metainfo['info']
return hashlib.sha1(bencode(info)).hexdigest()
def getTorrentInfoFromFile(torrentFile):
f = open(torrentFile, 'rb')
data = f.read()
f.close()
tinfo = getTorrentInfo(data)
tinfo['timestamp'] = os.stat(torrentFile).st_ctime
return tinfo
def getTorrentInfo(data):
"Returns Torrent Info from torrent file"
tinfo = {}
metainfo = bdecode(data)
info = metainfo['info']
piece_length = info['piece length']
if info.has_key('length'):
# let's assume we just have one file
file_length = info['length']
else:
# let's assume we have a directory structure
file_length = 0;
for f in info['files']:
file_length += f['length']
for key in info:
if key != 'pieces':
tinfo[key] = info[key]
for key in metainfo:
if key != 'info':
tinfo[key] = metainfo[key]
tinfo['size'] = file_length
tinfo['hash'] = hashlib.sha1(bencode(info)).hexdigest()
tinfo['announce'] = metainfo['announce']
return tinfo
def getFiles(data):
files = []
info = getTorrentInfo(data)
if 'files' in info:
for f in info['files']:
path = [info['name'], ]
path.extend(f['path'])
files.append(os.path.join(*path))
else:
files.append(info['name'])
return files
def getTorrentSize(torrentFile):
    "Returns the size of the files in the torrent, in bytes"
    return getTorrentInfoFromFile(torrentFile)['size']

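A sketch of the torrent helpers (file path and tracker URL are placeholders):

import ox.torrent

# writes movie.avi.torrent next to the source file
ox.torrent.createTorrent('movie.avi', 'http://tracker.example.com/announce')

info = ox.torrent.getTorrentInfoFromFile('movie.avi.torrent')
info['size'], info['announce']                # total bytes, tracker URL
ox.torrent.getInfoHash('movie.avi.torrent')   # 40-character hex info hash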
320
ox/torrent/bencode.py Normal file

@@ -0,0 +1,320 @@
# Written by Petru Paler, Uoti Urpala, Ross Cohen and John Hoffman
# see LICENSE.txt for license information
from types import IntType, LongType, StringType, ListType, TupleType, DictType
try:
from types import BooleanType
except ImportError:
BooleanType = None
try:
from types import UnicodeType
except ImportError:
UnicodeType = None
from cStringIO import StringIO
def decode_int(x, f):
f += 1
newf = x.index('e', f)
try:
n = int(x[f:newf])
except:
n = long(x[f:newf])
if x[f] == '-':
if x[f + 1] == '0':
raise ValueError
elif x[f] == '0' and newf != f+1:
raise ValueError
return (n, newf+1)
def decode_string(x, f):
colon = x.index(':', f)
try:
n = int(x[f:colon])
except (OverflowError, ValueError):
n = long(x[f:colon])
if x[f] == '0' and colon != f+1:
raise ValueError
colon += 1
return (x[colon:colon+n], colon+n)
def decode_unicode(x, f):
s, f = decode_string(x, f+1)
return (s.decode('UTF-8'),f)
def decode_list(x, f):
r, f = [], f+1
while x[f] != 'e':
v, f = decode_func[x[f]](x, f)
r.append(v)
return (r, f + 1)
def decode_dict(x, f):
r, f = {}, f+1
lastkey = None
while x[f] != 'e':
k, f = decode_string(x, f)
#why is this needed
#if lastkey >= k:
# raise ValueError
lastkey = k
r[k], f = decode_func[x[f]](x, f)
return (r, f + 1)
decode_func = {}
decode_func['l'] = decode_list
decode_func['d'] = decode_dict
decode_func['i'] = decode_int
decode_func['0'] = decode_string
decode_func['1'] = decode_string
decode_func['2'] = decode_string
decode_func['3'] = decode_string
decode_func['4'] = decode_string
decode_func['5'] = decode_string
decode_func['6'] = decode_string
decode_func['7'] = decode_string
decode_func['8'] = decode_string
decode_func['9'] = decode_string
#decode_func['u'] = decode_unicode
def bdecode(x, sloppy = 1):
try:
r, l = decode_func[x[0]](x, 0)
# except (IndexError, KeyError):
except (IndexError, KeyError, ValueError):
raise ValueError, "bad bencoded data"
if not sloppy and l != len(x):
raise ValueError, "bad bencoded data"
return r
def test_bdecode():
try:
bdecode('0:0:')
assert 0
except ValueError:
pass
try:
bdecode('ie')
assert 0
except ValueError:
pass
try:
bdecode('i341foo382e')
assert 0
except ValueError:
pass
assert bdecode('i4e') == 4L
assert bdecode('i0e') == 0L
assert bdecode('i123456789e') == 123456789L
assert bdecode('i-10e') == -10L
try:
bdecode('i-0e')
assert 0
except ValueError:
pass
try:
bdecode('i123')
assert 0
except ValueError:
pass
try:
bdecode('')
assert 0
except ValueError:
pass
try:
bdecode('i6easd')
assert 0
except ValueError:
pass
try:
bdecode('35208734823ljdahflajhdf')
assert 0
except ValueError:
pass
try:
bdecode('2:abfdjslhfld')
assert 0
except ValueError:
pass
assert bdecode('0:') == ''
assert bdecode('3:abc') == 'abc'
assert bdecode('10:1234567890') == '1234567890'
try:
bdecode('02:xy')
assert 0
except ValueError:
pass
try:
bdecode('l')
assert 0
except ValueError:
pass
assert bdecode('le') == []
try:
bdecode('leanfdldjfh')
assert 0
except ValueError:
pass
assert bdecode('l0:0:0:e') == ['', '', '']
try:
bdecode('relwjhrlewjh')
assert 0
except ValueError:
pass
assert bdecode('li1ei2ei3ee') == [1, 2, 3]
assert bdecode('l3:asd2:xye') == ['asd', 'xy']
assert bdecode('ll5:Alice3:Bobeli2ei3eee') == [['Alice', 'Bob'], [2, 3]]
try:
bdecode('d')
assert 0
except ValueError:
pass
try:
bdecode('defoobar')
assert 0
except ValueError:
pass
assert bdecode('de') == {}
assert bdecode('d3:agei25e4:eyes4:bluee') == {'age': 25, 'eyes': 'blue'}
assert bdecode('d8:spam.mp3d6:author5:Alice6:lengthi100000eee') == {'spam.mp3': {'author': 'Alice', 'length': 100000}}
try:
bdecode('d3:fooe')
assert 0
except ValueError:
pass
try:
bdecode('di1e0:e')
assert 0
except ValueError:
pass
try:
bdecode('d1:b0:1:a0:e')
assert 0
except ValueError:
pass
try:
bdecode('d1:a0:1:a0:e')
assert 0
except ValueError:
pass
try:
bdecode('i03e')
assert 0
except ValueError:
pass
try:
bdecode('l01:ae')
assert 0
except ValueError:
pass
try:
bdecode('9999:x')
assert 0
except ValueError:
pass
try:
bdecode('l0:')
assert 0
except ValueError:
pass
try:
bdecode('d0:0:')
assert 0
except ValueError:
pass
try:
bdecode('d0:')
assert 0
except ValueError:
pass
bencached_marker = []
class Bencached:
def __init__(self, s):
self.marker = bencached_marker
self.bencoded = s
BencachedType = type(Bencached('')) # insufficient, but good as a filter
def encode_bencached(x,r):
assert x.marker == bencached_marker
r.append(x.bencoded)
def encode_int(x,r):
r.extend(('i',str(x),'e'))
def encode_bool(x,r):
encode_int(int(x),r)
def encode_string(x,r):
r.extend((str(len(x)),':',x))
def encode_unicode(x,r):
#r.append('u')
encode_string(x.encode('UTF-8'),r)
def encode_list(x,r):
r.append('l')
for e in x:
encode_func[type(e)](e, r)
r.append('e')
def encode_dict(x,r):
r.append('d')
ilist = x.items()
ilist.sort()
for k,v in ilist:
r.extend((str(len(k)),':',k))
encode_func[type(v)](v, r)
r.append('e')
encode_func = {}
encode_func[BencachedType] = encode_bencached
encode_func[IntType] = encode_int
encode_func[LongType] = encode_int
encode_func[StringType] = encode_string
encode_func[ListType] = encode_list
encode_func[TupleType] = encode_list
encode_func[DictType] = encode_dict
if BooleanType:
encode_func[BooleanType] = encode_bool
if UnicodeType:
encode_func[UnicodeType] = encode_unicode
def bencode(x):
r = []
try:
encode_func[type(x)](x, r)
except:
print "*** error *** could not encode type %s (value: %s)" % (type(x), x)
assert 0
return ''.join(r)
def test_bencode():
assert bencode(4) == 'i4e'
assert bencode(0) == 'i0e'
assert bencode(-10) == 'i-10e'
assert bencode(12345678901234567890L) == 'i12345678901234567890e'
assert bencode('') == '0:'
assert bencode('abc') == '3:abc'
assert bencode('1234567890') == '10:1234567890'
assert bencode([]) == 'le'
assert bencode([1, 2, 3]) == 'li1ei2ei3ee'
assert bencode([['Alice', 'Bob'], [2, 3]]) == 'll5:Alice3:Bobeli2ei3eee'
assert bencode({}) == 'de'
assert bencode({'age': 25, 'eyes': 'blue'}) == 'd3:agei25e4:eyes4:bluee'
assert bencode({'spam.mp3': {'author': 'Alice', 'length': 100000}}) == 'd8:spam.mp3d6:author5:Alice6:lengthi100000eee'
try:
bencode({1: 'foo'})
assert 0
except AssertionError:
pass
try:
import psyco
psyco.bind(bdecode)
psyco.bind(bencode)
except ImportError:
pass

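bencode/bdecode round-trip; dictionary keys are emitted in sorted order, so the encoding is canonical:

from ox.torrent.bencode import bencode, bdecode

data = bencode({'spam': ['a', 'b'], 'size': 12345})
data           # 'd4:sizei12345e4:spaml1:a1:bee'
bdecode(data)  # {'spam': ['a', 'b'], 'size': 12345}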
100
ox/torrent/btformats.py Normal file

@@ -0,0 +1,100 @@
# Written by Bram Cohen
# see LICENSE.txt for license information
from types import StringType, LongType, IntType, ListType, DictType
from re import compile
reg = compile(r'^[^/\\.~][^/\\]*$')
ints = (LongType, IntType)
def check_info(info):
if type(info) != DictType:
raise ValueError, 'bad metainfo - not a dictionary'
pieces = info.get('pieces')
if type(pieces) != StringType or len(pieces) % 20 != 0:
raise ValueError, 'bad metainfo - bad pieces key'
piecelength = info.get('piece length')
if type(piecelength) not in ints or piecelength <= 0:
raise ValueError, 'bad metainfo - illegal piece length'
name = info.get('name')
if type(name) != StringType:
raise ValueError, 'bad metainfo - bad name'
if not reg.match(name):
raise ValueError, 'name %s disallowed for security reasons' % name
if info.has_key('files') == info.has_key('length'):
raise ValueError, 'single/multiple file mix'
if info.has_key('length'):
length = info.get('length')
if type(length) not in ints or length < 0:
raise ValueError, 'bad metainfo - bad length'
else:
files = info.get('files')
if type(files) != ListType:
raise ValueError
for f in files:
if type(f) != DictType:
raise ValueError, 'bad metainfo - bad file value'
length = f.get('length')
if type(length) not in ints or length < 0:
raise ValueError, 'bad metainfo - bad length'
path = f.get('path')
if type(path) != ListType or path == []:
raise ValueError, 'bad metainfo - bad path'
for p in path:
if type(p) != StringType:
raise ValueError, 'bad metainfo - bad path dir'
if not reg.match(p):
raise ValueError, 'path %s disallowed for security reasons' % p
for i in xrange(len(files)):
for j in xrange(i):
if files[i]['path'] == files[j]['path']:
raise ValueError, 'bad metainfo - duplicate path'
def check_message(message):
if type(message) != DictType:
raise ValueError
check_info(message.get('info'))
if type(message.get('announce')) != StringType:
raise ValueError
def check_peers(message):
if type(message) != DictType:
raise ValueError
if message.has_key('failure reason'):
if type(message['failure reason']) != StringType:
raise ValueError
return
peers = message.get('peers')
if type(peers) == ListType:
for p in peers:
if type(p) != DictType:
raise ValueError
if type(p.get('ip')) != StringType:
raise ValueError
port = p.get('port')
            if type(port) not in ints or port <= 0:
raise ValueError
if p.has_key('peer id'):
id = p['peer id']
if type(id) != StringType or len(id) != 20:
raise ValueError
elif type(peers) != StringType or len(peers) % 6 != 0:
raise ValueError
interval = message.get('interval', 1)
if type(interval) not in ints or interval <= 0:
raise ValueError
minint = message.get('min interval', 1)
if type(minint) not in ints or minint <= 0:
raise ValueError
if type(message.get('tracker id', '')) != StringType:
raise ValueError
npeers = message.get('num peers', 0)
if type(npeers) not in ints or npeers < 0:
raise ValueError
dpeers = message.get('done peers', 0)
if type(dpeers) not in ints or dpeers < 0:
raise ValueError
last = message.get('last', 0)
if type(last) not in ints or last < 0:
raise ValueError

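check_message validates a decoded .torrent and raises ValueError on bad input; a sketch (the path is a placeholder):

from ox.torrent.bencode import bdecode
from ox.torrent.btformats import check_message

data = open('movie.avi.torrent', 'rb').read()
check_message(bdecode(data))   # raises ValueError if the metainfo is malformed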
270
ox/torrent/makemetafile.py Normal file

@@ -0,0 +1,270 @@
# Written by Bram Cohen
# multitracker extensions by John Hoffman
# see LICENSE.txt for license information
from os.path import getsize, split, join, abspath, isdir
from os import listdir
from hashlib import sha1 as sha
from copy import copy
from string import strip
from bencode import bencode
from btformats import check_info
from threading import Event
from time import time
from traceback import print_exc
try:
from sys import getfilesystemencoding
ENCODING = getfilesystemencoding()
except:
from sys import getdefaultencoding
ENCODING = getdefaultencoding()
defaults = [
('announce_list', '',
'a list of announce URLs - explained below'),
('httpseeds', '',
'a list of http seed URLs - explained below'),
('piece_size_pow2', 0,
"which power of 2 to set the piece size to (0 = automatic)"),
('comment', '',
"optional human-readable comment to put in .torrent"),
('filesystem_encoding', '',
"optional specification for filesystem encoding " +
"(set automatically in recent Python versions)"),
('target', '',
"optional target file for the torrent")
]
default_piece_len_exp = 18
ignore = ['core', 'CVS']
def print_announcelist_details():
print (' announce_list = optional list of redundant/backup tracker URLs, in the format:')
print (' url[,url...][|url[,url...]...]')
print (' where URLs separated by commas are all tried first')
print (' before the next group of URLs separated by the pipe is checked.')
print (" If none is given, it is assumed you don't want one in the metafile.")
print (' If announce_list is given, clients which support it')
print (' will ignore the <announce> value.')
print (' Examples:')
print (' http://tracker1.com|http://tracker2.com|http://tracker3.com')
print (' (tries trackers 1-3 in order)')
print (' http://tracker1.com,http://tracker2.com,http://tracker3.com')
print (' (tries trackers 1-3 in a randomly selected order)')
print (' http://tracker1.com|http://backup1.com,http://backup2.com')
print (' (tries tracker 1 first, then tries between the 2 backups randomly)')
print ('')
print (' httpseeds = optional list of http-seed URLs, in the format:')
print (' url[|url...]')
def make_meta_file(file, url, params = {}, flag = Event(),
progress = lambda x: None, progress_percent = 1):
if params.has_key('piece_size_pow2'):
piece_len_exp = params['piece_size_pow2']
else:
piece_len_exp = default_piece_len_exp
if params.has_key('target') and params['target'] != '':
f = params['target']
else:
a, b = split(file)
if b == '':
f = a + '.torrent'
else:
f = join(a, b + '.torrent')
if piece_len_exp == 0: # automatic
size = calcsize(file)
if size > 8L*1024*1024*1024: # > 8 gig =
piece_len_exp = 21 # 2 meg pieces
elif size > 2*1024*1024*1024: # > 2 gig =
piece_len_exp = 20 # 1 meg pieces
elif size > 512*1024*1024: # > 512M =
piece_len_exp = 19 # 512K pieces
elif size > 64*1024*1024: # > 64M =
piece_len_exp = 18 # 256K pieces
elif size > 16*1024*1024: # > 16M =
piece_len_exp = 17 # 128K pieces
elif size > 4*1024*1024: # > 4M =
piece_len_exp = 16 # 64K pieces
else: # < 4M =
piece_len_exp = 15 # 32K pieces
piece_length = 2 ** piece_len_exp
encoding = None
if params.has_key('filesystem_encoding'):
encoding = params['filesystem_encoding']
if not encoding:
encoding = ENCODING
if not encoding:
encoding = 'ascii'
info = makeinfo(file, piece_length, encoding, flag, progress, progress_percent)
if flag.isSet():
return
check_info(info)
h = open(f, 'wb')
data = {'info': info, 'announce': strip(url), 'creation date': long(time())}
if params.has_key('comment') and params['comment']:
data['comment'] = params['comment']
if params.has_key('real_announce_list'): # shortcut for progs calling in from outside
data['announce-list'] = params['real_announce_list']
elif params.has_key('announce_list') and params['announce_list']:
l = []
for tier in params['announce_list'].split('|'):
l.append(tier.split(','))
data['announce-list'] = l
if params.has_key('real_httpseeds'): # shortcut for progs calling in from outside
data['httpseeds'] = params['real_httpseeds']
elif params.has_key('httpseeds') and params['httpseeds']:
data['httpseeds'] = params['httpseeds'].split('|')
if params.has_key('url-list') and params['url-list']:
data['url-list'] = params['url-list'].split('|')
if params.has_key('playtime') and params['playtime']:
data['info']['playtime'] = params['playtime']
h.write(bencode(data))
h.close()
def calcsize(file):
if not isdir(file):
return getsize(file)
total = 0L
for s in subfiles(abspath(file)):
total += getsize(s[1])
return total
def uniconvertl(l, e):
r = []
try:
for s in l:
r.append(uniconvert(s, e))
except UnicodeError:
raise UnicodeError('bad filename: '+join(*l))
return r
def uniconvert(s, e):
try:
if s.__class__.__name__ != 'unicode':
s = unicode(s,e)
except UnicodeError:
raise UnicodeError('bad filename: '+s)
return s.encode('utf-8')
def makeinfo(file, piece_length, encoding, flag, progress, progress_percent=1):
file = abspath(file)
if isdir(file):
subs = subfiles(file)
subs.sort()
pieces = []
sh = sha()
done = 0L
fs = []
totalsize = 0.0
totalhashed = 0L
for p, f in subs:
totalsize += getsize(f)
for p, f in subs:
pos = 0L
size = getsize(f)
fs.append({'length': size, 'path': uniconvertl(p, encoding)})
h = open(f, 'rb')
while pos < size:
a = min(size - pos, piece_length - done)
sh.update(h.read(a))
if flag.isSet():
return
done += a
pos += a
totalhashed += a
if done == piece_length:
pieces.append(sh.digest())
done = 0
sh = sha()
if progress_percent:
progress(totalhashed / totalsize)
else:
progress(a)
h.close()
if done > 0:
pieces.append(sh.digest())
return {'pieces': ''.join(pieces),
'piece length': piece_length, 'files': fs,
'name': uniconvert(split(file)[1], encoding) }
else:
size = getsize(file)
pieces = []
p = 0L
h = open(file, 'rb')
while p < size:
x = h.read(min(piece_length, size - p))
if flag.isSet():
return
pieces.append(sha(x).digest())
p += piece_length
if p > size:
p = size
if progress_percent:
progress(float(p) / size)
else:
progress(min(piece_length, size - p))
h.close()
return {'pieces': ''.join(pieces),
'piece length': piece_length, 'length': size,
'name': uniconvert(split(file)[1], encoding) }
def subfiles(d):
r = []
stack = [([], d)]
while len(stack) > 0:
p, n = stack.pop()
if isdir(n):
for s in listdir(n):
if s not in ignore and s[:1] != '.':
stack.append((copy(p) + [s], join(n, s)))
else:
r.append((p, n))
return r
def completedir(dir, url, params = {}, flag = Event(),
vc = lambda x: None, fc = lambda x: None):
files = listdir(dir)
files.sort()
ext = '.torrent'
if params.has_key('target'):
target = params['target']
else:
target = ''
togen = []
for f in files:
if f[-len(ext):] != ext and (f + ext) not in files:
togen.append(join(dir, f))
total = 0
for i in togen:
total += calcsize(i)
subtotal = [0]
def callback(x, subtotal = subtotal, total = total, vc = vc):
subtotal[0] += x
vc(float(subtotal[0]) / total)
for i in togen:
fc(i)
try:
t = split(i)[-1]
if t not in ignore and t[0] != '.':
if target != '':
params['target'] = join(target,t+ext)
make_meta_file(i, url, params, flag, progress = callback, progress_percent = 0)
except ValueError:
print_exc()
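
make_meta_file picks a piece size from the table above when piece_size_pow2 is 0; completedir writes one .torrent per file in a directory. A sketch (paths and tracker URL are placeholders):

from ox.torrent.makemetafile import make_meta_file, completedir

# single file; piece size chosen automatically from the file size
make_meta_file('movie.avi', 'http://tracker.example.com/announce',
               {'comment': 'test', 'piece_size_pow2': 0})

# one .torrent per file, with progress callbacks
completedir('/media/incoming', 'http://tracker.example.com/announce',
            vc=lambda fraction: None,    # overall progress, 0.0 to 1.0
            fc=lambda filename: None)    # called before each file is hashed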