rename, use namespaces

This commit is contained in:
j 2010-07-08 00:34:04 +02:00
commit 0d354d2574
15 changed files with 7 additions and 7 deletions

17
ox/__init__.py Normal file

@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
__version__ = '1.0.0'
from file import *
from format import *
from html import *
from iso import *
from text import *
from form import *
import cache
import net
from torrent import *

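With the rename, everything is addressed through the single ox namespace instead of the old flat module layout. A minimal sketch of the intended import style (example.com is a placeholder; the expected values follow from the doctests in the files below):

import ox

ox.formatBytes(1234567)                    # '1.2 MB'  (re-exported from ox.format)
ox.stripTags('a <b>title</b>')             # 'a title' (re-exported from ox.html)
ox.cache.readUrl('http://example.com/')    # cached HTTP access, see ox/cache.py
ox.net.readUrl('http://example.com/')      # direct HTTP access, see ox/net.py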
224
ox/cache.py Normal file

@@ -0,0 +1,224 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import gzip
import hashlib
import os
import StringIO
import time
import urlparse
import urllib2
import sqlite3
import chardet
import simplejson
import net
from net import DEFAULT_HEADERS, getEncoding
cache_timeout = 30*24*60*60 # default is 30 days
def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
'''
>>> status('http://google.com')
200
>>> status('http://google.com/mysearch')
404
'''
    headers = getHeaders(url, data, headers, timeout)
return int(headers['status'])
def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
'''
>>> exists('http://google.com')
True
>>> exists('http://google.com/mysearch')
False
'''
s = status(url, data, headers, timeout)
if s >= 200 and s < 400:
return True
return False
def getHeaders(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
url_headers = _readUrlCache(url, data, headers, timeout, "headers")
if url_headers:
url_headers = simplejson.loads(url_headers)
else:
url_headers = net.getHeaders(url, data, headers)
_saveUrlCache(url, data, -1, url_headers)
return url_headers
class InvalidResult(Exception):
"""Base class for exceptions in this module."""
def __init__(self, result, headers):
self.result = result
self.headers = headers
def readUrl(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None):
'''
url - url to load
data - possible post data
headers - headers to send with request
    timeout - serve from cache if the cached entry is not older than the given
              number of seconds; 0 always refetches, -1 always serves from cache
    valid - function to check if the result is ok, it is passed result and headers;
            if this check fails, InvalidResult is raised - handle it in your code
'''
#FIXME: send last-modified / etag from cache and only update if needed
if isinstance(url, unicode):
url = url.encode('utf-8')
result = _readUrlCache(url, data, headers, timeout)
if not result:
#print "get data", url
try:
url_headers, result = net.readUrl(url, data, headers, returnHeaders=True)
except urllib2.HTTPError, e:
e.headers['Status'] = "%s" % e.code
url_headers = dict(e.headers)
result = e.read()
if url_headers.get('content-encoding', None) == 'gzip':
result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
if not valid or valid(result, url_headers):
_saveUrlCache(url, data, result, url_headers)
else:
raise InvalidResult(result, url_headers)
return result
def readUrlUnicode(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, _readUrl=readUrl, valid=None):
data = _readUrl(url, data, headers, timeout, valid)
encoding = getEncoding(data)
if not encoding:
encoding = 'latin-1'
return unicode(data, encoding)
def saveUrl(url, filename, overwrite=False):
if not os.path.exists(filename) or overwrite:
dirname = os.path.dirname(filename)
if not os.path.exists(dirname):
os.makedirs(dirname)
data = readUrl(url)
f = open(filename, 'w')
f.write(data)
f.close()
def _getCacheBase():
    'cache base is either ~/.ox/cache or can be set via the env variable oxCACHE'
return os.environ.get('oxCACHE', os.path.expanduser('~/.ox/cache'))
def _getCacheDB():
path = _getCacheBase()
if not os.path.exists(path):
os.makedirs(path)
return os.path.join(path, "cache.sqlite")
def _connectDb():
conn = sqlite3.connect(_getCacheDB(), timeout=10)
conn.text_factory = str
return conn
def _createDb(c):
# Create table and indexes
c.execute('''CREATE TABLE IF NOT EXISTS cache (url_hash varchar(42) unique, domain text, url text,
post_data text, headers text, created int, data blob, only_headers int)''')
c.execute('''CREATE INDEX IF NOT EXISTS cache_domain ON cache (domain)''')
c.execute('''CREATE INDEX IF NOT EXISTS cache_url ON cache (url)''')
c.execute('''CREATE INDEX IF NOT EXISTS cache_url_hash ON cache (url_hash)''')
def _readUrlCache(url, data, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
r = None
if timeout == 0:
return r
if data:
url_hash = hashlib.sha1(url + '?' + data).hexdigest()
else:
url_hash = hashlib.sha1(url).hexdigest()
conn = _connectDb()
c = conn.cursor()
_createDb(c)
sql = 'SELECT %s FROM cache WHERE url_hash=?' % value
if timeout > 0:
now = time.mktime(time.localtime())
t = (url_hash, now-timeout)
sql += ' AND created > ?'
else:
t = (url_hash, )
if value != "headers":
sql += ' AND only_headers != 1 '
c.execute(sql, t)
for row in c:
r = row[0]
if value == 'data':
r = str(r)
break
c.close()
conn.close()
return r
def _saveUrlCache(url, post_data, data, headers):
if post_data:
url_hash = hashlib.sha1(url + '?' + post_data).hexdigest()
else:
url_hash = hashlib.sha1(url).hexdigest()
domain = ".".join(urlparse.urlparse(url)[1].split('.')[-2:])
conn = _connectDb()
c = conn.cursor()
# Create table if not exists
_createDb(c)
# Insert a row of data
if not post_data: post_data=""
only_headers = 0
if data == -1:
only_headers = 1
data = ""
created = time.mktime(time.localtime())
t = (url_hash, domain, url, post_data, simplejson.dumps(headers), created, sqlite3.Binary(data), only_headers)
c.execute(u"""INSERT OR REPLACE INTO cache values (?, ?, ?, ?, ?, ?, ?, ?)""", t)
# Save (commit) the changes and clean up
conn.commit()
c.close()
conn.close()
def migrate_to_db():
import re
import os
import sqlite3
import glob
conn = _connectDb()
c = conn.cursor()
_createDb(c)
files = glob.glob(_getCacheBase() + "/*/*/*/*/*")
_files = filter(lambda x: not x.endswith(".headers"), files)
for f in _files:
info = re.compile("%s/(.*?)/../../../(.*)" % _getCacheBase()).findall(f)
domain = url = info[0][0]
url_hash = info[0][1]
post_data = ""
created = os.stat(f).st_ctime
fd = open(f, "r")
data = fd.read()
fd.close()
fd = open(f + ".headers", "r")
headers = fd.read()
fd.close()
t = (url_hash, domain, url, post_data, headers, created, sqlite3.Binary(data), 0)
c.execute(u"""INSERT OR REPLACE INTO cache values (?, ?, ?, ?, ?, ?, ?, ?)""", t)
conn.commit()
c.close()
conn.close()

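A sketch of how the cache layer above is meant to be driven (the URL is a placeholder). Responses are stored in ~/.ox/cache/cache.sqlite, or under $oxCACHE if that is set:

import ox.cache

url = 'http://example.com/'
data = ox.cache.readUrl(url)               # fetched over HTTP, then stored
data = ox.cache.readUrl(url)               # served from sqlite (30-day default)
data = ox.cache.readUrl(url, timeout=0)    # timeout=0 skips the cache and refetches
data = ox.cache.readUrl(url, timeout=-1)   # -1 serves cached data regardless of age
if ox.cache.exists(url):
    print ox.cache.getHeaders(url)['status']   # e.g. '200'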
67
ox/file.py Normal file

@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
from __future__ import division
import os
import hashlib
import sys
import struct
import subprocess
import simplejson
__all__ = ['sha1sum', 'oshash', 'avinfo']
def sha1sum(filename):
sha1 = hashlib.sha1()
    file = open(filename, 'rb')
buffer=file.read(4096)
while buffer:
sha1.update(buffer)
buffer=file.read(4096)
file.close()
return sha1.hexdigest()
'''
os hash - http://trac.opensubtitles.org/projects/opensubtitles/wiki/HashSourceCodes
plus modification for files < 64k, buffer is filled with file data and padded with 0
'''
def oshash(filename):
try:
longlongformat = 'q' # long long
bytesize = struct.calcsize(longlongformat)
f = open(filename, "rb")
filesize = os.path.getsize(filename)
hash = filesize
if filesize < 65536:
for x in range(int(filesize/bytesize)):
buffer = f.read(bytesize)
(l_value,)= struct.unpack(longlongformat, buffer)
hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF #to remain as 64bit number
else:
for x in range(int(65536/bytesize)):
buffer = f.read(bytesize)
(l_value,)= struct.unpack(longlongformat, buffer)
hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF #to remain as 64bit number
f.seek(max(0,filesize-65536),0)
for x in range(int(65536/bytesize)):
buffer = f.read(bytesize)
(l_value,)= struct.unpack(longlongformat, buffer)
hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF
f.close()
returnedhash = "%016x" % hash
return returnedhash
    except IOError:
        return "IOError"
def avinfo(filename):
if os.path.getsize(filename):
p = subprocess.Popen(['ffmpeg2theora', '--info', filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
info, error = p.communicate()
return simplejson.loads(info)
return {'path': filename, 'size': 0}

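A sketch of the three helpers above ('movie.avi' is a placeholder; avinfo shells out to ffmpeg2theora, which has to be installed):

import ox

path = 'movie.avi'
ox.oshash(path)    # 16-digit hex string: file size plus first/last 64k checksum
ox.sha1sum(path)   # sha1 hexdigest of the whole file
ox.avinfo(path)    # dict parsed from the 'ffmpeg2theora --info' JSON output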
73
ox/form.py Normal file

@@ -0,0 +1,73 @@
import itertools
import mimetools
import mimetypes
__all__ = ['MultiPartForm']
class MultiPartForm(object):
"""Accumulate the data to be used when posting a form."""
def __init__(self):
self.form_fields = []
self.files = []
self.boundary = mimetools.choose_boundary()
return
def get_content_type(self):
return 'multipart/form-data; boundary=%s' % self.boundary
def add_field(self, name, value):
"""Add a simple field to the form data."""
self.form_fields.append((name, value))
return
def add_file(self, fieldname, filename, fileHandle, mimetype=None):
"""Add a file to be uploaded."""
if hasattr(fileHandle, 'read'):
body = fileHandle.read()
else:
body = fileHandle
if mimetype is None:
mimetype = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
self.files.append((fieldname, filename, mimetype, body))
return
def __str__(self):
"""Return a string representing the form data, including attached files."""
# Build a list of lists, each containing "lines" of the
# request. Each part is separated by a boundary string.
# Once the list is built, return a string where each
# line is separated by '\r\n'.
parts = []
part_boundary = '--' + self.boundary
# Add the form fields
parts.extend(
[ part_boundary,
'Content-Disposition: form-data; name="%s"' % name,
'',
value,
]
for name, value in self.form_fields
)
# Add the files to upload
parts.extend(
[ part_boundary,
'Content-Disposition: file; name="%s"; filename="%s"' % \
(field_name, filename),
'Content-Type: %s' % content_type,
'',
body,
]
for field_name, filename, content_type, body in self.files
)
# Flatten the list and add closing boundary marker,
# then return CR+LF separated data
flattened = list(itertools.chain(*parts))
flattened.append('--' + self.boundary + '--')
flattened.append('')
return '\r\n'.join(flattened)

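MultiPartForm only builds the request body; posting is left to the caller. A sketch pairing it with urllib2 (the upload URL is a placeholder):

import urllib2
from ox.form import MultiPartForm

form = MultiPartForm()
form.add_field('title', 'test upload')
form.add_file('file', 'test.txt', 'file contents as a string')

body = str(form)
request = urllib2.Request('http://example.com/upload')
request.add_header('Content-Type', form.get_content_type())
request.add_header('Content-Length', str(len(body)))
request.add_data(body)
#response = urllib2.urlopen(request)   # would perform the POST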
350
ox/format.py Normal file

@@ -0,0 +1,350 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import math
import re
def to32(q):
"""
Converts an integer to base 32
We exclude 4 of the 26 letters: I L O U.
http://www.crockford.com/wrmg/base32.html
>>> to32(35)
'13'
>>> to32(119292)
'3MgV'
>>> to32(939387374)
'wZwTgD'
>>> to32(0)
'0'
>>> to32(-393)
Traceback (most recent call last):
...
ValueError: must supply a positive integer
"""
if q < 0: raise ValueError, "must supply a positive integer"
letters = "0123456789ACBEDGFHKJMNQPSRTWVYXZ"
converted = []
upper = True
while q != 0:
q, r = divmod(q, 32)
l = letters[r]
if upper:
upper = False
else:
l = l.lower()
upper = True
converted.insert(0, l)
return "".join(converted) or '0'
def from32(q):
_32map = {
'0': 0,
'1': 1,
'2': 2,
'3': 3,
'4': 4,
'5': 5,
'6': 6,
'7': 7,
'8': 8,
'9': 9,
'A': 10,
'B': 11,
'C': 12,
'D': 13,
'E': 14,
'F': 15,
'G': 16,
'H': 17,
'J': 18,
'K': 19,
'M': 20,
'N': 21,
'P': 22,
'Q': 23,
'R': 24,
'S': 25,
'T': 26,
'V': 27,
'W': 28,
'X': 29,
'Y': 30,
'Z': 31,
'O': 0,
'I': 1,
'L': 1,
}
base32 = '0123456789ABCDEFGHIJKLMNOPQRSTUV'
q = q.replace('-','')
q = ''.join([base32[_32map[i.upper()]] for i in q])
return int(q, 32)
def to36(q):
"""
Converts an integer to base 36 (a useful scheme for human-sayable IDs
like 'fuck' (739172), 'shit' (1329077) or 'hitler' (1059538851)).
>>> to36(35)
'z'
>>> to36(119292)
'2k1o'
>>> int(to36(939387374), 36)
939387374
>>> to36(0)
'0'
>>> to36(-393)
Traceback (most recent call last):
...
ValueError: must supply a positive integer
"""
if q < 0: raise ValueError, "must supply a positive integer"
letters = "0123456789abcdefghijklmnopqrstuvwxyz"
converted = []
while q != 0:
q, r = divmod(q, 36)
converted.insert(0, letters[r])
return "".join(converted) or '0'
def from36(q):
return int(q, 36)
def intValue(strValue, default=u''):
"""
>>> intValue('abc23')
u'23'
>>> intValue(' abc23')
u'23'
>>> intValue('ab')
u''
"""
try:
val = re.compile('(\d+)').findall(unicode(strValue).strip())[0]
except:
val = default
return val
def floatValue(strValue, default=u''):
"""
>>> floatValue('abc23.4')
u'23.4'
>>> floatValue(' abc23.4')
u'23.4'
>>> floatValue('ab')
u''
"""
try:
val = re.compile('([\d.]+)').findall(unicode(strValue).strip())[0]
except:
val = default
return val
def formatNumber(number, longName, shortName):
"""
Return the number in a human-readable format (23 KB, 23.4 MB, 23.42 GB)
>>> formatNumber(123, 'Byte', 'B')
'123 Bytes'
>>> formatNumber(1234, 'Byte', 'B')
'1 KB'
>>> formatNumber(1234567, 'Byte', 'B')
'1.2 MB'
>>> formatNumber(1234567890, 'Byte', 'B')
'1.15 GB'
>>> formatNumber(1234567890123456789, 'Byte', 'B')
'1,096.5166 PB'
>>> formatNumber(-1234567890123456789, 'Byte', 'B')
'-1,096.5166 PB'
"""
if abs(number) < 1024:
return '%s %s%s' % (formatThousands(number), longName, number != 1 and 's' or '')
prefix = ['K', 'M', 'G', 'T', 'P']
for i in range(5):
if abs(number) < math.pow(1024, i + 2) or i == 4:
n = number / math.pow(1024, i + 1)
return '%s %s%s' % (formatThousands('%.*f' % (i, n)), prefix[i], shortName)
def formatThousands(number, separator = ','):
"""
Return the number with separators (1,000,000)
>>> formatThousands(1)
'1'
>>> formatThousands(1000)
'1,000'
>>> formatThousands(1000000)
'1,000,000'
"""
string = str(number).split('.')
l = []
for i, character in enumerate(reversed(string[0])):
if i and (not (i % 3)):
l.insert(0, separator)
l.insert(0, character)
string[0] = ''.join(l)
return '.'.join(string)
def formatBits(number):
return formatNumber(number, 'bit', 'b')
def formatBytes(number):
return formatNumber(number, 'byte', 'B')
def formatPixels(number):
return formatNumber(number, 'pixel', 'px')
def formatCurrency(amount, currency="$"):
if amount:
temp = "%.2f" % amount
profile=re.compile(r"(\d)(\d\d\d[.,])")
while 1:
temp, count = re.subn(profile,r"\1,\2",temp)
if not count:
break
if temp.startswith('-'):
return "-"+ currency + temp[1:-3]
return currency + temp[:-3]
else:
return ""
def plural(amount, unit, plural='s'):
'''
>>> plural(1, 'unit')
'1 unit'
>>> plural(2, 'unit')
'2 units'
'''
if abs(amount) != 1:
if plural == 's':
unit = unit + plural
else: unit = plural
return "%s %s" % (formatThousands(amount), unit)
def formatDuration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
'''
verbosity
0: D:HH:MM:SS
1: Dd Hh Mm Ss
2: D days H hours M minutes S seconds
years
True: 366 days are 1 year 1 day
False: 366 days are 366 days
hours
True: 30 seconds are 00:00:30
False: 30 seconds are 00:30
milliseconds
True: always display milliseconds
False: never display milliseconds
>>> formatDuration(1000 * 60 * 60 * 24 * 366)
'1:001:00:00:00.000'
>>> formatDuration(1000 * 60 * 60 * 24 * 366, years=False)
'366:00:00:00.000'
>>> formatDuration(1000 * 60 * 60 * 24 * 365 + 2003, verbosity=2)
'1 year 2 seconds 3 milliseconds'
>>> formatDuration(1000 * 30, hours=False, milliseconds=False)
'00:30'
'''
if not ms and ms != 0:
return ''
if years:
y = int(ms / 31536000000)
d = int(ms % 31536000000 / 86400000)
else:
d = int(ms / 86400000)
h = int(ms % 86400000 / 3600000)
m = int(ms % 3600000 / 60000)
s = int(ms % 60000 / 1000)
ms = ms % 1000
if verbosity == 0:
if years and y:
duration = "%d:%03d:%02d:%02d:%02d" % (y, d, h, m, s)
elif d:
duration = "%d:%02d:%02d:%02d" % (d, h, m, s)
elif hours or h:
duration = "%02d:%02d:%02d" % (h, m, s)
else:
duration = "%02d:%02d" % (m, s)
if milliseconds:
duration += ".%03d" % ms
else:
if verbosity == 1:
durations = ["%sd" % d, "%sh" % h, "%sm" % m, "%ss" % s]
if years:
durations.insert(0, "%sy" % y)
if milliseconds:
durations.append("%sms" % ms)
else:
durations = [plural(d, 'day'), plural(h,'hour'),
plural(m, 'minute'), plural(s, 'second')]
if years:
durations.insert(0, plural(y, 'year'))
if milliseconds:
durations.append(plural(ms, 'millisecond'))
durations = filter(lambda x: not x.startswith('0'), durations)
duration = ' '.join(durations)
return duration
def ms2runtime(ms, shortenLong=False):
# deprecated - use formatDuration
'''
>>> ms2runtime(5000)
'5 seconds'
>>> ms2runtime(500000)
'8 minutes 20 seconds'
>>> ms2runtime(50000000)
'13 hours 53 minutes 20 seconds'
>>> ms2runtime(50000000-20000)
'13 hours 53 minutes'
'''
if shortenLong and ms > 1000 * 60 * 60 * 24 * 464:
return formatDuration(ms, verbosity=1, milliseconds=False)
return formatDuration(ms, verbosity=2, milliseconds=False)
def ms2playtime(ms, hours=False):
# deprecated - use formatDuration
'''
>>> ms2playtime(5000)
'00:05'
>>> ms2playtime(500000)
'08:20'
>>> ms2playtime(50000000)
'13:53:20'
'''
return formatDuration(ms, hours=False, years=False, milliseconds=False)
def ms2time(ms):
# deprecated - use formatDuration
'''
>>> ms2time(44592123)
'12:23:12.123'
'''
return formatDuration(ms, years=False)
def time2ms(timeString):
'''
>>> time2ms('12:23:12.123')
44592123
'''
ms = 0.0
p = timeString.split(':')
for i in range(len(p)):
_p = p[i]
if _p.endswith('.'): _p =_p[:-1]
ms = ms * 60 + float(_p)
return int(ms * 1000)
def shiftTime(offset, timeString):
newTime = time2ms(timeString) + offset
return ms2time(newTime)

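The formatters above are pure functions; a few calls with their return values, taken from or consistent with the doctests:

import ox

ox.to36(119292)                            # '2k1o'
ox.from36('2k1o')                          # 119292
ox.formatNumber(1234567890, 'Byte', 'B')   # '1.15 GB'
ox.formatDuration(90 * 1000)               # '00:01:30.000'
ox.formatDuration(90 * 1000, verbosity=2, milliseconds=False)   # '1 minute 30 seconds'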
172
ox/html.py Normal file

@@ -0,0 +1,172 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import re
import string
from htmlentitydefs import name2codepoint
# Configuration for urlize() function
LEADING_PUNCTUATION = ['(', '<', '&lt;']
TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;', "'", '"']
# list of possible strings used for bullets in bulleted lists
DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']
unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
word_split_re = re.compile(r'(\s+)')
punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % \
('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]),
'|'.join([re.escape(x) for x in TRAILING_PUNCTUATION])))
simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$')
link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
del x # Temporary variable
def escape(html):
'''
Returns the given HTML with ampersands, quotes and carets encoded
>>> escape('html "test" & <brothers>')
'html &quot;test&quot; &amp; &lt;brothers&gt;'
'''
if not isinstance(html, basestring):
html = str(html)
return html.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&#39;')
def linebreaks(value):
'''
Converts newlines into <p> and <br />
'''
value = re.sub(r'\r\n|\r|\n', '\n', value) # normalize newlines
paras = re.split('\n{2,}', value)
paras = ['<p>%s</p>' % p.strip().replace('\n', '<br />') for p in paras]
return '\n\n'.join(paras)
def stripTags(value):
"""
Returns the given HTML with all tags stripped
>>> stripTags('some <h2>title</h2> <script>asdfasdf</script>')
'some title asdfasdf'
"""
return re.sub(r'<[^>]*?>', '', value)
def stripSpacesBetweenTags(value):
"Returns the given HTML with spaces between tags normalized to a single space"
return re.sub(r'>\s+<', '> <', value)
def stripEntities(value):
"Returns the given HTML with all entities (&something;) stripped"
return re.sub(r'&(?:\w+|#\d);', '', value)
def fixAmpersands(value):
"Returns the given HTML with all unencoded ampersands encoded correctly"
return unencoded_ampersands_re.sub('&amp;', value)
def urlize(text, trim_url_limit=None, nofollow=False):
"""
Converts any URLs in text into clickable links. Works on http://, https:// and
www. links. Links can have trailing punctuation (periods, commas, close-parens)
and leading punctuation (opening parens) and it'll still do the right thing.
If trim_url_limit is not None, the URLs in link text will be limited to
trim_url_limit characters.
If nofollow is True, the URLs in link text will get a rel="nofollow" attribute.
"""
trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >=limit and '...' or '')) or x
words = word_split_re.split(text)
nofollow_attr = nofollow and ' rel="nofollow"' or ''
for i, word in enumerate(words):
match = punctuation_re.match(word)
if match:
lead, middle, trail = match.groups()
if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
len(middle) > 0 and middle[0] in string.letters + string.digits and \
(middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
middle = '<a href="http://%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
if middle.startswith('http://') or middle.startswith('https://'):
middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
if '@' in middle and not middle.startswith('www.') and not ':' in middle \
and simple_email_re.match(middle):
middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
if lead + middle + trail != word:
words[i] = lead + middle + trail
return ''.join(words)
def cleanHtml(text):
"""
Cleans the given HTML. Specifically, it does the following:
* Converts <b> and <i> to <strong> and <em>.
* Encodes all ampersands correctly.
* Removes all "target" attributes from <a> tags.
* Removes extraneous HTML, such as presentational tags that open and
immediately close and <br clear="all">.
* Converts hard-coded bullets into HTML unordered lists.
* Removes stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the
bottom of the text.
"""
from text import normalizeNewlines
text = normalizeNewlines(text)
text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text)
text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text)
text = fixAmpersands(text)
# Remove all target="" attributes from <a> tags.
text = link_target_attribute_re.sub('\\1', text)
# Trim stupid HTML such as <br clear="all">.
text = html_gunk_re.sub('', text)
# Convert hard-coded bullets into HTML unordered lists.
def replace_p_tags(match):
s = match.group().replace('</p>', '</li>')
for d in DOTS:
s = s.replace('<p>%s' % d, '<li>')
return '<ul>\n%s\n</ul>' % s
text = hard_coded_bullets_re.sub(replace_p_tags, text)
# Remove stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the bottom of the text.
text = trailing_empty_content_re.sub('', text)
return text
# This pattern matches a character entity reference (a decimal numeric
# references, a hexadecimal numeric reference, or a named reference).
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
def decodeHtml(html):
"""
>>> decodeHtml('me &amp; you and &#36;&#38;%')
u'me & you and $&%'
"""
if type(html) != unicode:
html = unicode(html)[:]
if type(html) is unicode:
uchr = unichr
else:
uchr = lambda value: value > 255 and unichr(value) or chr(value)
def entitydecode(match, uchr=uchr):
entity = match.group(1)
if entity.startswith('#x'):
return uchr(int(entity[2:], 16))
elif entity.startswith('#'):
return uchr(int(entity[1:]))
elif entity in name2codepoint:
return uchr(name2codepoint[entity])
else:
return match.group(0)
return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ')
def highlight(text, query, hlClass="hl"):
"""
>>> highlight('me &amp; you and &#36;&#38;%', 'and')
'me &amp; you <span class="hl">and</span> &#36;&#38;%'
"""
if query:
text = text.replace('<br />', '|')
query = re.escape(query).replace('\ ', '.')
m = re.compile("(%s)" % query, re.IGNORECASE).findall(text)
for i in m:
text = re.sub("(%s)" % re.escape(i).replace('\ ', '.'), '<span class="%s">\\1</span>' % hlClass, text)
text = text.replace('|', '<br />')
return text

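These helpers mirror Django's text utilities; a few calls consistent with the doctests above:

import ox

ox.escape('html "test" & <brothers>')
# 'html &quot;test&quot; &amp; &lt;brothers&gt;'
ox.decodeHtml('me &amp; you and &#36;&#38;%')
# u'me & you and $&%'
ox.urlize('see www.example.org for details', nofollow=True)
# 'see <a href="http://www.example.org" rel="nofollow">www.example.org</a> for details'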
243
ox/iso.py Normal file

@@ -0,0 +1,243 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
_iso639_languages = [
("Unknown", "", "", "und"),
("Afar", "", "aa", "aar"),
("Abkhazian", "", "ab", "abk"),
("Afrikaans", "", "af", "afr"),
("Akan", "", "ak", "aka"),
("Albanian", "", "sq", "sqi"),
("Amharic", "", "am", "amh"),
("Arabic", "", "ar", "ara"),
("Aragonese", "", "an", "arg"),
("Armenian", "", "hy", "hye"),
("Assamese", "", "as", "asm"),
("Avaric", "", "av", "ava"),
("Avestan", "", "ae", "ave"),
("Aymara", "", "ay", "aym"),
("Azerbaijani", "", "az", "aze"),
("Bashkir", "", "ba", "bak"),
("Bambara", "", "bm", "bam"),
("Basque", "", "eu", "eus"),
("Belarusian", "", "be", "bel"),
("Bengali", "", "bn", "ben"),
("Bihari", "", "bh", "bih"),
("Bislama", "", "bi", "bis"),
("Bosnian", "", "bs", "bos"),
("Breton", "", "br", "bre"),
("Bulgarian", "", "bg", "bul"),
("Burmese", "", "my", "mya"),
("Catalan", "", "ca", "cat"),
("Chamorro", "", "ch", "cha"),
("Chechen", "", "ce", "che"),
("Chinese", "", "zh", "zho"),
("Church Slavic", "", "cu", "chu"),
("Chuvash", "", "cv", "chv"),
("Cornish", "", "kw", "cor"),
("Corsican", "", "co", "cos"),
("Cree", "", "cr", "cre"),
("Czech", "", "cs", "ces"),
("Danish", "Dansk", "da", "dan"),
("Divehi", "", "dv", "div"),
("Dutch", "Nederlands", "nl", "nld"),
("Dzongkha", "", "dz", "dzo"),
("English", "English", "en", "eng"),
("Esperanto", "", "eo", "epo"),
("Estonian", "", "et", "est"),
("Ewe", "", "ee", "ewe"),
("Faroese", "", "fo", "fao"),
("Fijian", "", "fj", "fij"),
("Finnish", "Suomi", "fi", "fin"),
("French", "Francais", "fr", "fra"),
("Western Frisian", "", "fy", "fry"),
("Fulah", "", "ff", "ful"),
("Georgian", "", "ka", "kat"),
("German", "Deutsch", "de", "deu"),
("Gaelic (Scots)", "", "gd", "gla"),
("Irish", "", "ga", "gle"),
("Galician", "", "gl", "glg"),
("Manx", "", "gv", "glv"),
("Greek, Modern", "", "el", "ell"),
("Guarani", "", "gn", "grn"),
("Gujarati", "", "gu", "guj"),
("Haitian", "", "ht", "hat"),
("Hausa", "", "ha", "hau"),
("Hebrew", "", "he", "heb"),
("Herero", "", "hz", "her"),
("Hindi", "", "hi", "hin"),
("Hiri Motu", "", "ho", "hmo"),
("Hungarian", "Magyar", "hu", "hun"),
("Igbo", "", "ig", "ibo"),
("Icelandic", "Islenska", "is", "isl"),
("Ido", "", "io", "ido"),
("Sichuan Yi", "", "ii", "iii"),
("Inuktitut", "", "iu", "iku"),
("Interlingue", "", "ie", "ile"),
("Interlingua", "", "ia", "ina"),
("Indonesian", "", "id", "ind"),
("Inupiaq", "", "ik", "ipk"),
("Italian", "Italiano", "it", "ita"),
("Javanese", "", "jv", "jav"),
("Japanese", "", "ja", "jpn"),
("Kalaallisut (Greenlandic)", "", "kl", "kal"),
("Kannada", "", "kn", "kan"),
("Kashmiri", "", "ks", "kas"),
("Kanuri", "", "kr", "kau"),
("Kazakh", "", "kk", "kaz"),
("Central Khmer", "", "km", "khm"),
("Kikuyu", "", "ki", "kik"),
("Kinyarwanda", "", "rw", "kin"),
("Kirghiz", "", "ky", "kir"),
("Komi", "", "kv", "kom"),
("Kongo", "", "kg", "kon"),
("Korean", "", "ko", "kor"),
("Kuanyama", "", "kj", "kua"),
("Kurdish", "", "ku", "kur"),
("Lao", "", "lo", "lao"),
("Latin", "", "la", "lat"),
("Latvian", "", "lv", "lav"),
("Limburgan", "", "li", "lim"),
("Lingala", "", "ln", "lin"),
("Lithuanian", "", "lt", "lit"),
("Luxembourgish", "", "lb", "ltz"),
("Luba-Katanga", "", "lu", "lub"),
("Ganda", "", "lg", "lug"),
("Macedonian", "", "mk", "mkd"),
("Marshallese", "", "mh", "mah"),
("Malayalam", "", "ml", "mal"),
("Maori", "", "mi", "mri"),
("Marathi", "", "mr", "mar"),
("Malay", "", "ms", "msa"),
("Malagasy", "", "mg", "mlg"),
("Maltese", "", "mt", "mlt"),
("Moldavian", "", "mo", "mol"),
("Mongolian", "", "mn", "mon"),
("Nauru", "", "na", "nau"),
("Navajo", "", "nv", "nav"),
("Ndebele, South", "", "nr", "nbl"),
("Ndebele, North", "", "nd", "nde"),
("Ndonga", "", "ng", "ndo"),
("Nepali", "", "ne", "nep"),
("Norwegian Nynorsk", "", "nn", "nno"),
("Norwegian Bokmål", "", "nb", "nob"),
("Norwegian", "Norsk", "no", "nor"),
("Chichewa; Nyanja", "", "ny", "nya"),
("Occitan (post 1500); Provençal", "", "oc", "oci"),
("Ojibwa", "", "oj", "oji"),
("Oriya", "", "or", "ori"),
("Oromo", "", "om", "orm"),
("Ossetian; Ossetic", "", "os", "oss"),
("Panjabi", "", "pa", "pan"),
("Persian", "", "fa", "fas"),
("Pali", "", "pi", "pli"),
("Polish", "", "pl", "pol"),
("Portuguese", "Portugues", "pt", "por"),
("Pushto", "", "ps", "pus"),
("Quechua", "", "qu", "que"),
("Romansh", "", "rm", "roh"),
("Romanian", "", "ro", "ron"),
("Rundi", "", "rn", "run"),
("Russian", "", "ru", "rus"),
("Sango", "", "sg", "sag"),
("Sanskrit", "", "sa", "san"),
("Serbian", "", "sr", "srp"),
("Croatian", "Hrvatski", "hr", "hrv"),
("Sinhala", "", "si", "sin"),
("Slovak", "", "sk", "slk"),
("Slovenian", "", "sl", "slv"),
("Northern Sami", "", "se", "sme"),
("Samoan", "", "sm", "smo"),
("Shona", "", "sn", "sna"),
("Sindhi", "", "sd", "snd"),
("Somali", "", "so", "som"),
("Sotho, Southern", "", "st", "sot"),
("Spanish", "Espanol", "es", "spa"),
("Sardinian", "", "sc", "srd"),
("Swati", "", "ss", "ssw"),
("Sundanese", "", "su", "sun"),
("Swahili", "", "sw", "swa"),
("Swedish", "Svenska", "sv", "swe"),
("Tahitian", "", "ty", "tah"),
("Tamil", "", "ta", "tam"),
("Tatar", "", "tt", "tat"),
("Telugu", "", "te", "tel"),
("Tajik", "", "tg", "tgk"),
("Tagalog", "", "tl", "tgl"),
("Thai", "", "th", "tha"),
("Tibetan", "", "bo", "bod"),
("Tigrinya", "", "ti", "tir"),
("Tonga (Tonga Islands)", "", "to", "ton"),
("Tswana", "", "tn", "tsn"),
("Tsonga", "", "ts", "tso"),
("Turkmen", "", "tk", "tuk"),
("Turkish", "", "tr", "tur"),
("Twi", "", "tw", "twi"),
("Uighur", "", "ug", "uig"),
("Ukrainian", "", "uk", "ukr"),
("Urdu", "", "ur", "urd"),
("Uzbek", "", "uz", "uzb"),
("Venda", "", "ve", "ven"),
("Vietnamese", "", "vi", "vie"),
("Volapük", "", "vo", "vol"),
("Welsh", "", "cy", "cym"),
("Walloon", "", "wa", "wln"),
("Wolof", "", "wo", "wol"),
("Xhosa", "", "xh", "xho"),
("Yiddish", "", "yi", "yid"),
("Yoruba", "", "yo", "yor"),
("Zhuang", "", "za", "zha"),
("Zulu", "", "zu", "zul"),
]
def codeToLang(code):
code = code.lower()
if len(code) == 2:
for l in _iso639_languages:
if l[2] == code:
return l[0]
elif len(code) == 3:
for l in _iso639_languages:
if l[3] == code:
return l[0]
return None
def langTo3Code(lang):
    # accept the English name as well as the native name ('German' or 'Deutsch')
    lang = langEnglishName(lang) or lang
    lang = lang.lower()
    for l in _iso639_languages:
        if l[0].lower() == lang:
            return l[3]
    return None
def langTo2Code(lang):
    # accept the English name as well as the native name ('German' or 'Deutsch')
    lang = langEnglishName(lang) or lang
    lang = lang.lower()
    for l in _iso639_languages:
        if l[0].lower() == lang:
            return l[2]
    return None
def langCode2To3(code):
    return langTo3Code(codeToLang(code))
def langCode3To2(code):
    return langTo2Code(codeToLang(code))
def langEnglishName(lang):
lang = lang.lower()
for l in _iso639_languages:
if l[1].lower() == lang:
return l[0]
return None
def languages2Letter():
languages = []
for l in _iso639_languages:
if l[2]:
languages.append(l[2])
return languages

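Lookups against the language table above:

import ox

ox.codeToLang('de')          # 'German'
ox.codeToLang('deu')         # 'German'
ox.langTo2Code('Deutsch')    # 'de'  (native name)
ox.langTo3Code('German')     # 'deu' (English name, via the fallback noted above)
len(ox.languages2Letter())   # number of languages that have a 2-letter code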
89
ox/net.py Normal file

@@ -0,0 +1,89 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import os
import gzip
import StringIO
import urllib
import urllib2
from chardet.universaldetector import UniversalDetector
# Default headers for HTTP requests.
DEFAULT_HEADERS = {
'User-Agent': 'Mozilla/5.0 (X11; U; Linux i386; en-US; rv:1.9.1.1) Gecko/20090716 Firefox/3.5',
'Accept-Encoding': 'gzip'
}
def status(url, data=None, headers=DEFAULT_HEADERS):
try:
f = openUrl(url, data, headers)
s = f.code
except urllib2.HTTPError, e:
s = e.code
return s
def exists(url, data=None, headers=DEFAULT_HEADERS):
s = status(url, data, headers)
if s >= 200 and s < 400:
return True
return False
def getHeaders(url, data=None, headers=DEFAULT_HEADERS):
try:
f = openUrl(url, data, headers)
f.headers['Status'] = "%s" % f.code
headers = f.headers
f.close()
except urllib2.HTTPError, e:
e.headers['Status'] = "%s" % e.code
headers = e.headers
return dict(headers)
def openUrl(url, data=None, headers=DEFAULT_HEADERS):
url = url.replace(' ', '%20')
req = urllib2.Request(url, data, headers)
return urllib2.urlopen(req)
def readUrl(url, data=None, headers=DEFAULT_HEADERS, returnHeaders=False):
f = openUrl(url, data, headers)
data = f.read()
f.close()
if f.headers.get('content-encoding', None) == 'gzip':
data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
if returnHeaders:
f.headers['Status'] = "%s" % f.code
return dict(f.headers), data
return data
def readUrlUnicode(url):
data = readUrl(url)
encoding = getEncoding(data)
if not encoding:
encoding = 'latin-1'
return unicode(data, encoding)
def getEncoding(data):
if 'content="text/html; charset=utf-8"' in data:
return 'utf-8'
elif 'content="text/html; charset=iso-8859-1"' in data:
return 'iso-8859-1'
detector = UniversalDetector()
for line in data.split('\n'):
detector.feed(line)
if detector.done:
break
detector.close()
return detector.result['encoding']
def saveUrl(url, filename, overwrite=False):
if not os.path.exists(filename) or overwrite:
dirname = os.path.dirname(filename)
if not os.path.exists(dirname):
os.makedirs(dirname)
data = readUrl(url)
f = open(filename, 'w')
f.write(data)
f.close()

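ox.net is the uncached counterpart of ox.cache, with the same call shapes (the URL is a placeholder):

import ox.net

url = 'http://example.com/'
if ox.net.exists(url):                      # status in the 2xx/3xx range
    headers = ox.net.getHeaders(url)        # dict, includes a 'status' key
    page = ox.net.readUrlUnicode(url)       # decoded via chardet if needed
ox.net.saveUrl(url, '/tmp/example.html')    # fetch and write to disk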
163
ox/normalize.py Normal file

@@ -0,0 +1,163 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import re
_articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el',
"l'", 'il', 'das', 'les', 'o', 'ein', 'i', 'un', 'los', 'de',
'an', 'una', 'las', 'eine', 'den', 'gli', 'het', 'os', 'lo',
'az', 'det', 'ha-', 'een', 'ang', 'oi', 'ta', 'al-', 'dem',
'mga', 'uno', "un'", 'ett', u'\xcf', 'eines', u'\xc7', 'els',
u'\xd4\xef', u'\xcf\xe9')
# Articles in a dictionary.
_articlesDict = dict([(x, x) for x in _articles])
_spArticles = []
for article in _articles:
if article[-1] not in ("'", '-'): article += ' '
_spArticles.append(article)
def canonicalTitle(title):
"""Return the title in the canonic format 'Movie Title, The'.
>>> canonicalTitle('The Movie Title')
'Movie Title, The'
"""
try:
if _articlesDict.has_key(title.split(', ')[-1].lower()): return title
except IndexError: pass
ltitle = title.lower()
for article in _spArticles:
if ltitle.startswith(article):
lart = len(article)
title = '%s, %s' % (title[lart:], title[:lart])
if article[-1] == ' ': title = title[:-1]
break
## XXX: an attempt using a dictionary lookup.
##for artSeparator in (' ', "'", '-'):
## article = _articlesDict.get(ltitle.split(artSeparator)[0])
## if article is not None:
## lart = len(article)
## # check titles like "una", "I'm Mad" and "L'abbacchio".
## if title[lart:] == '' or (artSeparator != ' ' and
## title[lart:][1] != artSeparator): continue
## title = '%s, %s' % (title[lart:], title[:lart])
## if artSeparator == ' ': title = title[1:]
## break
return title
def normalizeTitle(title):
"""Return the title in the normal "The Title" format.
>>> normalizeTitle('Movie Title, The')
'The Movie Title'
"""
stitle = title.split(', ')
if len(stitle) > 1 and _articlesDict.has_key(stitle[-1].lower()):
sep = ' '
if stitle[-1][-1] in ("'", '-'): sep = ''
title = '%s%s%s' % (stitle[-1], sep, ', '.join(stitle[:-1]))
return title
def normalizeImdbId(imdbId):
"""Return 7 digit imdbId.
>>> normalizeImdbId('http://www.imdb.com/title/tt0159206/')
'0159206'
>>> normalizeImdbId(159206)
'0159206'
>>> normalizeImdbId('tt0159206')
'0159206'
"""
if isinstance(imdbId, basestring):
imdbId = re.sub('.*(\d{7}).*', '\\1', imdbId)
elif isinstance(imdbId, int):
imdbId = "%07d" % imdbId
return imdbId
# Common suffixes in surnames.
_sname_suffixes = ('de', 'la', 'der', 'den', 'del', 'y', 'da', 'van',
'e', 'von', 'vom', 'the', 'di', 'du', 'el', 'al')
def canonicalName(name):
"""Return the given name in canonical "Surname, Name" format.
It assumes that name is in the 'Name Surname' format.
>>> canonicalName('Jean Luc Godard')
'Godard, Jean Luc'
>>> canonicalName('Ivan Ivanov-Vano')
'Ivanov-Vano, Ivan'
>>> canonicalName('Gus Van Sant')
'Van Sant, Gus'
>>> canonicalName('Brian De Palma')
'De Palma, Brian'
"""
# XXX: some statistics (over 1852406 names):
# - just a surname: 51921
# - single surname, single name: 1792759
# - composed surname, composed name: 7726
# - composed surname, single name: 55623
# (2: 49259, 3: 5502, 4: 551)
# - single surname, composed name: 186604
# (2: 178315, 3: 6573, 4: 1219, 5: 352)
# Don't convert names already in the canonical format.
if name in ('Unknown Director', ):
return name
if name.find(', ') != -1: return name
sname = name.split(' ')
snl = len(sname)
if snl == 2:
# Just a name and a surname: how boring...
name = '%s, %s' % (sname[1], sname[0])
elif snl > 2:
lsname = [x.lower() for x in sname]
if snl == 3: _indexes = (0, snl-2)
else: _indexes = (0, snl-2, snl-3)
# Check for common surname prefixes at the beginning and near the end.
for index in _indexes:
if lsname[index] not in _sname_suffixes: continue
try:
# Build the surname.
surn = '%s %s' % (sname[index], sname[index+1])
del sname[index]
del sname[index]
try:
# Handle the "Jr." after the name.
if lsname[index+2].startswith('jr'):
surn += ' %s' % sname[index]
del sname[index]
except (IndexError, ValueError):
pass
name = '%s, %s' % (surn, ' '.join(sname))
break
except ValueError:
continue
else:
name = '%s, %s' % (sname[-1], ' '.join(sname[:-1]))
return name
def normalizeName(name):
"""Return a name in the normal "Name Surname" format.
>>> normalizeName('Godard, Jean Luc')
'Jean Luc Godard'
>>> normalizeName('Ivanov-Vano, Ivan')
'Ivan Ivanov-Vano'
>>> normalizeName('Van Sant, Gus')
'Gus Van Sant'
>>> normalizeName('De Palma, Brian')
'Brian De Palma'
"""
sname = name.split(', ')
if len(sname) == 2:
name = '%s %s' % (sname[1], sname[0])
return name

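normalize is not pulled into the top-level namespace by __init__.py above, so it is imported explicitly; values as in the doctests:

from ox import normalize

normalize.canonicalTitle('The Movie Title')    # 'Movie Title, The'
normalize.normalizeTitle('Movie Title, The')   # 'The Movie Title'
normalize.canonicalName('Brian De Palma')      # 'De Palma, Brian'
normalize.normalizeImdbId('tt0159206')         # '0159206'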
270
ox/text.py Normal file

@@ -0,0 +1,270 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import math
import re
def findRe(string, regexp):
result = re.compile(regexp, re.DOTALL).findall(string)
if result:
return result[0].strip()
return ''
def findString(string, string0='', string1 = ''):
"""Return the string between string0 and string1.
    If string0 or string1 is left out, the beginning or end of the string is used.
>>> findString('i am not there', string1=' not there')
'i am'
>>> findString('i am not there', 'i am ', ' there')
'not'
>>> findString('i am not there', 'i am not t')
'here'
"""
if string0:
string0 = re.escape(string0)
else:
string0 = '^'
if string1:
string1 = re.escape(string1)
else:
string1 = '$'
return findRe(string, string0 + '(.*?)' + string1)
def removeSpecialCharacters(text):
"""
Removes special characters inserted by Word.
"""
    text = text.replace(u'\u2013', '-')
    text = text.replace(u'\u2026', '...')
    text = text.replace(u'\u2019', "'")
    # the next three characters are assumed to be the cp1252 smart quotes and
    # en dash inserted by Word; the literal bytes did not survive rendering
    text = text.replace(u'\x91', "'")
    text = text.replace(u'\x92', "'")
    text = text.replace(u'\x96', '-')
return text
def wrap(text, width):
"""
A word-wrap function that preserves existing line breaks and most spaces in
the text. Expects that existing line breaks are posix newlines (\n).
See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
"""
return reduce(lambda line, word, width=width: '%s%s%s' %
(line,
' \n'[(len(line[line.rfind('\n')+1:])
+ len(word.split('\n',1)[0]
) >= width)],
word),
text.split(' ')
)
def wrapString(string, length=80, separator='\n', balance=False):
'''
>>> wrapString(u"Anticonstitutionellement, Paris s'eveille", 16)
u"Anticonstitution\\nellement, Paris \\ns'eveille"
>>> wrapString(u'All you can eat', 12, '\\n', True)
u'All you \\ncan eat'
'''
words = string.split(' ')
if balance:
# balance lines: test if same number of lines
# can be achieved with a shorter line length
lines = wrapString(string, length, separator, False).split(separator)
if len(lines) > 1:
while length > max(map(lambda x : len(x), words)):
length -= 1
if len(wrapString(string, length, separator, False).split(separator)) > len(lines):
length += 1
break
lines = ['']
for word in words:
if len(lines[len(lines) - 1] + word + u' ') <= length + 1:
# word fits in current line
lines[len(lines) - 1] += word + u' ';
else:
if len(word) <= length:
# word fits in next line
lines.append(word + u' ')
else:
# word is longer than line
position = length - len(lines[len(lines) - 1])
lines[len(lines) - 1] += word[0:position]
for i in range(position, len(word), length):
lines.append(word[i:i+length]);
lines[len(lines) - 1] += u' '
return separator.join(lines).strip()
def truncateString(string, length, padding='...', position='right'):
# >>> truncateString('anticonstitutionellement', 16, '...', 'left')
# '...utionellement'
# >>> truncateString('anticonstitutionellement', 16, '...', 'center')
# 'anticon...lement'
# >>> truncateString('anticonstitutionellement', 16, '...', 'right')
# 'anticonstitut...'
stringLength = len(string);
paddingLength = len(padding)
if stringLength > length:
if position == 'left':
string = '%s%s' % (padding, string[stringLength + paddingLength - length:])
elif position == 'center':
left = int(math.ceil(float(length - paddingLength) / 2))
right = int(stringLength - math.floor(float(length - paddingLength) / 2))
string = '%s%s%s' % (string[:left], padding, string[right:])
elif position == 'right':
string = '%s%s' % (string[:length - paddingLength], padding)
return string;
def truncateWords(s, num):
"""Truncates a string after a certain number of chacters, but ends with a word
>>> truncateString('Truncates a string after a certain number of chacters, but ends with a word', 23)
'Truncates a string...'
>>> truncateString('Truncates a string', 23)
'Truncates a string'
"""
length = int(num)
if len(s) <= length:
return s
words = s.split()
ts = ""
while words and len(ts) + len(words[0]) < length:
ts += " " + words.pop(0)
if words:
ts += "..."
return ts.strip()
def trimString(string, num):
"""Truncates a string after a certain number of chacters, adding ... at -10 characters
>>> trimString('Truncates a string after a certain number of chacters', 23)
'Truncates ...f chacters'
>>> trimString('Truncates a string', 23)
'Truncates a string'
"""
if len(string) > num:
string = string[:num - 13] + '...' + string[-10:]
return string
def truncateWords(s, num):
"Truncates a string after a certain number of words."
length = int(num)
words = s.split()
if len(words) > length:
words = words[:length]
if not words[-1].endswith('...'):
words.append('...')
return ' '.join(words)
def getValidFilename(s):
"""
Returns the given string converted to a string that can be used for a clean
filename. Specifically, leading and trailing spaces are removed;
all non-filename-safe characters are removed.
>>> getValidFilename("john's portrait in 2004.jpg")
'john_s_portrait_in_2004.jpg'
"""
s = s.strip()
s = s.replace(' ', '_')
s = re.sub(r'[^-A-Za-z0-9_.\[\]\ ]', '_', s)
s = s.replace('__', '_').replace('__', '_')
return s
def getTextList(list_, last_word='or'):
"""
>>> getTextList([u'a', u'b', u'c', u'd'])
u'a, b, c or d'
>>> getTextList([u'a', u'b', u'c'], 'and')
u'a, b and c'
>>> getTextList([u'a', u'b'], 'and')
u'a and b'
>>> getTextList([u'a'])
u'a'
>>> getTextList([])
''
"""
if len(list_) == 0: return ''
if len(list_) == 1: return list_[0]
return u'%s %s %s' % (u', '.join([unicode(i) for i in list_][:-1]), last_word, list_[-1])
def getListText(text, last_word='or'):
"""
>>> getListText(u'a, b, c or d')
[u'a', u'b', u'c', u'd']
>>> getListText(u'a, b and c', u'and')
[u'a', u'b', u'c']
>>> getListText(u'a and b', u'and')
[u'a', u'b']
>>> getListText(u'a')
[u'a']
>>> getListText(u'')
[]
"""
list_ = []
if text:
list_ = text.split(u', ')
if list_:
i=len(list_)-1
last = list_[i].split(last_word)
if len(last) == 2:
list_[i] = last[0].strip()
list_.append(last[1].strip())
return list_
def normalizeNewlines(text):
return re.sub(r'\r\n|\r|\n', '\n', text)
def recapitalize(text):
"Recapitalizes text, placing caps after end-of-sentence punctuation."
#capwords = ()
text = text.lower()
capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
text = capsRE.sub(lambda x: x.group(1).upper(), text)
#for capword in capwords:
# capwordRE = re.compile(r'\b%s\b' % capword, re.I)
# text = capwordRE.sub(capword, text)
return text
def phone2numeric(phone):
"Converts a phone number with letters into its numeric equivalent."
letters = re.compile(r'[A-PR-Y]', re.I)
char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3',
'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
'y': '9', 'x': '9'}.get(m.group(0).lower())
return letters.sub(char2number, phone)
def compressString(s):
import cStringIO, gzip
zbuf = cStringIO.StringIO()
zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
zfile.write(s)
zfile.close()
return zbuf.getvalue()
smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
def smartSplit(text):
"""
Generator that splits a string by spaces, leaving quoted phrases together.
Supports both single and double quotes, and supports escaping quotes with
backslashes. In the output, strings will keep their initial and trailing
quote marks.
>>> list(smartSplit('This is "a person\\'s" test.'))
['This', 'is', '"a person\\'s"', 'test.']
"""
for bit in smart_split_re.finditer(text):
bit = bit.group(0)
if bit[0] == '"':
yield '"' + bit[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"'
elif bit[0] == "'":
yield "'" + bit[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'"
else:
yield bit

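A few of the string helpers above, with values from the doctests:

import ox

ox.wrapString(u'All you can eat', 12, u'\n', True)   # u'All you \ncan eat'
ox.truncateString('anticonstitutionellement', 16)    # 'anticonstitut...'
ox.getTextList([u'a', u'b', u'c'], 'and')            # u'a, b and c'
list(ox.smartSplit('a "b c" d'))                     # ['a', '"b c"', 'd']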
74
ox/torrent/__init__.py Normal file

@@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2007-2009
from threading import Event
import hashlib
import os
from bencode import bencode, bdecode
__all__ = ['createTorrent', 'getInfoHash', 'getTorrentInfoFromFile', 'getTorrentInfo', 'getFiles', 'getTorrentSize']
def createTorrent(file, url, params = {}, flag = Event(),
progress = lambda x: None, progress_percent = 1):
"Creates a torrent for a given file, using url as tracker url"
from makemetafile import make_meta_file
return make_meta_file(file, url, params, flag, progress, progress_percent)
def getInfoHash(torrentFile):
"Returns Torrent Info Hash from torrent file"
metainfo_file = open(torrentFile, 'rb')
metainfo = bdecode(metainfo_file.read())
info = metainfo['info']
return hashlib.sha1(bencode(info)).hexdigest()
def getTorrentInfoFromFile(torrentFile):
f = open(torrentFile, 'rb')
data = f.read()
f.close()
tinfo = getTorrentInfo(data)
tinfo['timestamp'] = os.stat(torrentFile).st_ctime
return tinfo
def getTorrentInfo(data):
"Returns Torrent Info from torrent file"
tinfo = {}
metainfo = bdecode(data)
info = metainfo['info']
piece_length = info['piece length']
if info.has_key('length'):
# let's assume we just have one file
file_length = info['length']
else:
# let's assume we have a directory structure
file_length = 0;
for f in info['files']:
file_length += f['length']
for key in info:
if key != 'pieces':
tinfo[key] = info[key]
for key in metainfo:
if key != 'info':
tinfo[key] = metainfo[key]
tinfo['size'] = file_length
tinfo['hash'] = hashlib.sha1(bencode(info)).hexdigest()
tinfo['announce'] = metainfo['announce']
return tinfo
def getFiles(data):
files = []
info = getTorrentInfo(data)
if 'files' in info:
for f in info['files']:
path = [info['name'], ]
path.extend(f['path'])
files.append(os.path.join(*path))
else:
files.append(info['name'])
return files
def getTorrentSize(torrentFile):
    "Returns the size of the files in the torrent, in bytes"
    return getTorrentInfoFromFile(torrentFile)['size']

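A sketch of the torrent helpers (file path and tracker URL are placeholders):

import ox.torrent

# writes movie.avi.torrent next to the source file
ox.torrent.createTorrent('movie.avi', 'http://tracker.example.com/announce')

info = ox.torrent.getTorrentInfoFromFile('movie.avi.torrent')
info['size'], info['announce']                # total bytes, tracker URL
ox.torrent.getInfoHash('movie.avi.torrent')   # 40-character hex info hash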
320
ox/torrent/bencode.py Normal file

@@ -0,0 +1,320 @@
# Written by Petru Paler, Uoti Urpala, Ross Cohen and John Hoffman
# see LICENSE.txt for license information
from types import IntType, LongType, StringType, ListType, TupleType, DictType
try:
from types import BooleanType
except ImportError:
BooleanType = None
try:
from types import UnicodeType
except ImportError:
UnicodeType = None
from cStringIO import StringIO
def decode_int(x, f):
f += 1
newf = x.index('e', f)
try:
n = int(x[f:newf])
except:
n = long(x[f:newf])
if x[f] == '-':
if x[f + 1] == '0':
raise ValueError
elif x[f] == '0' and newf != f+1:
raise ValueError
return (n, newf+1)
def decode_string(x, f):
colon = x.index(':', f)
try:
n = int(x[f:colon])
except (OverflowError, ValueError):
n = long(x[f:colon])
if x[f] == '0' and colon != f+1:
raise ValueError
colon += 1
return (x[colon:colon+n], colon+n)
def decode_unicode(x, f):
s, f = decode_string(x, f+1)
return (s.decode('UTF-8'),f)
def decode_list(x, f):
r, f = [], f+1
while x[f] != 'e':
v, f = decode_func[x[f]](x, f)
r.append(v)
return (r, f + 1)
def decode_dict(x, f):
r, f = {}, f+1
lastkey = None
while x[f] != 'e':
k, f = decode_string(x, f)
#why is this needed
#if lastkey >= k:
# raise ValueError
lastkey = k
r[k], f = decode_func[x[f]](x, f)
return (r, f + 1)
decode_func = {}
decode_func['l'] = decode_list
decode_func['d'] = decode_dict
decode_func['i'] = decode_int
decode_func['0'] = decode_string
decode_func['1'] = decode_string
decode_func['2'] = decode_string
decode_func['3'] = decode_string
decode_func['4'] = decode_string
decode_func['5'] = decode_string
decode_func['6'] = decode_string
decode_func['7'] = decode_string
decode_func['8'] = decode_string
decode_func['9'] = decode_string
#decode_func['u'] = decode_unicode
def bdecode(x, sloppy = 1):
try:
r, l = decode_func[x[0]](x, 0)
# except (IndexError, KeyError):
except (IndexError, KeyError, ValueError):
raise ValueError, "bad bencoded data"
if not sloppy and l != len(x):
raise ValueError, "bad bencoded data"
return r
def test_bdecode():
try:
bdecode('0:0:')
assert 0
except ValueError:
pass
try:
bdecode('ie')
assert 0
except ValueError:
pass
try:
bdecode('i341foo382e')
assert 0
except ValueError:
pass
assert bdecode('i4e') == 4L
assert bdecode('i0e') == 0L
assert bdecode('i123456789e') == 123456789L
assert bdecode('i-10e') == -10L
try:
bdecode('i-0e')
assert 0
except ValueError:
pass
try:
bdecode('i123')
assert 0
except ValueError:
pass
try:
bdecode('')
assert 0
except ValueError:
pass
try:
bdecode('i6easd')
assert 0
except ValueError:
pass
try:
bdecode('35208734823ljdahflajhdf')
assert 0
except ValueError:
pass
try:
bdecode('2:abfdjslhfld')
assert 0
except ValueError:
pass
assert bdecode('0:') == ''
assert bdecode('3:abc') == 'abc'
assert bdecode('10:1234567890') == '1234567890'
try:
bdecode('02:xy')
assert 0
except ValueError:
pass
try:
bdecode('l')
assert 0
except ValueError:
pass
assert bdecode('le') == []
try:
bdecode('leanfdldjfh')
assert 0
except ValueError:
pass
assert bdecode('l0:0:0:e') == ['', '', '']
try:
bdecode('relwjhrlewjh')
assert 0
except ValueError:
pass
assert bdecode('li1ei2ei3ee') == [1, 2, 3]
assert bdecode('l3:asd2:xye') == ['asd', 'xy']
assert bdecode('ll5:Alice3:Bobeli2ei3eee') == [['Alice', 'Bob'], [2, 3]]
try:
bdecode('d')
assert 0
except ValueError:
pass
try:
bdecode('defoobar')
assert 0
except ValueError:
pass
assert bdecode('de') == {}
assert bdecode('d3:agei25e4:eyes4:bluee') == {'age': 25, 'eyes': 'blue'}
assert bdecode('d8:spam.mp3d6:author5:Alice6:lengthi100000eee') == {'spam.mp3': {'author': 'Alice', 'length': 100000}}
try:
bdecode('d3:fooe')
assert 0
except ValueError:
pass
try:
bdecode('di1e0:e')
assert 0
except ValueError:
pass
try:
bdecode('d1:b0:1:a0:e')
assert 0
except ValueError:
pass
try:
bdecode('d1:a0:1:a0:e')
assert 0
except ValueError:
pass
try:
bdecode('i03e')
assert 0
except ValueError:
pass
try:
bdecode('l01:ae')
assert 0
except ValueError:
pass
try:
bdecode('9999:x')
assert 0
except ValueError:
pass
try:
bdecode('l0:')
assert 0
except ValueError:
pass
try:
bdecode('d0:0:')
assert 0
except ValueError:
pass
try:
bdecode('d0:')
assert 0
except ValueError:
pass
bencached_marker = []
class Bencached:
def __init__(self, s):
self.marker = bencached_marker
self.bencoded = s
BencachedType = type(Bencached('')) # insufficient, but good as a filter
def encode_bencached(x,r):
assert x.marker == bencached_marker
r.append(x.bencoded)
def encode_int(x,r):
r.extend(('i',str(x),'e'))
def encode_bool(x,r):
encode_int(int(x),r)
def encode_string(x,r):
r.extend((str(len(x)),':',x))
def encode_unicode(x,r):
#r.append('u')
encode_string(x.encode('UTF-8'),r)
def encode_list(x,r):
r.append('l')
for e in x:
encode_func[type(e)](e, r)
r.append('e')
def encode_dict(x,r):
r.append('d')
ilist = x.items()
ilist.sort()
for k,v in ilist:
r.extend((str(len(k)),':',k))
encode_func[type(v)](v, r)
r.append('e')
encode_func = {}
encode_func[BencachedType] = encode_bencached
encode_func[IntType] = encode_int
encode_func[LongType] = encode_int
encode_func[StringType] = encode_string
encode_func[ListType] = encode_list
encode_func[TupleType] = encode_list
encode_func[DictType] = encode_dict
if BooleanType:
encode_func[BooleanType] = encode_bool
if UnicodeType:
encode_func[UnicodeType] = encode_unicode
def bencode(x):
r = []
try:
encode_func[type(x)](x, r)
except:
print "*** error *** could not encode type %s (value: %s)" % (type(x), x)
assert 0
return ''.join(r)
def test_bencode():
assert bencode(4) == 'i4e'
assert bencode(0) == 'i0e'
assert bencode(-10) == 'i-10e'
assert bencode(12345678901234567890L) == 'i12345678901234567890e'
assert bencode('') == '0:'
assert bencode('abc') == '3:abc'
assert bencode('1234567890') == '10:1234567890'
assert bencode([]) == 'le'
assert bencode([1, 2, 3]) == 'li1ei2ei3ee'
assert bencode([['Alice', 'Bob'], [2, 3]]) == 'll5:Alice3:Bobeli2ei3eee'
assert bencode({}) == 'de'
assert bencode({'age': 25, 'eyes': 'blue'}) == 'd3:agei25e4:eyes4:bluee'
assert bencode({'spam.mp3': {'author': 'Alice', 'length': 100000}}) == 'd8:spam.mp3d6:author5:Alice6:lengthi100000eee'
try:
bencode({1: 'foo'})
assert 0
except AssertionError:
pass
try:
import psyco
psyco.bind(bdecode)
psyco.bind(bencode)
except ImportError:
pass

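bencode/bdecode round-trip; dictionary keys are emitted in sorted order, so the encoding is canonical:

from ox.torrent.bencode import bencode, bdecode

data = bencode({'spam': ['a', 'b'], 'size': 12345})
data           # 'd4:sizei12345e4:spaml1:a1:bee'
bdecode(data)  # {'spam': ['a', 'b'], 'size': 12345}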
100
ox/torrent/btformats.py Normal file

@@ -0,0 +1,100 @@
# Written by Bram Cohen
# see LICENSE.txt for license information
from types import StringType, LongType, IntType, ListType, DictType
from re import compile
reg = compile(r'^[^/\\.~][^/\\]*$')
ints = (LongType, IntType)
def check_info(info):
if type(info) != DictType:
raise ValueError, 'bad metainfo - not a dictionary'
pieces = info.get('pieces')
if type(pieces) != StringType or len(pieces) % 20 != 0:
raise ValueError, 'bad metainfo - bad pieces key'
piecelength = info.get('piece length')
if type(piecelength) not in ints or piecelength <= 0:
raise ValueError, 'bad metainfo - illegal piece length'
name = info.get('name')
if type(name) != StringType:
raise ValueError, 'bad metainfo - bad name'
if not reg.match(name):
raise ValueError, 'name %s disallowed for security reasons' % name
if info.has_key('files') == info.has_key('length'):
raise ValueError, 'single/multiple file mix'
if info.has_key('length'):
length = info.get('length')
if type(length) not in ints or length < 0:
raise ValueError, 'bad metainfo - bad length'
else:
files = info.get('files')
if type(files) != ListType:
raise ValueError
for f in files:
if type(f) != DictType:
raise ValueError, 'bad metainfo - bad file value'
length = f.get('length')
if type(length) not in ints or length < 0:
raise ValueError, 'bad metainfo - bad length'
path = f.get('path')
if type(path) != ListType or path == []:
raise ValueError, 'bad metainfo - bad path'
for p in path:
if type(p) != StringType:
raise ValueError, 'bad metainfo - bad path dir'
if not reg.match(p):
raise ValueError, 'path %s disallowed for security reasons' % p
for i in xrange(len(files)):
for j in xrange(i):
if files[i]['path'] == files[j]['path']:
raise ValueError, 'bad metainfo - duplicate path'
def check_message(message):
if type(message) != DictType:
raise ValueError
check_info(message.get('info'))
if type(message.get('announce')) != StringType:
raise ValueError
def check_peers(message):
if type(message) != DictType:
raise ValueError
if message.has_key('failure reason'):
if type(message['failure reason']) != StringType:
raise ValueError
return
peers = message.get('peers')
if type(peers) == ListType:
for p in peers:
if type(p) != DictType:
raise ValueError
if type(p.get('ip')) != StringType:
raise ValueError
port = p.get('port')
            if type(port) not in ints or port <= 0:
raise ValueError
if p.has_key('peer id'):
id = p['peer id']
if type(id) != StringType or len(id) != 20:
raise ValueError
elif type(peers) != StringType or len(peers) % 6 != 0:
raise ValueError
interval = message.get('interval', 1)
if type(interval) not in ints or interval <= 0:
raise ValueError
minint = message.get('min interval', 1)
if type(minint) not in ints or minint <= 0:
raise ValueError
if type(message.get('tracker id', '')) != StringType:
raise ValueError
npeers = message.get('num peers', 0)
if type(npeers) not in ints or npeers < 0:
raise ValueError
dpeers = message.get('done peers', 0)
if type(dpeers) not in ints or dpeers < 0:
raise ValueError
last = message.get('last', 0)
if type(last) not in ints or last < 0:
raise ValueError

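check_message validates a decoded .torrent and raises ValueError on bad input; a sketch (the path is a placeholder):

from ox.torrent.bencode import bdecode
from ox.torrent.btformats import check_message

data = open('movie.avi.torrent', 'rb').read()
check_message(bdecode(data))   # raises ValueError if the metainfo is malformed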
270
ox/torrent/makemetafile.py Normal file

@@ -0,0 +1,270 @@
# Written by Bram Cohen
# multitracker extensions by John Hoffman
# see LICENSE.txt for license information
from os.path import getsize, split, join, abspath, isdir
from os import listdir
from hashlib import sha1 as sha
from copy import copy
from string import strip
from bencode import bencode
from btformats import check_info
from threading import Event
from time import time
from traceback import print_exc
try:
from sys import getfilesystemencoding
ENCODING = getfilesystemencoding()
except:
from sys import getdefaultencoding
ENCODING = getdefaultencoding()
defaults = [
('announce_list', '',
'a list of announce URLs - explained below'),
('httpseeds', '',
'a list of http seed URLs - explained below'),
('piece_size_pow2', 0,
"which power of 2 to set the piece size to (0 = automatic)"),
('comment', '',
"optional human-readable comment to put in .torrent"),
('filesystem_encoding', '',
"optional specification for filesystem encoding " +
"(set automatically in recent Python versions)"),
('target', '',
"optional target file for the torrent")
]
default_piece_len_exp = 18
ignore = ['core', 'CVS']
def print_announcelist_details():
print (' announce_list = optional list of redundant/backup tracker URLs, in the format:')
print (' url[,url...][|url[,url...]...]')
print (' where URLs separated by commas are all tried first')
print (' before the next group of URLs separated by the pipe is checked.')
print (" If none is given, it is assumed you don't want one in the metafile.")
print (' If announce_list is given, clients which support it')
print (' will ignore the <announce> value.')
print (' Examples:')
print (' http://tracker1.com|http://tracker2.com|http://tracker3.com')
print (' (tries trackers 1-3 in order)')
print (' http://tracker1.com,http://tracker2.com,http://tracker3.com')
print (' (tries trackers 1-3 in a randomly selected order)')
print (' http://tracker1.com|http://backup1.com,http://backup2.com')
print (' (tries tracker 1 first, then tries between the 2 backups randomly)')
print ('')
print (' httpseeds = optional list of http-seed URLs, in the format:')
print (' url[|url...]')
def make_meta_file(file, url, params = {}, flag = Event(),
progress = lambda x: None, progress_percent = 1):
if params.has_key('piece_size_pow2'):
piece_len_exp = params['piece_size_pow2']
else:
piece_len_exp = default_piece_len_exp
if params.has_key('target') and params['target'] != '':
f = params['target']
else:
a, b = split(file)
if b == '':
f = a + '.torrent'
else:
f = join(a, b + '.torrent')
if piece_len_exp == 0: # automatic
size = calcsize(file)
if size > 8L*1024*1024*1024: # > 8 gig =
piece_len_exp = 21 # 2 meg pieces
elif size > 2*1024*1024*1024: # > 2 gig =
piece_len_exp = 20 # 1 meg pieces
elif size > 512*1024*1024: # > 512M =
piece_len_exp = 19 # 512K pieces
elif size > 64*1024*1024: # > 64M =
piece_len_exp = 18 # 256K pieces
elif size > 16*1024*1024: # > 16M =
piece_len_exp = 17 # 128K pieces
elif size > 4*1024*1024: # > 4M =
piece_len_exp = 16 # 64K pieces
else: # < 4M =
piece_len_exp = 15 # 32K pieces
piece_length = 2 ** piece_len_exp
encoding = None
if params.has_key('filesystem_encoding'):
encoding = params['filesystem_encoding']
if not encoding:
encoding = ENCODING
if not encoding:
encoding = 'ascii'
info = makeinfo(file, piece_length, encoding, flag, progress, progress_percent)
if flag.isSet():
return
check_info(info)
h = open(f, 'wb')
data = {'info': info, 'announce': strip(url), 'creation date': long(time())}
if params.has_key('comment') and params['comment']:
data['comment'] = params['comment']
if params.has_key('real_announce_list'): # shortcut for progs calling in from outside
data['announce-list'] = params['real_announce_list']
elif params.has_key('announce_list') and params['announce_list']:
l = []
for tier in params['announce_list'].split('|'):
l.append(tier.split(','))
data['announce-list'] = l
if params.has_key('real_httpseeds'): # shortcut for progs calling in from outside
data['httpseeds'] = params['real_httpseeds']
elif params.has_key('httpseeds') and params['httpseeds']:
data['httpseeds'] = params['httpseeds'].split('|')
if params.has_key('url-list') and params['url-list']:
data['url-list'] = params['url-list'].split('|')
if params.has_key('playtime') and params['playtime']:
data['info']['playtime'] = params['playtime']
h.write(bencode(data))
h.close()
def calcsize(file):
if not isdir(file):
return getsize(file)
total = 0L
for s in subfiles(abspath(file)):
total += getsize(s[1])
return total
def uniconvertl(l, e):
r = []
try:
for s in l:
r.append(uniconvert(s, e))
except UnicodeError:
raise UnicodeError('bad filename: '+join(*l))
return r
def uniconvert(s, e):
try:
if s.__class__.__name__ != 'unicode':
s = unicode(s,e)
except UnicodeError:
raise UnicodeError('bad filename: '+s)
return s.encode('utf-8')
def makeinfo(file, piece_length, encoding, flag, progress, progress_percent=1):
file = abspath(file)
if isdir(file):
subs = subfiles(file)
subs.sort()
pieces = []
sh = sha()
done = 0L
fs = []
totalsize = 0.0
totalhashed = 0L
for p, f in subs:
totalsize += getsize(f)
for p, f in subs:
pos = 0L
size = getsize(f)
fs.append({'length': size, 'path': uniconvertl(p, encoding)})
h = open(f, 'rb')
while pos < size:
a = min(size - pos, piece_length - done)
sh.update(h.read(a))
if flag.isSet():
return
done += a
pos += a
totalhashed += a
if done == piece_length:
pieces.append(sh.digest())
done = 0
sh = sha()
if progress_percent:
progress(totalhashed / totalsize)
else:
progress(a)
h.close()
if done > 0:
pieces.append(sh.digest())
return {'pieces': ''.join(pieces),
'piece length': piece_length, 'files': fs,
'name': uniconvert(split(file)[1], encoding) }
else:
size = getsize(file)
pieces = []
p = 0L
h = open(file, 'rb')
while p < size:
x = h.read(min(piece_length, size - p))
if flag.isSet():
return
pieces.append(sha(x).digest())
p += piece_length
if p > size:
p = size
if progress_percent:
progress(float(p) / size)
else:
progress(min(piece_length, size - p))
h.close()
return {'pieces': ''.join(pieces),
'piece length': piece_length, 'length': size,
'name': uniconvert(split(file)[1], encoding) }
def subfiles(d):
r = []
stack = [([], d)]
while len(stack) > 0:
p, n = stack.pop()
if isdir(n):
for s in listdir(n):
if s not in ignore and s[:1] != '.':
stack.append((copy(p) + [s], join(n, s)))
else:
r.append((p, n))
return r
def completedir(dir, url, params = {}, flag = Event(),
vc = lambda x: None, fc = lambda x: None):
files = listdir(dir)
files.sort()
ext = '.torrent'
if params.has_key('target'):
target = params['target']
else:
target = ''
togen = []
for f in files:
if f[-len(ext):] != ext and (f + ext) not in files:
togen.append(join(dir, f))
total = 0
for i in togen:
total += calcsize(i)
subtotal = [0]
def callback(x, subtotal = subtotal, total = total, vc = vc):
subtotal[0] += x
vc(float(subtotal[0]) / total)
for i in togen:
fc(i)
try:
t = split(i)[-1]
if t not in ignore and t[0] != '.':
if target != '':
params['target'] = join(target,t+ext)
make_meta_file(i, url, params, flag, progress = callback, progress_percent = 0)
except ValueError:
print_exc()
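
make_meta_file picks a piece size from the table above when piece_size_pow2 is 0; completedir writes one .torrent per file in a directory. A sketch (paths and tracker URL are placeholders):

from ox.torrent.makemetafile import make_meta_file, completedir

# single file; piece size chosen automatically from the file size
make_meta_file('movie.avi', 'http://tracker.example.com/announce',
               {'comment': 'test', 'piece_size_pow2': 0})

# one .torrent per file, with progress callbacks
completedir('/media/incoming', 'http://tracker.example.com/announce',
            vc=lambda fraction: None,    # overall progress, 0.0 to 1.0
            fc=lambda filename: None)    # called before each file is hashed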