add ox.web to this repo

j 2010-07-08 01:25:57 +02:00
parent 0d354d2574
commit 06d61943ac
29 changed files with 2123 additions and 9 deletions

README
@@ -1,22 +1,37 @@
python-oxlib some tools to build tools
python-ox some tools to build tools
Depends:
python2.5
python-chardet (http://chardet.feedparser.org/)
python-feedparser (http://www.feedparser.org/)
python-beautifulsoup (http://www.crummy.com/software/BeautifulSoup/)
Usage:
import oxlib
import ox
data = oxlib.cache.readUrl('http:/...')
text = oxlib.stripTags(data)
oxlib.normalizeNewlines(text)
oxlib.formatBytes(len(data))
data = ox.cache.readUrl('http:/...')
text = ox.stripTags(data)
ox.normalizeNewlines(text)
ox.formatBytes(len(data))
oxlib.formatBytes(1234567890)
ox.formatBytes(1234567890)
'1.15 GB'
import ox.web.imdb
imdbId = ox.web.imdb.guess('The Matrix')
info = ox.web.imdb.Imdb(imdbId)
info['year']
1999
Install:
python setup.py install
Cookies:
some ox.web modules require user account information or cookies to work;
these are saved in ~/.ox/auth.json, and the most basic form looks like this
(a usage example follows):
{
"key": "value"
}
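values can then be read with ox.web.auth, for example (assuming a key named
"karagarga.cookie" has been added to auth.json):
import ox.web.auth
cookie = ox.web.auth.get('karagarga.cookie')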
Tests:
nosetests --with-doctest oxlib
nosetests --with-doctest ox

@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
__version__ = '1.0.0'
__version__ = '2.0.0'
from file import *
from format import *

ox/web/__init__.py Normal file
@@ -0,0 +1,9 @@
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
__version__ = '1.0.0'
import imdb
import wikipedia
import google
import piratecinema
import oxdb

ox/web/aaaarg.py Normal file
@@ -0,0 +1,61 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import os
import string
from ox import cache
from ox.html import stripTags, decodeHtml
from ox.text import findRe
from ox.normalize import canonicalName
import auth
def readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
headers = headers.copy()
headers["Cookie"] = auth.get("aaaarg.cookie")
return cache.readUrl(url, data, headers, timeout)
def readUrlUnicode(url, timeout=cache.cache_timeout):
return cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)
def downloadText(id, filename=None):
#FIXME, what about the cache, this keeps all pdfs in oxcache...
url='http://a.aaaarg.org/node/%d/download' % id
data = readUrl(url, timeout=-1)
headers = cache.getHeaders(url, timeout=-1)
if filename:
with open(filename, "w") as f:
f.write(data)
return
return data
def getTextByLetter(letter):
texts = []
url = 'http://a.aaaarg.org/library/%s' % letter
data = readUrlUnicode(url)
txts = re.compile('<li class="author">(.*?)</li><li class="title"><a href="(.*?)">(.*?)</a></li>').findall(data)
author = 'Unknown Author'
for r in txts:
if r[0] != '&nbsp;':
author = r[0]
link = r[1]
id = findRe(link, '/(\d+)')
title = decodeHtml(r[2])
author_folder = canonicalName(author)
author_folder = os.path.join(author_folder[0], author_folder)
filename = os.path.join(author_folder, '%s (aaarg %s).pdf' % (title.replace('/', '_'), id))
texts.append({
'author': author,
'title': title,
'id': id,
'filename': filename,
})
return texts
def getTexts():
texts = []
for letter in string.letters[:26]:
texts += getTextByLetter(letter)
return texts

ox/web/allmovie.py Normal file
@@ -0,0 +1,78 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import time
from ox import stripTags, findRe
from ox.cache import readUrlUnicode
def getId(url):
return url.split("/")[-2]
def getData(id):
'''
>>> getData('129689')['cast'][1][1]
u'Marianne'
>>> getData('129689')['credits'][0][0]
u'Jean-Luc Godard'
>>> getData('129689')['posters'][0]
u'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg'
>>> getData('129689')['rating']
u'4.5'
'''
data = {
"url": getUrl(id)
}
html = readUrlUnicode(data["url"])
data['aka'] = parseList(html, 'AKA')
data['category'] = findRe(html, 'http://allmovie.com/explore/category/.*?">(.*?)</a>')
data['countries'] = parseList(html, 'Countries')
data['director'] = parseEntry(html, 'Director')
data['genres'] = parseList(html, 'Genres')
data['keywords'] = parseList(html, 'Keywords')
data['posters'] = [findRe(html, '<img src="(http://image\..*?)"')]
data['produced'] = parseList(html, 'Produced by')
data['rating'] = findRe(html, 'Stars" title="(.*?) Stars"')
data['released'] = parseEntry(html, 'Released by')
data['releasedate'] = parseEntry(html, 'Release')[0:10].replace(' ', '-')
data['runtime'] = findRe(html, '<td class="formed-sub" style="width: 86px;">(\d+) min.</td>')
data['set'] = parseEntry(html, 'Set In')
data['synopsis'] = parseText(html, 'Plot Synopsis')
data['themes'] = parseList(html, 'Themes')
data['types'] = parseList(html, 'Types')
data['year'] = findRe(html, '"http://allmovie.com/explore/year/(.*?)"')
html = readUrlUnicode("http://allmovie.com/work/%s/cast" % id)
data['cast'] = parseTable(html)
html = readUrlUnicode("http://allmovie.com/work/%s/credits" % id)
data['credits'] = parseTable(html)
html = readUrlUnicode("http://allmovie.com/work/%s/review" % id)
data['review'] = parseText(html, 'Review')
return data
def getUrl(id):
return "http://allmovie.com/work/%s/" % id
def parseEntry(html, title):
return stripTags(findRe(html, '<span>%s</span>(.*?)</table>' % title)).strip()
def parseList(html, title):
html = findRe(html, '<span>%s</span>(.*?)</table>' % title)
return map(lambda x: stripTags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
def parseTable(html):
return map(
lambda x: map(
lambda x: stripTags(x).strip().replace('&nbsp;', ''),
x.split('<td width="305">-')
),
findRe(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
)
def parseText(html, title):
return stripTags(findRe(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
if __name__ == '__main__':
print getData('129689')
# print getData('177524')

ox/web/auth.py Normal file
@@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2009
import os
import simplejson
def get(key):
user_auth = os.environ.get('oxAUTH', os.path.expanduser('~/.ox/auth.json'))
auth = {}
if os.path.exists(user_auth):
f = open(user_auth, "r")
data = f.read()
f.close()
auth = simplejson.loads(data)
if key in auth:
return auth[key]
print "please add key %s to json file '%s'" % (key, user_auth)
return ""

ox/web/criterion.py Normal file
@@ -0,0 +1,90 @@
# -*- coding: UTF-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import ox.cache
from ox.cache import readUrlUnicode
from ox.html import stripTags
from ox.text import findRe, removeSpecialCharacters
import imdb
def getId(url):
return url.split("/")[-1]
def getUrl(id):
return "http://www.criterion.com/films/%s" % id
def getData(id):
'''
>>> getData('1333')['imdbId']
'0060304'
>>> getData('236')['posters'][0]
'http://criterion_production.s3.amazonaws.com/release_images/1586/ThirdManReplace.jpg'
>>> getData('786')['posters'][0]
'http://criterion_production.s3.amazonaws.com/product_images/185/343_box_348x490.jpg'
'''
data = {
"url": getUrl(id)
}
try:
html = readUrlUnicode(data["url"])
except:
html = ox.cache.getUrl(data["url"])
data["number"] = findRe(html, "<p class=\"spinenumber\">(.*?)</p>")
data["title"] = findRe(html, "<h2 class=\"movietitle\">(.*?)</h2>")
data["director"] = findRe(html, "<h2 class=\"director\">(.*?)</h2>")
results = re.compile("<p><strong>(.*?)</strong></p>").findall(html)
data["country"] = results[0]
data["year"] = results[1]
result = findRe(html, "<div class=\"synopsis contentbox lightgray\">(.*?)</div>")
data["synopsis"] = findRe(result, "<p>(.*?)</p>")
result = findRe(html, "<div class=\"editioninfo\">(.*?)</div>")
if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
result = re.compile("<div class=\"editioninfo\">(.*?)</div>", re.DOTALL).findall(html)[1]
result = findRe(result, "<a href=\"(.*?)\">")
if not "/boxsets/" in result:
data["posters"] = [result]
else:
html_ = readUrlUnicode(result)
result = findRe(html_, "<a href=\"http://www.criterion.com/films/%s\">(.*?)</a>" % id)
result = findRe(result, "src=\"(.*?)\"")
data["posters"] = [result.replace("_w100", "")]
result = findRe(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
if result:
data["stills"] = [result]
data["trailers"] = []
else:
data["stills"] = [findRe(html, "\"thumbnailURL\", \"(.*?)\"")]
data["trailers"] = [findRe(html, "\"videoURL\", \"(.*?)\"")]
data['imdbId'] = imdb.getMovieId(data['title'], data['director'], data['year'])
return data
def getIds():
ids = []
html = readUrlUnicode("http://www.criterion.com/library/dvd")
results = re.compile("page=(.*?)\"").findall(html)
pages = int(results[len(results) - 2])
for page in range(pages, 0, -1):
for id in getIdsByPage(page):
ids.append(id)
return map(lambda id: str(id), sorted(map(lambda id: int(id), set(ids))))
def getIdsByPage(page):
ids = []
html = readUrlUnicode("http://www.criterion.com/library/dvd?page=%s" % page)
results = re.compile("films/(.*?)\"").findall(html)
for result in results:
ids.append(result)
results = re.compile("boxsets/(.*?)\"").findall(html)
for result in results:
html = readUrlUnicode("http://www.criterion.com/boxsets/" + result)
results = re.compile("films/(.*?)\"").findall(html)
for result in results:
ids.append(result)
return set(ids)
if __name__ == '__main__':
print getIds()

ox/web/dailymotion.py Normal file
@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from urllib import unquote
from ox.cache import readUrl
def getVideoUrl(url):
'''
>>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?key')[0]
'http://www.dailymotion.com/get/16/320x240/flv/6191379.flv'
>>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?key')[0]
'http://www.dailymotion.com/get/15/320x240/flv/6197800.flv'
'''
data = readUrl(url)
video = re.compile('''video", "(.*?)"''').findall(data)
for v in video:
v = unquote(v).split('@@')[0]
return "http://www.dailymotion.com" + v
return ''

ox/web/epguides.py Normal file
@@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import time
from ox import stripTags, findRe
from ox.cache import readUrlUnicode
import google
def getShowUrl(title):
'''
Search the epguides.com URL for a show via its title.
Google is used to find the URL; epguides.com itself does the same.
'''
for (name, url, desc) in google.find('allintitle: site:epguides.com %s' % title, 1):
if url.startswith('http://epguides.com'):
if re.search(title, name):
return url
return None
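# Illustration only (hypothetical result, it depends on what Google returns):
# getShowUrl('Lost') is expected to yield a show page on epguides.com, e.g.
# 'http://epguides.com/Lost/'.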
def getShowData(url):
data = readUrlUnicode(url)
r = {}
r['title'] = stripTags(findRe(data, '<h1>(.*?)</h1>'))
r['imdb'] = findRe(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
r['episodes'] = {}
#1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear
for episode in re.compile('(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) <a target="_blank" href="(.*?)">(.*?)</a>').findall(data):
air_date = episode[3].strip()
#'22 Sep 04' -> 2004-09-22
try:
air_date = time.strftime('%Y-%m-%d', time.strptime(air_date, '%d %b %y'))
except:
pass
s = episode[1].split('-')[0].strip()
e = episode[1].split('-')[-1].strip()
try:
r['episodes']['S%02dE%02d' % (int(s), int(e))] = {
'prod code': episode[2],
'air date': air_date,
'url': episode[4],
'title':episode[5],
}
except:
print "oxweb.epguides failed,", url
return r

ox/web/google.py Normal file
@@ -0,0 +1,57 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import time
import urllib
import urllib2
import weakref
import threading
import Queue
import simplejson
import ox
from ox import stripTags
'''
usage:
import google
google.find(query)
for result in google.find(query): result
result is title, url, description
google.find(query, max_results)
FIXME: how to search deeper than the first page?
'''
DEFAULT_MAX_RESULTS = 10
DEFAULT_TIMEOUT = 24*60*60
def readUrl(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT):
return ox.cache.readUrl(url, data, headers, timeout)
def quote_plus(s):
return urllib.quote_plus(s.encode('utf-8'))
def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
url = "http://www.google.com/search?q=%s" % quote_plus(query)
data = readUrl(url, timeout=timeout)
link_re = r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' + \
r'.*?(?:<br>|<table.*?>)' + \
r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)'
results = []
for match in re.compile(link_re, re.DOTALL).finditer(data):
(name, url, desc) = match.group('name', 'url', 'desc')
results.append((stripTags(name), url, stripTags(desc)))
if len(results) > max_results:
results = results[:max_results]
return results
def _find(query):
url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=%s' % quote_plus(query)
results = simplejson.loads(ox.cache.readUrlUnicode(url))['responseData']['results']
return results

ox/web/imdb.py Normal file
@@ -0,0 +1,210 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import urllib2
from urllib import quote, unquote
import re
import os
import time
import ox
from ox import findRe
from ox.normalize import normalizeTitle, normalizeImdbId
from siteparser import SiteParser
import google
class Imdb(SiteParser):
regex = {
'cast': {
'page': 'combined',
're': '<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
'type': 'list'
},
'cinematographers': {
'page': 'combined',
're': [
'Cinematography by</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>'
],
'type': 'list'
},
'connections': {
'page': 'movieconnections',
're': '<h5>(.*?)</h5>(.*?)\n\n',
'type': 'list'
},
'countries': {
'page': 'combined',
're': '<a href="/Sections/Countries/.*?/">(.*?)</a>',
'type': 'list'
},
'directors': {
'page': 'combined',
're': [
'Directed by</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>'
],
'type': 'list'
},
'editors': {
'page': 'combined',
're': [
'Film Editing by</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>'
],
'type': 'list'
},
'filming_locations': {
'page': 'locations',
're': '<a href="/search/title\?locations=.*?">(.*?)</a>',
'type': 'list'
},
'genres': {
'page': 'combined',
're': '<a href="/Sections/Genres/.*?/">(.*?)</a>',
'type': 'list'
},
'keywords': {
'page': 'keywords',
're': '<a href="/keyword/.*?/">(.*?)</a>',
'type': 'list'
},
'languages': {
'page': 'combined',
're': '<a href="/Sections/Languages/.*?/">(.*?)</a>',
'type': 'list'
},
'plot': {
'page': 'plotsummary',
're': '<p class="plotpar">(.*?)<i>',
'type': 'string'
},
'poster_id': {
'page': 'combined',
're': '/primary-photo/media/rm(.*?)/tt',
'type': 'list'
},
'poster_ids': {
'page': 'posters',
're': '/unknown-thumbnail/media/rm(.*?)/tt',
'type': 'list'
},
'producers': {
'page': 'combined',
're': [
'Produced by</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>'
],
'type': 'list'
},
'rating': {
'page': 'combined',
're': '<div class="starbar-meta">.*?<b>(.*?)/10</b>',
'type': 'float'
},
'release_date': {
'page': 'releaseinfo',
're': '<a href="/date/(\d{2})-(\d{2})/">.*?</a> <a href="/year/(\d{4})/">',
'type': 'date'
},
'runtime': {
'page': 'combined',
're': '<h5>Runtime:</h5><div class="info-content">.*?([0-9]+ sec|[0-9]+ min).*?</div>',
'type': 'string'
},
'title': {
'page': 'combined',
're': '<h1>(.*?) <span>',
'type': 'string'
},
'trivia': {
'page': 'trivia',
're': '<div class="sodatext">(.*?)<br>',
'type': 'list',
},
'votes': {
'page': 'combined',
're': '<a href="ratings" class="tn15more">(.*?) votes</a>',
'type': 'string'
},
'writers': {
'page': 'combined',
're': [
'Writing credits</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>'
],
'type': 'list'
},
'year': {
'page': 'combined',
're': '<a href="/year/(\d{4})/">',
'type': 'int'
}
}
def __init__(self, id):
self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
super(Imdb, self).__init__()
if 'runtime' in self:
if 'min' in self['runtime']: base=60
else: base=1
self['runtime'] = int(findRe(self['runtime'], '([0-9]+)')) * base
if 'connections' in self:
cc={}
for rel, data in self['connections']:
cc[unicode(rel)] = re.compile('<a href="/title/tt(\d{7})/">').findall(data)
self['connections'] = cc
def guess(title, director='', timeout=google.DEFAULT_TIMEOUT):
#FIXME: proper file -> title
title = title.split('-')[0]
title = title.split('(')[0]
title = title.split('.')[0]
title = title.strip()
imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))
return_url = ''
#let's try google first
#i.e. site:imdb.com Michael Stevens Sin
if director:
search = 'site:imdb.com %s "%s"' % (director, title)
else:
search = 'site:imdb.com "%s"' % title
for (name, url, desc) in google.find(search, 2, timeout=timeout):
if url.startswith('http://www.imdb.com/title/tt'):
return normalizeImdbId(int(ox.intValue(url)))
try:
req = urllib2.Request(imdb_url, None, ox.net.DEFAULT_HEADERS)
u = urllib2.urlopen(req)
data = u.read()
return_url = u.url
u.close()
except:
return None
if return_url.startswith('http://www.imdb.com/title/tt'):
return return_url[28:35]
if data:
imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
if imdb_id:
return imdb_id
imdb_url = 'http://www.imdb.com/find?q=%s;s=tt;site=aka' % quote(title.encode('utf-8'))
req = urllib2.Request(imdb_url, None, ox.net.DEFAULT_HEADERS)
u = urllib2.urlopen(req)
data = u.read()
return_url = u.url
u.close()
if return_url.startswith('http://www.imdb.com/title/tt'):
return return_url[28:35]
return None
if __name__ == "__main__":
import json
print json.dumps(Imdb('0306414'), indent=2)
#print json.dumps(Imdb('0133093'), indent=2)

ox/web/impawards.py Normal file
@@ -0,0 +1,84 @@
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
import re
from ox.cache import readUrlUnicode
from ox.html import stripTags
from ox.text import findRe
import imdb
def getData(id):
'''
>>> getData('1991/silence_of_the_lambs')['imdbId']
u'0102926'
>>> getData('1991/silence_of_the_lambs')['posters'][0]
u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1_xlg.jpg'
>>> getData('1991/silence_of_the_lambs')['url']
u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
'''
data = {
'url': getUrl(id)
}
html = readUrlUnicode(data['url'])
data['imdbId'] = findRe(html, 'imdb.com/title/tt(.*?) ')
data['title'] = stripTags(findRe(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
data['year'] = findRe(html, '\(<a href="alpha1.html">(.*?)</a>\)')
data['posters'] = []
results = re.compile('<a href = (%s.*?html)' % id[5:], re.DOTALL).findall(html)
for result in results:
result = result.replace('_xlg.html', '.html')
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
html = readUrlUnicode(url)
result = findRe(html, '<a href = (\w*?_xlg.html)')
if result:
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
html = readUrlUnicode(url)
poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img SRC="(.*?)"'))
else:
poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img src="(posters.*?)" alt='))
data['posters'].append(poster)
return data
def getId(url):
split = url.split('/')
year = split[3]
split = split[4][:-5].split('_')
if split[-1] == 'xlg':
split.pop()
if findRe(split[-1], 'ver\d+$'):
split.pop()
id = '%s/%s' % (year, '_'.join(split))
return id
def getIds():
ids = []
html = readUrlUnicode('http://www.impawards.com/archives/latest.html', timeout = 60*60)
pages = int(findRe(html, '<a href= page(.*?).html>')) + 1
for page in range(pages, 0, -1):
for id in getIdsByPage(page):
if not id in ids:
ids.append(id)
return ids
def getIdsByPage(page):
ids = []
html = readUrlUnicode('http://www.impawards.com/archives/page%s.html' % page, timeout = -1)
results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
for result in results:
url = 'http://impawards.com/%s' % result
ids.append(getId(url))
return set(ids)
def getUrl(id):
url = "http://www.impawards.com/%s.html" % id
html = readUrlUnicode(url)
if findRe(html, "No Movie Posters on This Page"):
url = "http://www.impawards.com/%s_ver1.html" % id
return url
if __name__ == '__main__':
ids = getIds()
print sorted(ids), len(ids)

ox/web/itunes.py Normal file
@@ -0,0 +1,187 @@
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
import re
import urllib
from ox.cache import readUrl
from ox.html import decodeHtml, stripTags
from ox.text import findRe
from ox.text import findString
# to sniff itunes traffic, use something like
# sudo tcpdump -i en1 -Avs 8192 host appleglobal.112.2o7.net
# http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?media=music&songTerm=&genreIndex=1&flavor=0&mediaType=2&composerTerm=&allArtistNames=Arcadia&ringtone=0&searchButton=submit
# http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?media=movie&movieTerm=The%20Matrix&descriptionTerm=&ratingIndex=1&mediaType=3&directorProducerName=Andy%20Wachowski&flavor=0&releaseYearTerm=1999&closedCaption=0&actorTerm=&searchButton=submit
ITUNES_HEADERS = {
'X-Apple-Tz': '0',
'X-Apple-Storefront': '143441-1',
'User-Agent': 'iTunes/7.6.2 (Macintosh; U; Intel Mac OS X 10.5.2)',
'Accept-Language': 'en-us, en;q=0.50',
'Accept-Encoding': 'gzip',
'Connection': 'close',
}
def composeUrl(request, parameters):
if request == 'advancedSearch':
url = 'http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?'
if parameters['media'] == 'music':
url += urllib.urlencode({
'albumTerm': parameters['title'],
'allArtistNames': parameters['artist'],
'composerTerm': '',
'flavor': 0,
'genreIndex': 1,
'media': 'music',
'mediaType': 2,
'ringtone': 0,
'searchButton': 'submit',
'songTerm': ''
})
elif parameters['media'] == 'movie':
url += urllib.urlencode({
'actorTerm': '',
'closedCaption': 0,
'descriptionTerm': '',
'directorProducerName': parameters['director'],
'flavor': 0,
'media': 'movie',
'mediaType': 3,
'movieTerm': parameters['title'],
'ratingIndex': 1,
'releaseYearTerm': '',
'searchButton': 'submit'
})
elif request == 'viewAlbum':
url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewAlbum?id=%s' % parameters['id']
elif request == 'viewMovie':
url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewMovie?id=%s&prvw=1' % parameters['id']
return url
def parseXmlDict(xml):
values = {}
strings = xml.split('<key>')
for string in strings:
if string.find('</key>') != -1:
key = findRe(string, '(.*?)</key>')
type = findRe(string, '</key><(.*?)>')
if type == 'true/':
value = True
else:
value = findRe(string, '<%s>(.*?)</%s>' % (type, type))
if type == 'integer':
value = int(value)
elif type == 'string':
value = decodeHtml(value)
values[key] = value
return values
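# Illustration (assumed input): given a plist-style fragment such as
# '<key>itemName</key><string>So Red the Rose</string><key>trackNumber</key><integer>1</integer>'
# parseXmlDict returns roughly {'itemName': u'So Red the Rose', 'trackNumber': 1}.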
def parseCast(xml, title):
list = []
try:
strings = findRe(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
strings.pop()
for string in strings:
list.append(findRe(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
return list
except:
return list
def parseMovies(xml, title):
list = []
try:
strings = findRe(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
strings.pop()
for string in strings:
list.append({
'id': findRe(string, 'viewMovie\?id=(.*?)&'),
'title': findRe(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
})
return list
except:
return list
class ItunesAlbum:
def __init__(self, id = '', title = '', artist = ''):
self.id = id
self.title = title
self.artist = artist
if not id:
self.id = self.getId()
def getId(self):
url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
xml = readUrl(url, headers = ITUNES_HEADERS)
id = findRe(xml, 'viewAlbum\?id=(.*?)&')
return id
def getData(self):
data = {'id': self.id}
url = composeUrl('viewAlbum', {'id': self.id})
xml = readUrl(url, None, ITUNES_HEADERS)
data['albumName'] = findRe(xml, '<B>(.*?)</B>')
data['artistName'] = findRe(xml, '<b>(.*?)</b>')
data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
data['genre'] = findRe(xml, 'Genre:(.*?)<')
data['releaseDate'] = findRe(xml, 'Released(.*?)<')
data['review'] = stripTags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
data['tracks'] = []
strings = findRe(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
for string in strings:
data['tracks'].append(parseXmlDict(string))
data['type'] = findRe(xml, '<key>listType</key><string>(.*?)<')
return data
class ItunesMovie:
def __init__(self, id = '', title = '', director = ''):
self.id = id
self.title = title
self.director = director
if not id:
self.id = self.getId()
def getId(self):
url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
xml = readUrl(url, headers = ITUNES_HEADERS)
id = findRe(xml, 'viewMovie\?id=(.*?)&')
return id
def getData(self):
data = {'id': self.id}
url = composeUrl('viewMovie', {'id': self.id})
xml = readUrl(url, None, ITUNES_HEADERS)
f = open('/Users/rolux/Desktop/iTunesData.xml', 'w')
f.write(xml)
f.close()
data['actors'] = parseCast(xml, 'actors')
string = findRe(xml, 'Average Rating:(.*?)</HBoxView>')
data['averageRating'] = string.count('rating_star_000033.png') + string.count('&#189;') * 0.5
data['directors'] = parseCast(xml, 'directors')
data['format'] = findRe(xml, 'Format:(.*?)<')
data['genre'] = decodeHtml(findRe(xml, 'Genre:(.*?)<'))
data['plotSummary'] = decodeHtml(findRe(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
data['posterUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
data['producers'] = parseCast(xml, 'producers')
data['rated'] = findRe(xml, 'Rated(.*?)<')
data['relatedMovies'] = parseMovies(xml, 'related movies')
data['releaseDate'] = findRe(xml, 'Released(.*?)<')
data['runTime'] = findRe(xml, 'Run Time:(.*?)<')
data['screenwriters'] = parseCast(xml, 'screenwriters')
data['soundtrackId'] = findRe(xml, 'viewAlbum\?id=(.*?)&')
data['trailerUrl'] = findRe(xml, 'autoplay="." url="(.*?)"')
return data
if __name__ == '__main__':
import simplejson
data = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').getData()
print simplejson.dumps(data, sort_keys = True, indent = 4)
data = ItunesMovie(title = 'The Matrix', director = 'Wachowski').getData()
print simplejson.dumps(data, sort_keys = True, indent = 4)
for v in data['relatedMovies']:
data = ItunesMovie(id = v['id']).getData()
print simplejson.dumps(data, sort_keys = True, indent = 4)
data = ItunesMovie(id='272960052').getData()
print simplejson.dumps(data, sort_keys = True, indent = 4)

ox/web/karagarga.py Normal file
@@ -0,0 +1,128 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from ox import cache
from ox.html import stripTags
from ox.text import findRe
import auth
def readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
headers = headers.copy()
headers["Cookie"] = auth.get("karagarga.cookie")
return cache.readUrl(url, data, headers, timeout)
def readUrlUnicode(url, timeout=cache.cache_timeout):
return cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)
def getData(id):
data = {
"url": getUrl(id)
}
html = readUrlUnicode("%s%s" % (data["url"], "&filelist=1"))
if 'No torrent with ID' in html:
return False
data['added'] = stripTags(parseTable(html, 'Added'))
data['country'] = findRe(html, 'title="([\w ]*?)" border="0" width="32" height="20"')
# data['description'] = parseTable(html, 'Description')
data['director'] = stripTags(parseTable(html, 'Director / Artist'))
data['files'] = []
result = findRe(html, '<table class=main border="1" cellspacing=0 cellpadding="5">(.*?)</table>')
results = re.compile('<td>(.*?)</td><td align="right">(.*?)</td>', re.DOTALL).findall(result)
for name, size in results:
data['files'].append({
'name': name,
'size': '%s %s' % (size[:-2], size[-2:].strip().upper())
})
data['format'] = ''
if html.find('genreimages/dvdr.png') != -1:
data['format'] = 'DVD'
elif html.find('genreimages/hdrip.png') != -1:
data['format'] = 'HD'
data['genre'] = []
result = parseTable(html, 'Genres')
for string in result.split('\n'):
string = stripTags(findRe(string, '<a href="browse.php\?genre=.*?">(.*?)</a>'))
if string:
data['genre'].append(string)
data['id'] = id
data['imdbId'] = findRe(html, 'imdb.com/title/tt(\d{7})')
data['language'] = stripTags(parseTable(html, 'Language'))
data['leechers'] = int(findRe(html, 'seeder\(s\), (.*?) leecher\(s\)'))
data['link'] = stripTags(parseTable(html, 'Internet Link'))
data['links'] = []
results = re.compile('<a href="(.*?)">(.*?)</a>', re.DOTALL).findall(parseTable(html, 'Description'))
for (url, title) in results:
if url.find('javascript') == -1:
data['links'].append({
'title': title,
'url': url.replace('http://anonym.to/?', '')
})
data['people'] = 0
result = stripTags(findRe(html, '<a href="top10others.php.*?>(.*?) people')).strip()
if result:
data['people'] = int(result)
data['posters'] = []
results = re.compile('<img border=0 src="(http://.*?)"', re.DOTALL).findall(html)
for result in results:
data['posters'].append(result)
data['seeders'] = int(findRe(html, '#seeders" class="sublink".*?colspan=2>(.*?) seeder\(s\)'))
data['size'] = int(findRe(parseTable(html, 'Size'), '\((.*?) ').replace(',', ''))
data['snatched'] = int(findRe(html, '<a name="snatchers">.*?colspan=2>(.*?) '))
data['subtitle'] = findRe(parseTable(html, 'Subtitles'), '>(.*?)<hr>').replace('included: ', '')
data['subtitles'] = []
results = re.compile('<a href="(.*?)">(.*?)</a>', re.DOTALL).findall(parseTable(html, 'Subtitles'))
for (url, language) in results:
data['subtitles'].append({
'language': language.replace('click here for ', ''),
'url': url
})
data['torrent'] = 'http://karagarga.net/%s' % findRe(html, '(down.php/.*?)"')
data['year'] = stripTags(parseTable(html, 'Year'))
data['title'] = stripTags(findRe(html, '<h1>(.*?)</h1>')).strip()
data['title'] = re.sub('^%s - ' % re.escape(data['director']), '', data['title'])
data['title'] = re.sub(' \(%s\)$' % re.escape(data['year']), '', data['title'])
return data
def getId(url):
return url.split("=")[-1]
def getTorrent(id):
return readUrl(getData(id)['torrent'])
def getIds(lastId = 20):
lastId = '%s' % lastId
ids = []
page = 0
while True:
for id in getIdsByPage(page):
if not id in ids:
ids.append(id)
if lastId in ids:
break
page += 1
return map(lambda id: str(id), sorted(map(lambda id: int(id), set(ids))))
def getIdsByPage(page):
ids = []
url = 'http://karagarga.net/browse.php?page=%s&cat=1&sort=added&d=DESC' % page
html = readUrlUnicode(url, timeout = 23*60*60) #get new ids once per day
strings = html.split('<td width="42" style="padding:0px;">')
strings.pop(0)
for string in strings:
ids.append(findRe(string, '"details.php\?id=(.*?)"'))
return ids
def getUrl(id):
return "http://karagarga.net/details.php?id=%s" % id
def parseTable(html, title):
if title == 'Genres':
return findRe(html, '<td class="heading" [\w=" ]*?>%s</td>(.*?)</table>' % title)
else:
return findRe(html, '<td class="heading" [\w=" ]*?>%s</td>(.*?)</td>' % title)
if __name__ == "__main__":
print getIds("79317")
print getData("79317")

ox/web/lyricsfly.py Normal file
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from ox.cache import readUrl
from ox.html import decodeHtml
from ox.text import findRe
def getLyrics(title, artist):
html = readUrl('http://lyricsfly.com/api/')
key = findRe(html, '<font color=green><b>(.*?)</b></font>')
url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
xml = readUrl(url)
lyrics = findRe(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
lyrics = lyrics.replace('\n', '').replace('\r', '')
lyrics = lyrics.replace('[br]', '\n').strip()
lyrics = lyrics.replace('\n\n\n', '\n\n')
lyrics = decodeHtml(lyrics.replace('&amp;', '&'))
return lyrics
if __name__ == '__main__':
print getLyrics('Election Day', 'Arcadia')

ox/web/metacritic.py Normal file
@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from urllib import quote
from ox.cache import readUrl, readUrlUnicode
from ox import findRe, decodeHtml, stripTags
def getMetacriticShowUrl(title):
title = quote(title)
url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
data = readUrl(url)
return findRe(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
def getData(title, url=None):
if not url:
url = getMetacriticShowUrl(title)
if not url:
return None
data = readUrlUnicode(url)
score = findRe(data, 'ALT="Metascore: (.*?)"')
if score:
score = int(score)
else:
score = -1
reviews = re.compile(
'<div class="scoreandreview"><div class="criticscore">(.*?)</div>'
'.*?<span class="publication">(.*?)</span>'
'.*?<span class="criticname">(.*?)</span></div>'
'.*?<div class="quote">(.*?)<br>'
'.*?<a href="(.*?)" ', re.DOTALL).findall(data)
metacritics = []
for review in reviews:
metacritics.append({
'score': int(review[0]),
'publication':review[1],
'critic':decodeHtml(review[2]),
'quote': stripTags(review[3]).strip(),
'link': review[4],
})
return dict(score = score, critics = metacritics, url = url)

ox/web/mininova.py Normal file
@@ -0,0 +1,126 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import socket
from urllib import quote
from ox.cache import readUrl, readUrlUnicode
from ox import findRe, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
from ox.normalize import normalizeImdbId
import ox
from torrent import Torrent
def _parseResultsPage(data, max_results=10):
results=[]
regexp = '''<tr><td>(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>'''
for row in re.compile(regexp, re.DOTALL).findall(data):
torrentDate = row[0]
torrentExtra = row[1]
torrentId = row[2]
torrentTitle = decodeHtml(row[3]).strip()
torrentLink = "http://www.mininova.org/tor/" + torrentId
privateTracker = 'priv.gif' in torrentExtra
if not privateTracker:
results.append((torrentTitle, torrentLink, ''))
return results
def findMovie(query, max_results=10):
'''search for torrents on mininova
'''
url = "http://www.mininova.org/search/%s/seeds" % quote(query)
data = readUrlUnicode(url)
return _parseResultsPage(data, max_results)
def findMovieByImdb(imdbId):
'''find torrents on mininova for a given imdb id
'''
results = []
imdbId = normalizeImdbId(imdbId)
data = readUrlUnicode("http://www.mininova.org/imdb/?imdb=%s" % imdbId)
return _parseResultsPage(data)
def getId(mininovaId):
mininovaId = unicode(mininovaId)
d = findRe(mininovaId, "/(\d+)")
if d:
return d
mininovaId = mininovaId.split('/')
if len(mininovaId) == 1:
return mininovaId[0]
else:
return mininovaId[-1]
def exists(mininovaId):
mininovaId = getId(mininovaId)
data = ox.net.readUrl("http://www.mininova.org/tor/%s" % mininovaId)
if not data or 'Torrent not found...' in data:
return False
if 'tracker</a> of this torrent requires registration.' in data:
return False
return True
def getData(mininovaId):
_key_map = {
'by': u'uploader',
}
mininovaId = getId(mininovaId)
torrent = dict()
torrent[u'id'] = mininovaId
torrent[u'domain'] = 'mininova.org'
torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId
data = readUrlUnicode(torrent['comment_link']) + readUrlUnicode(torrent['details_link'])
if '<h1>Torrent not found...</h1>' in data:
return None
for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decodeHtml(stripTags(d[1].strip()))
torrent[key] = value
torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
if torrent['description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = readUrl(torrent[u'torrent_link'])
torrent[u'torrent_info'] = getTorrentInfo(t)
return torrent
class Mininova(Torrent):
'''
>>> Mininova('123')
{}
>>> Mininova('1072195')['infohash']
'72dfa59d2338e4a48c78cec9de25964cddb64104'
'''
def __init__(self, mininovaId):
self.data = getData(mininovaId)
if not self.data:
return
Torrent.__init__(self)
ratio = self.data['share ratio'].split(',')
self['seeder'] = -1
self['leecher'] = -1
if len(ratio) == 2:
val = intValue(ratio[0].replace(',','').strip())
if val:
self['seeder'] = int(val)
val = intValue(ratio[1].replace(',','').strip())
if val:
self['leecher'] = int(val)
val = intValue(self.data['downloads'].replace(',','').strip())
if val:
self['downloaded'] = int(val)
else:
self['downloaded'] = -1
published = self.data['added on']
published = published.split(' +')[0]
self['published'] = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S")

ox/web/movieposterdb.py Normal file
@@ -0,0 +1,44 @@
# -*- coding: UTF-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from ox.cache import readUrlUnicode
from ox import findRe
def getData(id):
'''
>>> getData('0060304')['posters'][0]
u'http://www.movieposterdb.com/posters/06_03/1967/0060304/l_99688_0060304_639fdd1e.jpg'
>>> getData('0123456')['posters']
[]
'''
data = {
"url": getUrl(id)
}
data["posters"] = getPostersByUrl(data["url"])
return data
def getId(url):
return url.split("/")[-2]
def getPostersByUrl(url, group=True):
posters = []
html = readUrlUnicode(url)
if url in html:
if group:
results = re.compile('<a href="(http://www.movieposterdb.com/group/.+?)\??">', re.DOTALL).findall(html)
for result in results:
posters += getPostersByUrl(result, False)
results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
for result in results:
html = readUrlUnicode(result)
posters.append(findRe(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
return posters
def getUrl(id):
return "http://www.movieposterdb.com/movie/%s/" % id
if __name__ == '__main__':
print getData('0060304')
print getData('0133093')

ox/web/opensubtitles.py Normal file
@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import feedparser
from ox.cache import readUrl, readUrlUnicode
from ox import findRe, stripTags
from ox import langCode2To3, langTo3Code
def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
if len(language) == 2:
language = langCode2To3(language)
elif len(language) != 3:
language = langTo3Code(language)
url = "http://www.opensubtitles.org/en/search/"
if language:
url += "sublanguageid-%s/" % language
url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
data = readUrl(url)
if "title>opensubtitles.com - search results</title" in data:
fd = feedparser.parse(data)
opensubtitleId = None
if fd.entries:
link = fd.entries[0]['links'][0]['href']
opensubtitleId = re.compile('subtitles/(.*?)/').findall(link)
if opensubtitleId:
opensubtitleId = opensubtitleId[0]
else:
opensubtitleId = findRe(data, '/en/subtitles/(.*?)/')
return opensubtitleId
def downloadSubtitleById(opensubtitle_id):
srts = {}
data = readUrl('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
for f in re.compile(reg_exp, re.DOTALL).findall(data):
name = stripTags(f[1]).split('\n')[0]
url = "http://www.opensubtitles.com%s" % f[0]
srts[name] = readUrlUnicode(url)
return srts

ox/web/oxdb.py Normal file
@@ -0,0 +1,10 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import ox.cache
def getPosterUrl(id):
url = "http://0xdb.org/%s/poster.0xdb.jpg" % id
if ox.cache.exists(url):
return url
return ''

ox/web/piratecinema.py Normal file
@@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import ox.cache
from ox.cache import exists
def getPosterUrl(id):
url = "http://piratecinema.org/posters/%s/%s.jpg" % (id[:4], id)
if ox.cache.exists(url):
return url
return ''

ox/web/rottentomatoes.py Normal file
@@ -0,0 +1,34 @@
# -*- coding: UTF-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from ox.cache import getHeaders, readUrl, readUrlUnicode
from ox import findRe, stripTags
def readUrlByImdb(imdb):
#this would also work, but does not cache:
'''
from urllib2 import urlopen
u = urlopen(url)
return u.url
'''
url = "http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb
data = readUrl(url)
if "movie_title" in data:
movies = re.compile('(/m/.*?/)').findall(data)
if movies:
return "http://www.rottentomatoes.com" + movies[0]
return None
def getData(url):
data = readUrlUnicode(url)
r = {}
r['title'] = findRe(data, '<h1 class="movie_title">(.*?)</h1>')
if '(' in r['title']:
r['year'] = findRe(r['title'], '\((\d*?)\)')
r['title'] = re.sub('\((\d*?)\)', '', r['title']).strip()
r['synopsis'] = findRe(data, '<span id="movie_synopsis_all".*?>(.*?)</span>')
r['average rating'] = findRe(data, '<div id="bubble_allCritics".*?>(.*?)</div>').strip()
return r

ox/web/siteparser.py Normal file
@@ -0,0 +1,61 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from datetime import datetime
from ox.cache import readUrlUnicode
from ox import stripTags, decodeHtml
def cleanup(key, data, data_type):
if data:
if isinstance(data[0], basestring):
#FIXME: some types need stripTags
#data = [stripTags(decodeHtml(p)).strip() for p in data]
data = [decodeHtml(p).strip() for p in data]
elif isinstance(data[0], list) or isinstance(data[0], tuple):
data = [cleanup(key, p, data_type) for p in data]
while len(data) == 1:
data = data[0]
if data_type == 'list' and isinstance(data, basestring):
data = [data, ]
elif data_type != 'list':
data = ''
return data
class SiteParser(dict):
baseUrl = ''
regex = {}
def getUrl(self, page):
return "%s%s" % (self.baseUrl, page)
def __init__(self):
for key in self.regex:
url = self.getUrl(self.regex[key]['page'])
data = readUrlUnicode(url)
if isinstance(self.regex[key]['re'], basestring):
data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data)
data = cleanup(key, data, self.regex[key]['type'])
else:
for r in self.regex[key]['re']:
if isinstance(data, basestring):
data = re.compile(r, re.DOTALL).findall(data)
else:
data = [re.compile(r, re.DOTALL).findall(d) for d in data]
data = cleanup(key, data, self.regex[key]['type'])
def apply_f(f, data):
if data and isinstance(data[0], list):
data = [f(d) for d in data]
else:
data = f(data)
return data
if self.regex[key]['type'] == 'float':
data = apply_f(float, data)
elif self.regex[key]['type'] == 'int':
data = apply_f(int, data)
elif self.regex[key]['type'] == 'date':
parse_date = lambda d: datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d')
data = apply_f(parse_date, data)
self[key] = data
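# A minimal sketch of how SiteParser is meant to be subclassed (hypothetical
# site and pattern; ox/web/imdb.py above is the real example): set baseUrl and
# a regex dict whose entries name the page to fetch, a regular expression (or
# a list of them), and a result type ('string', 'list', 'int', 'float', 'date').
class Example(SiteParser):
    baseUrl = 'http://www.example.com/title/'
    regex = {
        'title': {
            'page': 'main',
            're': '<h1>(.*?)</h1>',
            'type': 'string'
        }
    }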

ox/web/spiegel.py Normal file
@@ -0,0 +1,292 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import time
import ox.cache
from ox.html import decodeHtml, stripTags
import ox.net
def getNews(year, month, day):
sections = [
'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
]
dt = datetime(year, month, day)
day = int(dt.strftime('%j'))
date = dt.strftime('%d.%m.%Y')
news = []
for section in sections:
url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day)
if date == time.strftime('%d.%m.%Y', time.localtime()):
html = ox.net.readUrl(url)
else:
html = ox.cache.readUrl(url)
for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
dateString = stripTags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
try:
description = formatString(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
except:
description = ''
try:
imageUrl = re.compile('<img src="(.*?)"').findall(item)[0]
except:
imageUrl = ''
try:
title = formatString(re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL).findall(item)[0]).replace(' : ', ': ').replace('::', ':')
except:
title = ''
if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:
new = {}
if len(dateString) == 10:
new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
else:
new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
# fix decodeHtml
# new['description'] = formatString(decodeHtml(description))
new['description'] = formatString(description)
new['imageUrl'] = imageUrl
new['section'] = formatSection(section)
new['title'] = formatString(title)
new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(formatString(re.compile('<h4>(.*?)</h4>', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf')
if new['title1'][-1:] == ':':
new['title1'] = new['title1'][0:-1]
new['title2'] = new['title'][len(new['title1']) + 2:]
new['url'] = re.compile('<a href="(.*?)"').findall(item)[0]
if new['url'][:1] == '/':
new['url'] = 'http://www.spiegel.de' + new['url']
news.append(new)
# print '%s, %s' % (new['section'], dateString)
'''
elif dateString[:10] == date and not description:
print dateString + ' - no description'
elif dateString[:10] == date and not imageUrl:
print dateString + ' - no image'
'''
return news
def splitTitle(title):
title1 = re.compile('(.*?): ').findall(title)[0]
title2 = re.compile(': (.*?)$').findall(title)[0]
return [title1, title2]
def formatString(string):
string = string.replace('<span class="spOptiBreak"> </span>', '')
string = string.replace('\n', ' ').replace(' ', ' ').strip()
string = string.replace('&amp;', '&').replace('&apos;', '\'').replace('&quot;', '"')
return string
def formatSection(string):
return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')
def formatSubsection(string):
# SPIEGEL, SPIEGEL special
subsection = {
'abi': 'Abi - und dann?',
'formel1': 'Formel 1',
'jobundberuf': 'Job & Beruf',
'leben': 'Leben U21',
'mensch': 'Mensch & Technik',
'sonst': '',
'staedte': u'St\xc3dte',
'ussports': 'US-Sports',
'wunderbar': 'wunderBAR'
}
if subsection.has_key(string):
return subsection[string].replace(u'\xc3', 'ae')
return string[:1].upper() + string[1:]
def getIssue(year, week):
coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
if not ox.net.exists(coverUrl):
return None
url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
contents = []
data = ox.cache.readUrl(url)
items = re.compile('<a.?href="http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=".?>(.*?)</a>').findall(data)
for item in items:
item = item[1]
page = int(re.compile('&amp;SE=(.*?)"').findall(item)[0])
title = stripTags(item).strip()
contents.append({'title': title, 'page': page})
pageUrl = {}
pages = page + 2
for page in range(1, pages + 10):
url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
if ox.cache.exists(url):
pageUrl[page] = url
else:
pageUrl[page] = ''
return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}
def archiveIssues():
'''
this is just an example of an archiving application
'''
p = {}
import os
import simplejson
import time
archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Der Spiegel'
localtime = time.localtime()
year = int(time.strftime('%Y', localtime))
week = int(time.strftime('%W', localtime))
for y in range(year, 1993, -1):
if y == year:
wMax = week + 1
else:
wMax = 53
for w in range(wMax, 0, -1):
print 'getIssue(%d, %d)' % (y, w)
issue = getIssue(y, w)
if issue:
dirname = '%s/%d/%02d' % (archivePath, y, w)
if not os.path.exists(dirname):
os.makedirs(dirname)
filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w)
if not os.path.exists(filename):
data = simplejson.dumps(issue, ensure_ascii = False)
f = open(filename, 'w')
f.write(data)
f.close()
filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w)
if not os.path.exists(filename):
data = []
for item in issue['contents']:
data.append('%3d %s' % (item['page'], item['title']))
data = '\n'.join(data)
f = open(filename, 'w')
f.write(data)
f.close()
filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
if not os.path.exists(filename):
data = ox.cache.readUrl(issue['coverUrl'])
f = open(filename, 'w')
f.write(data)
f.close()
for page in issue['pageUrl']:
url = issue['pageUrl'][page]
if url:
filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
if not os.path.exists(filename):
data = ox.cache.readUrl(url)
f = open(filename, 'w')
f.write(data)
f.close()
if not p:
p = {'num': 1, 'sum': issue['pages'], 'min': issue['pages'], 'max': issue['pages']}
else:
p['num'] += 1
p['sum'] += issue['pages']
if issue['pages'] < p['min']:
p['min'] = issue['pages']
if issue['pages'] > p['max']:
p['max'] = issue['pages']
print p['min'], p['sum'] / p['num'], p['max']
def archiveNews():
'''
this is just an example of an archiving application
'''
import os
import simplejson
import time
count = {}
colon = []
archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'
days = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
localtime = time.localtime()
year = int(time.strftime('%Y', localtime))
month = int(time.strftime('%m', localtime))
day = int(time.strftime('%d', localtime)) - 1
for y in range(year, 1999, -1):
if y == year:
mMax = month
else:
mMax = 12
for m in range(mMax, 0, -1):
if y == year and m == month:
dMax = day
elif m == 2 and y % 4 == 0 and y % 400 != 0:
dMax = days[m] + 1
else:
dMax = days[m]
for d in range(dMax, 0, -1):
print 'getNews(%d, %d, %d)' % (y, m, d)
news = getNews(y, m ,d)
for new in news:
dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
if not os.path.exists(dirname):
os.makedirs(dirname)
if new['url'][-5:] == '.html':
filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
else:
filename = dirname + '/' + new['url'] + '.json'
if not os.path.exists(filename) or True:
data = simplejson.dumps(new, ensure_ascii = False)
f = open(filename, 'w')
f.write(data)
f.close()
filename = filename[:-5] + '.txt'
if not os.path.exists(filename) or True:
data = splitTitle(new['title'])
data.append(new['description'])
data = '\n'.join(data)
f = open(filename, 'w')
f.write(data)
f.close()
filename = dirname + '/' + new['imageUrl'].split('/')[-1]
if not os.path.exists(filename):
data = ox.cache.readUrl(new['imageUrl'])
f = open(filename, 'w')
f.write(data)
f.close()
strings = new['url'].split('/')
string = strings[3]
if len(strings) == 6:
string += '/' + strings[4]
if not count.has_key(string):
count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
else:
count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
strings = splitTitle(new['title'])
if strings[0] != new['title1'] or strings[1] != new['title2']:
colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2']))
for key in sortDictByKey(count):
print '%6d %-24s %s' % (count[key]['count'], key, count[key]['string'])
for value in colon:
print value
def sortDictByKey(d):
keys = d.keys()
keys.sort()
return keys
if __name__ == '__main__':
# spiegel = Spiegel(2008, 8)
# print spiegel.getContents()
# news = News(2001, 9, 10)
# output(news.getNews())
'''
x = []
for d in range(10, 30):
print '2/%d' % d
news = getNews(2008, 2, d)
for new in news:
strings = new['url'].split('/')
string = formatSection(strings[3])
if len(strings) == 6:
string += '/' + formatSubsection(strings[4])
if not string in x:
x.append(string)
print x
'''
# archiveIssues()
archiveNews()

ox/web/thepiratebay.py Normal file
@@ -0,0 +1,122 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import socket
from urllib import quote, urlencode
from urllib2 import URLError
from ox.cache import readUrl, readUrlUnicode
from ox import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
from ox.normalize import normalizeImdbId
import ox
from torrent import Torrent
cache_timeout = 24*60*60 # cache search only for 24 hours
season_episode = re.compile("S..E..", re.IGNORECASE)
def _readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
headers = headers.copy()
headers['Cookie'] = 'language=en_EN'
return cache.readUrl(url, data, headers, timeout)
def _readUrlUnicode(url, timeout=cache.cache_timeout):
return cache.readUrlUnicode(url, _readUrl=_readUrl, timeout=timeout)
def findMovies(query, max_results=10):
results = []
next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
page_count = 1
while next and page_count < 4:
page_count += 1
url = next[0]
if not url.startswith('http'):
if not url.startswith('/'):
url = "/" + url
url = "http://thepiratebay.org" + url
data = _readUrlUnicode(url, timeout=cache_timeout)
regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/torrent/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
for row in re.compile(regexp, re.DOTALL).findall(data):
torrentType = row[0]
torrentLink = "http://thepiratebay.org" + row[1]
torrentTitle = decodeHtml(row[2])
# 201 = Movies , 202 = Movie DVDR, 205 TV Shows
if torrentType in ['201']:
results.append((torrentTitle, torrentLink, ''))
if len(results) >= max_results:
return results
next = re.compile('<a.*?href="(.*?)".*?>.*?next.gif.*?</a>').findall(data)
return results
def findMovieByImdb(imdb):
return findMovies("tt" + normalizeImdbId(imdb))
def getId(piratebayId):
if piratebayId.startswith('http://torrents.thepiratebay.org/'):
piratebayId = piratebayId.split('org/')[1]
d = findRe(piratebayId, "tor/(\d+)")
if d:
piratebayId = d
d = findRe(piratebayId, "torrent/(\d+)")
if d:
piratebayId = d
return piratebayId
def exists(piratebayId):
piratebayId = getId(piratebayId)
return ox.net.exists("http://thepiratebay.org/torrent/%s" % piratebayId)
def getData(piratebayId):
_key_map = {
'spoken language(s)': u'language',
'texted language(s)': u'subtitle language',
'by': u'uploader',
'leechers': 'leecher',
'seeders': 'seeder',
}
piratebayId = getId(piratebayId)
torrent = dict()
torrent[u'id'] = piratebayId
torrent[u'domain'] = 'thepiratebay.org'
torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId
data = _readUrlUnicode(torrent['comment_link'])
torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
if not torrent[u'title']:
return None
torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
title = quote(torrent['title'].encode('utf-8'))
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decodeHtml(stripTags(d[1].strip()))
torrent[key] = value
torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
if torrent[u'description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = _readUrl(torrent[u'torrent_link'])
torrent[u'torrent_info'] = getTorrentInfo(t)
return torrent
class Thepiratebay(Torrent):
'''
>>> Thepiratebay('123')
{}
>>> Thepiratebay('3951349')['infohash']
'4e84415d36ed7b54066160c05a0b0f061898d12b'
'''
def __init__(self, piratebayId):
self.data = getData(piratebayId)
if not self.data:
return
Torrent.__init__(self)
published = self.data['uploaded']
published = published.replace(' GMT', '').split(' +')[0]
self['published'] = datetime.strptime(published, "%Y-%m-%d %H:%M:%S")

ox/web/torrent.py Normal file
@@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from ox import intValue
class Torrent(dict):
'''
>>> Torrent()
{'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1}
'''
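# A subclass (see the Mininova and Thepiratebay classes above) is expected to
# set self.data to the scraped values before calling Torrent.__init__(), which
# normalizes the string/int/dict keys and derives 'infohash', 'size',
# 'announce' and 'files' from data['torrent_info'].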
_string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link',
'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language')
_int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files')
_dict_keys = ('torrent_info', )
_list_keys = ()
data = {'torrent_info': {}}
def __init__(self):
for key in self._string_keys:
self[key] = self.data.get(key, u'')
for key in self._dict_keys:
self[key] = self.data.get(key, {})
for key in self._list_keys:
self[key] = self.data.get(key, [])
for key in self._int_keys:
value = self.data.get(key, -1)
if not isinstance(value, int):
value = int(intValue(value))
self[key] = value
self['infohash'] = self.data['torrent_info'].get('hash', '')
self['size'] = self.data['torrent_info'].get('size', -1)
self['announce'] = self.data['torrent_info'].get('announce', '')
if 'files' in self.data['torrent_info']:
self['files'] = len(self.data['torrent_info']['files'])
else:
self['files'] = 1
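
# A minimal sketch of the intended use (the data below is made up): a site
# module subclasses Torrent, fills in self.data in the shape its getData()
# helper produces, then calls Torrent.__init__, which normalizes keys and
# coerces numeric values.
if __name__ == '__main__':
    class ExampleSite(Torrent):
        def __init__(self):
            self.data = {
                'id': u'1',
                'title': u'Example',
                'seeder': '12',
                'torrent_info': {'hash': 'abc123', 'size': 123456, 'announce': ''},
            }
            Torrent.__init__(self)
    print ExampleSite()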

32
ox/web/tv.py Normal file

@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import time
from ox import stripTags, findRe
from ox.cache import readUrlUnicode
def getEpisodeData(url):
'''
    parses information on tv.com episode pages
    returns dict with title, show, description, episode score and, when
    available, season, episode and air date
example:
getEpisodeData('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
'''
data = readUrlUnicode(url)
r = {}
r['description'] = stripTags(findRe(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
r['show'] = findRe(data, '<h1>(.*?)</h1>')
r['title'] = findRe(data, '<title>.*?: (.*?) - TV.com </title>')
#episode score
r['episode score'] = findRe(data, '<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')
match = re.compile('Episode Number: (\d*?) &nbsp;&nbsp; Season Num: (\d*?) &nbsp;&nbsp; First Aired: (.*?) &nbsp').findall(data)
if match:
r['season'] = int(match[0][1])
r['episode'] = int(match[0][0])
#'Wednesday September 29, 2004' -> 2004-09-29
r['air date'] = time.strftime('%Y-%m-%d', time.strptime(match[0][2], '%A %B %d, %Y'))
return r
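
# Example usage (sketch, requires network access): the URL is the one from
# the docstring above.
if __name__ == '__main__':
    episode = getEpisodeData('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
    for key in sorted(episode):
        print '%s: %s' % (key, episode[key])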

120
ox/web/wikipedia.py Normal file

@ -0,0 +1,120 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from urllib import urlencode
import simplejson
from ox.cache import readUrl, readUrlUnicode
from ox import findRe, decodeHtml
def getId(url):
return url.split("/")[-1]
def getUrl(id):
return "http://en.wikipedia.org/wiki/%s" % id
def getMovieId(title, director='', year=''):
query = '"%s" film %s %s' % (title, director, year)
result = find(query, 1)
if result:
return result[0][1]
return ''
def getUrlByImdbId(imdbId):
query = '"%s"'% imdbId
result = find(query)
if result:
url = result[0][1]
return url
return ""
def getUrlByImdb(imdbId):
# deprecated, use getUrlByImdbId()
return getUrlByImdbId(imdbId)
def getUrlByAllmovieId(allmovieId):
query = '"amg_id = 1:%s"'% allmovieId
result = find(query)
if result:
url = result[0][1]
return url
return ''
def getWikiData(wikipediaUrl):
url = wikipediaUrl.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
url = "%s&action=raw" % url
data = readUrlUnicode(url)
return data
def getMovieData(wikipediaUrl):
if not wikipediaUrl.startswith('http'): wikipediaUrl = getUrl(wikipediaUrl)
data = getWikiData(wikipediaUrl)
filmbox_data = findRe(data, '''\{\{Infobox.Film(.*?)\n\}\}''')
filmbox = {}
_box = filmbox_data.strip().split('\n|')
if len(_box) == 1:
_box = _box[0].split('|\n')
for row in _box:
d = row.split('=')
if len(d) == 2:
key = d[0].strip()
if key[0] == '|':
key = key[1:]
value = d[1].strip()
filmbox[key] = value
if 'imdb title' in data:
filmbox['imdb_id'] = findRe(data, 'imdb title\|.*?(\d*?)\|')
elif 'imdb episode' in data:
filmbox['imdb_id'] = findRe(data, 'imdb episode\|.*?(\d*?)\|')
if 'Amg movie' in data:
filmbox['amg_id'] = findRe(data, 'Amg movie\|.*?(\d*?)\|')
if 'amg_id' in filmbox and filmbox['amg_id'].startswith('1:'):
filmbox['amg_id'] = filmbox['amg_id'][2:]
if 'rotten-tomatoes' in data:
filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|id\=(.*?)\|')
if not filmbox['rottentomatoes_id']:
filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|(.*?)\|')
if 'google video' in data:
filmbox['google_video_id'] = findRe(data, 'google video\|.*?(\d*?)\|')
if 'DEFAULTSORT' in data:
filmbox['title_sort'] = findRe(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
return filmbox
def getImageUrl(name):
data = readUrlUnicode('http://en.wikipedia.org/wiki/Image:' + name)
url = findRe(data, 'href="(http://upload.wikimedia.org/.*?)"')
return url
def getPosterUrl(wikipediaUrl):
if not wikipediaUrl.startswith('http'): wikipediaUrl = getUrl(wikipediaUrl)
data = getMovieData(wikipediaUrl)
if 'image' in data:
return getImageUrl(data['image'])
return ''
def getMoviePoster(wikipediaUrl):
# deprecated, use getPosterUrl()
return getPosterUrl(wikipediaUrl)
def getAllmovieId(wikipediaUrl):
data = getMovieData(wikipediaUrl)
return data.get('amg_id', '')
def find(query, max_results=10):
query = {'action': 'query', 'list':'search', 'format': 'json',
'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
url = "http://en.wikipedia.org/w/api.php?" + urlencode(query)
data = readUrl(url)
if not data:
data = readUrl(url, timeout=0)
result = simplejson.loads(data)
results = []
if result and 'query' in result:
for r in result['query']['search']:
title = r['title']
url = "http://en.wikipedia.org/wiki/%s" % title.replace(' ', '_')
results.append((title, url, ''))
return results
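
# Example usage (sketch, requires network access): the typical lookup chain,
# from an IMDb id to the Wikipedia infobox and poster url; 0133093 (The
# Matrix) is only used as an illustration, and the infobox keys queried here
# depend on the article.
if __name__ == '__main__':
    url = getUrlByImdbId('0133093')
    print url
    movie = getMovieData(url)
    print movie.get('director', ''), movie.get('released', '')
    print getPosterUrl(url)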

107
ox/web/youtube.py Normal file

@ -0,0 +1,107 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from urllib import quote, unquote
import httplib
import xml.etree.ElementTree as ET
import re
import feedparser
from ox.cache import readUrl, readUrlUnicode
from ox import findString, findRe
def getVideoKey(youtubeId):
data = readUrl("http://www.youtube.com/get_video_info?&video_id=%s" % youtubeId)
match = re.compile("token=(.+)&thumbnail").findall(data)
if match:
return unquote(match[0])
return False
def getVideoUrl(youtubeId, format='mp4'):
youtubeKey = getVideoKey(youtubeId)
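    # the fmt values used below are the YouTube format codes in use around
    # 2010 (roughly: 37 = 1080p MP4, 22 = 720p MP4, 18 = standard MP4,
    # 35 = high quality FLV); any other format falls back to the default url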
if format == '1080p':
fmt=37
url = "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s" % (youtubeId, youtubeKey, fmt)
    elif format == '720p':
fmt=22
url = "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s" % (youtubeId, youtubeKey, fmt)
elif format == 'mp4':
fmt=18
url = "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s" % (youtubeId, youtubeKey, fmt)
elif format == 'high':
fmt=35
url = "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s" % (youtubeId, youtubeKey, fmt)
else:
url = "http://youtube.com/get_video.php?video_id=%s&t=%s" % (youtubeId, youtubeKey)
return url
def getMovieInfo(youtubeId, video_url_base=None):
url = "http://gdata.youtube.com/feeds/api/videos/%s" % youtubeId
data = readUrl(url)
fd = feedparser.parse(data)
return getInfoFromAtom(fd.entries[0], video_url_base)
def getInfoFromAtom(entry, video_url_base=None):
info = dict()
info['title'] = entry['title']
info['description'] = entry['description']
info['author'] = entry['author']
#info['published'] = entry['published_parsed']
if 'media_keywords' in entry:
info['keywords'] = entry['media_keywords'].split(', ')
info['url'] = entry['links'][0]['href']
info['id'] = findString(info['url'], "/watch?v=")
info['thumbnail'] = "http://img.youtube.com/vi/%s/0.jpg" % info['id']
if video_url_base:
info['flv'] = "%s/%s.%s" % (video_url_base, info['id'], 'flv')
info['mp4'] = "%s/%s.%s" % (video_url_base, info['id'], 'mp4')
else:
info['flv'] = getVideoUrl(info['id'], 'flv')
info['flv_high'] = getVideoUrl(info['id'], 'high')
info['mp4'] = getVideoUrl(info['id'], 'mp4')
info['720p'] = getVideoUrl(info['id'], '720p')
info['1080p'] = getVideoUrl(info['id'], '1080p')
info['embed'] = '<object width="425" height="355"><param name="movie" value="http://www.youtube.com/v/%s&hl=en"></param><param name="wmode" value="transparent"></param><embed src="http://www.youtube.com/v/%s&hl=en" type="application/x-shockwave-flash" wmode="transparent" width="425" height="355"></embed></object>' % (info['id'], info['id'])
return info
def find(query, max_results=10, offset=1, orderBy='relevance', video_url_base=None):
query = quote(query)
url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (query, orderBy, offset, max_results)
data = readUrlUnicode(url)
fd = feedparser.parse(data)
videos = []
for entry in fd.entries:
v = getInfoFromAtom(entry, video_url_base)
videos.append(v)
if len(videos) >= max_results:
return videos
return videos
'''
def find(query, max_results=10, offset=1, orderBy='relevance', video_url_base=None):
url = "http://youtube.com/results?search_query=%s&search=Search" % quote(query)
data = readUrlUnicode(url)
regx = re.compile(' <a href="/watch.v=(.*?)" title="(.*?)" ')
regx = re.compile('<a href="/watch\?v=(\w*?)" ><img src="(.*?)" class="vimg120" title="(.*?)" alt="video">')
id_title = regx.findall(data)
data_flat = data.replace('\n', ' ')
videos = {}
for video in id_title:
vid = video[0]
if vid not in videos:
v = dict()
v['id'] = vid
            v['link'] = "http://youtube.com/watch?v=%s" % v['id']
v['title'] = video[2].strip()
if video_url_base:
v['video_link'] = "%s/%s" % (video_url_base, v['id'])
else:
v['video_url'] = getVideoUrl(v['id'])
v['description'] = findRe(data, 'BeginvidDesc%s">(.*?)</span>' % v['id']).strip().replace('<b>', ' ').replace('</b>', '')
v['thumbnail'] = video[1]
videos[vid] = v
if len(videos) >= max_results:
return videos.values()
return videos.values()
'''
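
# Example usage (sketch, requires network access): metadata and download urls
# for a single video plus a small search; the video id below is just a
# placeholder.
if __name__ == '__main__':
    info = getMovieInfo('dQw4w9WgXcQ')
    print info['title'].encode('utf-8')
    print info['mp4']
    for video in find('the matrix trailer', max_results=3):
        print video['title'].encode('utf-8'), video['url']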