rename ox -> oxweb

j 2008-07-03 11:21:18 +02:00
commit 6a16a0af30
17 changed files with 7 additions and 5 deletions

8
oxweb/__init__.py Normal file

@@ -0,0 +1,8 @@
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
__version__ = '0.1.0'
import imdb
import wikipedia
import google

64
oxweb/criterion.py Normal file

@@ -0,0 +1,64 @@
# -*- coding: UTF-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from oxutils.cache import getUrlUnicode
from oxutils.html import stripTags
from oxutils.text import findRe, removeSpecialCharacters
import imdb
def getData(criterionId):
'''
>>> getData(348)['imdbId']
'0068205'
'''
data = {}
html = getUrlUnicode('http://criterion.com/asp/release.asp?id=%s' % criterionId)
data['criterionId'] = criterionId
data['posterUrl'] = getPosterUrl(criterionId)
data['synopsis'] = stripTags(findRe(html, '<h3>Synopsis</h3>(.*?)</div>'))
result = re.compile("<title>The Criterion Collection: (.*?) by (.*?)</title>").findall(html)
data['title'] = stripTags(result[0][0])
data['director'] = stripTags(result[0][1])
data['imdbId'] = imdb.getMovieId(data['title'], data['director'])
return data
def getCriterionIds():
html = getUrlUnicode('http://criterion.com/asp/list.asp?sort=spine')
return re.compile('release.asp\?id=(.*?)"').findall(html)
def getPosterUrl(criterionId):
return 'http://criterion.com/content/images/full_boxshot/%s_box_348x490.jpg' % criterionId
def getMovieId(title = '', director = '', imdbId = ''):
if not imdbId:
imdbId = imdb.getMovieId(title, director)
html = getUrlUnicode('http://criterion.com/asp/list.asp?sort=spine', timeout = -1)
strings = findRe(html, '<table cellspacing="0" id="browse-all-table">(.*?)</table>').split('<tr>')
strings.pop(0)
for string in strings:
id = findRe(string, '"release.asp\?id=(.*?)"')
criterionTitle = findRe(string, 'class="title">(.*?)</a>')
criterionTitle = re.sub('(?<=\\w)<br>(?=\\w)', ' / ', criterionTitle)
criterionTitle = criterionTitle.replace('<br>', '')
criterionDirector = stripTags(findRe(string, '</a>.*?</td>(.*?)</td>')).strip()
if imdb.getMovieId(criterionTitle, criterionDirector) == imdbId:
return id
return ''
def getMovieData(title = '', director = '', imdbId = ''):
data = {}
if not imdbId:
imdbId = imdb.getMovieId(title, director)
id = getMovieId(imdbId = imdbId)
if id:
html = getUrlUnicode('http://criterion.com/asp/release.asp?id=%s' % id)
data['id'] = id
data['posterUrl'] = getPosterUrl(id)
data['synopsis'] = stripTags(findRe(html, '<h3>Synopsis</h3>(.*?)</div>'))
return data
if __name__ == '__main__':
print getMovieData('Le mepris', 'Jean-Luc Godard')

22
oxweb/dailymotion.py Normal file

@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from urllib import unquote
from oxutils.cache import getUrl
def getVideoUrl(url):
'''
>>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms')
'http://www.dailymotion.com/get/16/320x240/flv/6191379.flv?key=0a710ad6ffbfe980b1252569d16f957313399d0'
>>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms')
'http://www.dailymotion.com/get/15/320x240/flv/6197800.flv?key=08a18365ca6962c5ff7526f69872c36813399d4'
'''
data = getUrl(url)
video = re.compile('''video", "(.*?)"''').findall(data)
for v in video:
v = unquote(v).split('@@')[0]
return "http://www.dailymotion.com" + v
return ''

50
oxweb/google.py Normal file

@@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import time
import urllib
import urllib2
import weakref
import threading
import Queue
import oxutils
from oxutils import stripTags
'''
usage:
import google
google.find(query)
for result in google.find(query): result
result is title, url, description
google.find(query, max_results)
FIXME: how to search deeper than the first page?
'''
DEFAULT_MAX_RESULTS = 10
def getUrl(url, data=None, headers=oxutils.net.DEFAULT_HEADERS):
google_timeout=24*60*60
return oxutils.cache.getUrl(url, data, headers, google_timeout)
def quote_plus(s):
return urllib.quote_plus(s.encode('utf-8'))
def find(query, max_results=DEFAULT_MAX_RESULTS):
url = "http://www.google.com/search?q=%s" % quote_plus(query)
data = getUrl(url)
link_re = r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' + \
r'.*?(?:<br>|<table.*?>)' + \
r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)'
results = []
for match in re.compile(link_re, re.DOTALL).finditer(data):
(name, url, desc) = match.group('name', 'url', 'desc')
results.append((stripTags(name), url, stripTags(desc)))
if len(results) > max_results:
results = results[:max_results]
return results
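A minimal usage sketch for the module above (an editor's illustration, assuming oxutils is available and Google's result markup still matches link_re):

if __name__ == '__main__':
    # print title and url of the first three results for a sample query
    for title, url, desc in find(u'site:imdb.com "The Matrix"', 3):
        print title, url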

670
oxweb/imdb.py Normal file

@@ -0,0 +1,670 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import urllib2
from urllib import quote, unquote
import re
import os
import time
from BeautifulSoup import BeautifulSoup
import chardet
import oxutils
from oxutils import stripTags, decodeHtml, findRe, findString
from oxutils.cache import getUrl, getUrlUnicode
from oxutils.normalize import normalizeTitle, normalizeImdbId
from oxutils import *
import google
def getMovieId(title, director='', year=''):
'''
>>> getMovieId('The Matrix')
'0133093'
'''
if year:
title = "%s (%s)" % (title, year)
if director:
query = 'site:imdb.com %s "%s"' % (director, title)
else:
query = 'site:imdb.com "%s"' % title
for (name, url, desc) in google.find(query, 3):
if url.startswith('http://www.imdb.com/title/tt'):
return url[28:35]
return ''
def getMovieData(imdbId):
return IMDb(imdbId).parse()
# internal functions below
def getUrlBase(imdbId):
return "http://www.imdb.com/title/tt%s" % imdbId
def getRawMovieData(imdbId):
imdbId = normalizeImdbId(imdbId)
data = getMovieInfo(imdbId)
data['credits'] = getMovieCredits(imdbId)
data['poster'] = getMoviePoster(imdbId)
data['company credits'] = getMovieCompanyCredits(imdbId)
data['filming locations'] = getMovieLocations(imdbId)
data['movie connections'] = getMovieConnections(imdbId)
data['external reviews'] = getMovieExternalReviews(imdbId)
data['trivia'] = getMovieTrivia(imdbId)
data['keywords'] = getMovieKeywords(imdbId)
data['media'] = {}
data['media']['images'] = getMovieImages(imdbId)
data['media']['trailers'] = getMovieTrailers(imdbId)
data['plotsummary'] = getMoviePlot(imdbId)
data['release dates'] = getMovieReleaseDates(imdbId)
data['release date'] = getMovieReleaseDate(imdbId)
return data
def getMovieInfo(imdbId):
data = getUrlUnicode(getUrlBase(imdbId))
info = dict()
info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"')
if info['poster'] and '_V' in info['poster']:
info['poster']= "%s.jpg" % info['poster'].split('._V')[0]
for i in re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL).findall(data):
title = stripTags(i[0]).strip().lower()
txt= stripTags(i[1]).strip()
def cleanUp(k):
k = decodeHtml(k).replace(u'\xa0', ' ').strip()
if k.endswith('more'): k=k[:-len('more')].strip()
return k
txt = cleanUp(txt)
if title not in ('plot', 'trivia', 'filming locations', 'mpaa'):
if '|' in txt:
txt = [cleanUp(k) for k in txt.split('|')]
elif ', ' in txt:
txt = [cleanUp(k) for k in txt.split(', ')]
if not title.startswith('moviemeter'):
info[title] = txt
for key in ('user comments', 'writers (wga)'):
if key in info:
del info[key]
if 'release date' in info:
info['release date'] = info['release date'].split('\n')[0]
if 'plot' in info:
info['plot'] = info['plot'].split('| add synopsis')[0].strip()
info['plot'] = info['plot'].split('| full synopsis')[0].strip()
if info['plot'] in ('add synopsis', 'full synopsis'):
info['plot'] = ''
#get Title
title = ''
year = ''
html_title = findRe(data, '<div id="tn15title">(.*?)</div>')
if not html_title:
html_title = findRe(data, '<title>(.*?)</title>')
if html_title:
        html_title = html_title.replace('<br />', ' ').replace('  ', ' ')
title = decodeHtml(html_title)
title = stripTags(title)
year = findRe(title, '\((\d{4})\)')
if not year:
year = findRe(title, '\((\d{4})')
_y = findRe(title, r'(\([0-9\?]{4}[/IVXLCDM]*?\))')
if _y:
title = title.replace(_y, '')
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
title = title.replace(t, '')
title = title.strip()
if title.find(u'\xa0') > -1:
title = title[:title.find(u'\xa0')].strip()
if title.startswith('"') and title.endswith('"'):
title = title[1:-1]
info['title'] = title
info['year'] = year
#Rating
rating = findRe(data, '<b>([\d\.]*?)/10</b>')
if rating:
info['rating'] = float(rating)
else:
info['rating'] = -1
#Votes
votes = findRe(data, '<small>\(<a href="ratings">(.*?) votes</a>\)</small>')
if votes:
info['votes'] = int(votes.replace(',', ''))
else:
info['votes'] = -1
return info
def getMoviePoster(imdbId):
info = getMovieInfo(imdbId)
return info['poster']
def getMovieYear(imdbId):
info = getMovieInfo(imdbId)
return info['year']
def getMovieTitle(imdbId):
info = getMovieInfo(imdbId)
return info['title']
def creditList(data, section=None):
if section == 'cast':
credits_ = re.compile('''<tr .*?<td class="nm">(.*?)</td><td class="ddd">.*?</td><td class="char">(.*?)</td></tr>''').findall(data)
else:
credits_ = re.compile('''<tr>.*?<td valign="top">(.*?)</td><td.*?</td><td valign="top">(.*?)</td></tr>''').findall(data)
credits = []
for c_ in credits_:
c = [decodeHtml(c_[0]).strip(), decodeHtml(c_[1]).strip()]
if section=='writers':
c[1] = c[1].replace('<br>', '').strip().replace(')', '').replace('(','')
if c[1].endswith(' and'): c[1] = c[1][:-4]
credits.append(c)
return credits
def getMovieCredits(imdbId):
credits = dict()
url = "%s/fullcredits" % getUrlBase(imdbId)
data = getUrlUnicode(url)
groups = data.split('<h5>')
for g in groups:
section = re.compile('''name="(.*?)".*? href="/Glossary''').findall(g)
if section:
credits[section[0]] = creditList(g, section[0])
return credits
def getMovieTrailers(imdbId):
url = "%s/trailers" % getUrlBase(imdbId)
data = getUrlUnicode(url)
soup = BeautifulSoup(data)
videos = soup('div', {'class':"video-gallery"})
trailers = []
if videos:
for a in videos[0]('a'):
title = stripTags(unicode(a)).strip()
url = 'http://www.imdb.com' + a['href']
videoId = findRe(url, '/(vi\d*?)/')
iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
iframe = getUrlUnicode(iframeUrl)
videoUrl = unquote(findRe(iframe, 'addVariable\("file", "(.*?)"'))
trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
return trailers
def getMovieQuotes(imdbId):
url = "%s/quotes" % getUrlBase(imdbId)
data = getUrlUnicode(url)
quotes = re.compile('<b>(.*?)</b>:(.*?)<br>', re.DOTALL).findall(findString(data, '<a name="q'))
quotes = [(q[0].strip(),q[1].strip()) for q in quotes]
return quotes
def getMoviePlot(imdbId):
url = "%s/plotsummary" % getUrlBase(imdbId)
data = getUrlUnicode(url)
plot = findRe(data, '<p class="plotpar">(.*?)<i>')
return plot
def getMovieTechnical(imdbId):
url = "%s/technical" % getUrlBase(imdbId)
data = getUrlUnicode(url)
results = {}
for t in re.compile('<h5>(.*?)</h5>(.*?)<br/>', re.DOTALL).findall(data):
results[t[0].strip()] = t[1].strip()
return results
def getMovieCompanyCredits(imdbId):
url = "%s/companycredits" % getUrlBase(imdbId)
data = getUrlUnicode(url)
results = {}
for field, c in re.compile('<h2>(.*?)</h2><ul>(.*?)</ul>').findall(data):
results[field.strip()] = []
for company in re.compile('<li>(.*?)</li>').findall(c):
results[field.strip()].append(company)
return results
def getMovieLocations(imdbId):
url = "%s/locations" % getUrlBase(imdbId)
data = getUrlUnicode(url)
soup = BeautifulSoup(data)
locations = []
for key in soup('a', {'href': re.compile('^/List')}):
locations.append(decodeHtml(key.string))
return locations
def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
photos = {}
for key in keys:
url = "%s/mediaindex?refine=%s" % (getUrlBase(imdbId), key)
data = getUrlUnicode(url)
photos[key] = {}
for s in re.compile('''<img alt="(.*?)".*?src="(http://ia.media-imdb.com/.*?.jpg)''').findall(data):
img= "%s.jpg" % s[1].split('._V')[0]
title = s[0]
if key=='still_frame':
if not "_CR0" in s[1]:
photos[key][img] = title
else:
photos[key][img] = title
return photos
def getMovieStills(imdbId):
return getMovieImages(imdbId, ['still_frame'])['still_frame']
def getMoviePosters(imdbId):
posters = getMovieImages(imdbId, ['poster'])['poster']
poster = getMoviePoster(imdbId)
if poster:
posters[poster] = 'main poster'
return posters
def getMovieTrivia(imdbId):
url = "%s/trivia" % getUrlBase(imdbId)
data = getUrlUnicode(url)
soup = BeautifulSoup(data)
trivia = []
triviaList = []
for i in soup('ul', {'class': "trivia"}):
for t in i('li'):
t = unicode(t).replace('<br />', '').strip()
if t.startswith('<li>') and t.endswith('</li>'):
t = t[4:-5].strip()
t=decodeHtml(t)
trivia.append(t)
return trivia
def getMovieConnections(imdbId):
url = "%s/movieconnections" % getUrlBase(imdbId)
data = getUrl(url)
connections={}
for c in re.compile('''<h5>(.*?)</h5>(.*?)\n\n''', re.DOTALL).findall(data):
connections[unicode(c[0])] = re.compile('''<a href="/title/tt(\d{7})/">''').findall(c[1])
return connections
def getMovieKeywords(imdbId):
url = "%s/keywords" % getUrlBase(imdbId)
data = getUrlUnicode(url)
keywords = []
for keyword in re.compile('''<a.*?href="/keyword.*?>(.*?)</a>''').findall(data):
keyword = decodeHtml(keyword)
keyword = keyword.replace(u'\xa0', ' ')
keywords.append(keyword)
return keywords
def getMovieExternalReviews(imdbId):
url = "%s/externalreviews" % getUrlBase(imdbId)
data = getUrlUnicode(url)
soup = BeautifulSoup(data)
ol = soup('ol')
if ol:
ol = ol[0]
ret = {}
for li in ol('li'):
try:
a = li('a')[0]
href = a.get('href')
txt = a.contents[0]
ret[href] = txt
except:
pass
return ret
return {}
def getMovieReleaseDate(imdbId):
releasedates = getMovieReleaseDates(imdbId)
first_release = None
for r in releasedates:
if not first_release or r[1] < first_release:
first_release = r[1]
return first_release
def _parseDate(d):
try:
parsed_date = time.strptime(d, "%d %B %Y")
parsed_date = time.strftime('%Y-%m-%d', parsed_date)
return parsed_date
except:
try:
parsed_date = time.strptime(d, "%B %Y")
parsed_date = time.strftime('%Y-%m-01', parsed_date)
return parsed_date
except:
pass
try:
parsed_date = time.strptime(d, "%Y")
parsed_date = time.strftime('%Y-01-01', parsed_date)
return parsed_date
except:
pass
return d
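# illustrative conversions for _parseDate (assumed from the formats above):
#   '18 May 2008' -> '2008-05-18', 'May 2008' -> '2008-05-01', '2008' -> '2008-01-01';
# anything that matches none of the formats is returned unchanged.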
def getMovieReleaseDates(imdbId):
url = "%s/releaseinfo" % getUrlBase(imdbId)
data = getUrlUnicode(url)
releasedates = []
regexp = '''<tr><td>(.*?)</td>.*?<td align="right">(.*?)</td>.*?<td>(.*?)</td></tr>'''
for r in re.compile(regexp, re.DOTALL).findall(data):
r_ = (stripTags(r[0]).strip(),
_parseDate(stripTags(r[1]).strip()),
decodeHtml(stripTags(r[2]).strip()))
releasedates.append(r_)
return releasedates
def getMovieBusinessSum(imdbId):
business = getMovieBusiness(imdbId)
b_ = {'budget': 0, 'gross': 0, 'profit': 0}
if 'budget' in business:
b_['budget'] = sum([int(intValue(i.replace(',', ''))) for i in business['budget']])
if 'gross' in business:
b_['gross'] = sum([int(intValue(i.replace(',', ''))) for i in business['gross']])
if 'weekend gross' in business:
b_['gross'] += sum([int(intValue(i.replace(',', ''))) for i in business['weekend gross']])
if b_['budget'] and b_['gross']:
b_['profit'] = b_['gross'] - b_['budget']
return b_
def getMovieFilmingDates(imdbId):
business = getMovieBusiness(imdbId)
if 'filming dates' in business and business['filming dates']:
return business['filming dates'][0]
return ''
def getMovieBusiness(imdbId):
url = "%s/business" % getUrlBase(imdbId)
data = getUrlUnicode(url)
business = {}
for r in re.compile('''<h5>(.*?)</h5>(.*?)<br/>.<br/>''', re.DOTALL).findall(data):
key = stripTags(r[0]).strip().lower()
value = [decodeHtml(stripTags(b).strip()) for b in r[1].split('<br/>')]
business[key] = value
return business
def getMovieEpisodes(imdbId):
url = "%s/episodes" % getUrlBase(imdbId)
data = getUrlUnicode(url)
episodes = {}
regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
for r in re.compile(regexp, re.DOTALL).findall(data):
try:
episode = "S%02dE%02d" % (int(r[0]), int(r[1]))
episodes[episode] = {}
episodes[episode]['imdb'] = r[2]
episodes[episode]['title'] = r[3].strip()
if episodes[episode]['title'].startswith('Episode #%d'%int(r[0])):
episodes[episode]['title'] = u''
description = decodeHtml(r[5])
description = stripTags(description.split('Next US airings:')[0])
episodes[episode]['description'] = description.strip()
episodes[episode]['date'] = ''
try:
d = stripTags(r[4])
d = d.replace('Original Air Date: ', '')
d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
episodes[episode]['date'] = d
except:
pass
except:
import traceback
            traceback.print_exc()
pass
return episodes
'''the old code below'''
class IMDb:
def __init__(self, imdbId):
self.imdb = imdbId
self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
def getPage(self):
return getUrlUnicode(self.pageUrl)
def parse_raw_value(self, key, value):
if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
value = stripTags(value).strip()
if key == 'runtime':
parsed_value = findRe(value, '(.*?) min')
parsed_value = findRe(parsed_value, '([0-9]+)')
if not parsed_value:
parsed_value = findRe(value, '(.*?) sec')
parsed_value = findRe(parsed_value, '([0-9]+)')
if not parsed_value:
parsed_value = 0
else:
parsed_value = int(parsed_value)
else:
parsed_value = int(parsed_value) * 60
elif key in ('country', 'language'):
parsed_value = value.split(' / ')
if len(parsed_value) == 1:
parsed_value = parsed_value[0].split(' | ')
parsed_value = [v.strip() for v in parsed_value]
elif key == 'genre':
parsed_value = value.replace('more', '').strip().split(' / ')
if len(parsed_value) == 1:
parsed_value = parsed_value[0].split(' | ')
parsed_value = [v.strip() for v in parsed_value]
elif key == 'tagline':
parsed_value = value.replace('more', '').strip()
elif key == 'plot_outline':
parsed_value = value.replace('(view trailer)', '').strip()
if parsed_value.endswith('more'):
parsed_value = parsed_value[:-4].strip()
elif key == 'tv_series':
m = re.compile('<a href="/title/tt(.*?)/">(.*?)</a>').findall(value)
if m:
parsed_value = m[0][0]
else:
parsed_value = ''
elif key == 'also_known_as':
parsed_value = ''
m = re.compile('(.*) \(International: English title').findall(value)
if m:
parsed_value = m[0]
else:
m = re.compile('(.*) \(USA').findall(value)
if m:
parsed_value = m[0]
parsed_value = parsed_value.split('<br />')[-1].split('(')[0]
director = self.getCredits().get('director', None)
if director:
director = director[0]
parsed_value = parsed_value.replace(director, '')
if parsed_value.startswith("'s"):
parsed_value = parsed_value[2:].strip()
parsed_value = decodeHtml(parsed_value.strip())
else:
print value
parsed_value = value
return parsed_value
def parseTitle(self):
title = getMovieTitle(self.imdb)
title = normalizeTitle(title)
if title.startswith('"') and title.find('"',1) > 0 and \
title.find('"',1) == title.rfind('"'):
data = self.getPage()
se = re.compile("Season (\d*), Episode (\d*)\)").findall(data)
if se:
se = se[0]
se = ' (S%02dE%02d) ' % (int(se[0]), int(se[1]))
title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:].strip()
else:
part2 = title[title.rfind('"')+1:]
part2 = re.sub("[\d\?-]", "", part2).strip()
title = normalizeTitle(title[1:title.rfind('"')])
if part2:
title += ':' + part2
return normalizeTitle(title)
def parseYear(self):
year = ''
data = self.getPage()
soup = BeautifulSoup(data)
html_title = soup('div', {'id': 'tn15title'})
if not html_title:
html_title = soup('title')
if html_title:
html_title = unicode(html_title[0])
html_title = stripTags(html_title)
year = re.compile('\((\d{4})\)').findall(html_title)
if not year:
year = re.compile('\((\d{4})/').findall(html_title)
if year:
year = year[0]
else: year = ''
return year
def parse(self):
data = self.getPage()
IMDbDict ={}
#Poster
IMDbDict['poster'] = getMoviePoster(self.imdb)
if not IMDbDict['poster']:
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
#Title, Year
IMDbDict['year'] = self.parseYear()
IMDbDict['title'] = self.parseTitle()
#Rating
m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
if m:
IMDbDict['rating'] = int(float(m.group(1)) * 1000)
else:
IMDbDict['rating'] = -1
#Votes
m = re.compile('<small>\(<a href="ratings">(.*?) votes</a>\)</small>', re.IGNORECASE).findall(data)
if m:
IMDbDict['votes'] = int(m[0].replace(',', ''))
else:
IMDbDict['votes'] = -1
data = data.replace('\n',' ')
#some values
keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
for key in keys:
IMDbDict[key] = ''
IMDbDict['runtime'] = 0
soup = BeautifulSoup(data)
for info in soup('div', {'class': 'info'}):
key = unicode(info).split('</h5>')[0].split('<h5>')
if len(key) > 1:
raw_value = unicode(info).split('</h5>')[1]
key = key[1][:-1].lower().replace(' ', '_')
if key in keys:
IMDbDict[key] = self.parse_raw_value(key, raw_value)
IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title'])
#is episode
IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')
IMDbDict['episodes'] = getMovieEpisodes(self.imdb)
if IMDbDict['episodes']:
IMDbDict['tvshow'] = True
else:
IMDbDict['tvshow'] = False
IMDbDict['credits'] = self.getCredits()
IMDbDict['plot'] = getMoviePlot(self.imdb)
IMDbDict['keywords'] = getMovieKeywords(self.imdb)
IMDbDict['trivia'] = getMovieTrivia(self.imdb)
IMDbDict['connections'] = getMovieConnections(self.imdb)
IMDbDict['locations'] = getMovieLocations(self.imdb)
IMDbDict['release_date'] = getMovieReleaseDate(self.imdb)
IMDbDict['business'] = getMovieBusinessSum(self.imdb)
IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
IMDbDict['stills'] = getMovieStills(self.imdb)
#IMDbDict['trailer'] = getMovieTrailer(self.imdb)
self.IMDbDict = IMDbDict
if IMDbDict['episode_of']:
episode_of =IMDb(IMDbDict['episode_of']).parse()
for key in ('country', 'language'):
if not IMDbDict[key]:
IMDbDict[key] = episode_of[key]
return self.IMDbDict
def getCredits(self):
raw_credits = getMovieCredits(self.imdb)
credits = {}
def getNames(creditList):
return [stripTags(decodeHtml(c[0])) for c in creditList]
credits['director'] = getNames(raw_credits.get('directors', ''))
credits['writer'] = getNames(raw_credits.get('writers', ''))
credits['producer'] = getNames(raw_credits.get('producers', ''))
credits['cast'] = [(stripTags(decodeHtml(c[0])),stripTags(decodeHtml(c[1]))) for c in raw_credits.get('cast', [])]
self.credits = credits
return self.credits
def guess(title, director=''):
#FIXME: proper file -> title
title = title.split('-')[0]
title = title.split('(')[0]
title = title.split('.')[0]
title = title.strip()
imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))
return_url = ''
    # let's try google first
#i.e. site:imdb.com Michael Stevens Sin
if director:
search = 'site:imdb.com %s "%s"' % (director, title)
else:
search = 'site:imdb.com "%s"' % title
for (name, url, desc) in google.find(search, 2):
if url.startswith('http://www.imdb.com/title/tt'):
return url[28:35]
try:
req = urllib2.Request(imdb_url, None, oxutils.net.DEFAULT_HEADERS)
u = urllib2.urlopen(req)
data = u.read()
return_url = u.url
u.close()
except:
return None
if return_url.startswith('http://www.imdb.com/title/tt'):
return return_url[28:35]
if data:
imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
if imdb_id:
return imdb_id
imdb_url = 'http://www.imdb.com/find?q=%s;s=tt;site=aka' % quote(title.encode('utf-8'))
req = urllib2.Request(imdb_url, None, oxutils.net.DEFAULT_HEADERS)
u = urllib2.urlopen(req)
data = u.read()
return_url = u.url
u.close()
if return_url.startswith('http://www.imdb.com/title/tt'):
return return_url[28:35]
return None
def getEpisodeData(title, episode, show_url = None):
'''
Collect information about an episode.
Returns dict with title, show, description and episode
'''
episodeData = {
'title': u'',
'show': title,
'description': u'',
'episode': episode,
}
description = u''
if not show_url:
imdbid = guess(title)
else:
        imdbid = "%07d" % int(re.compile('title/tt(\d*)').findall(show_url)[0])
if imdbid:
i = IMDb(imdbid).parse()
episodeData['title'] = i['episodes'][episode]['title']
episodeData['description'] = i['episodes'][episode]['description']
episodeData['imdb'] = i['episodes'][episode]['imdb']
return episodeData
if __name__ == '__main__':
import sys
#print parse(sys.argv[1])
print "imdb:", guess(sys.argv[1])

89
oxweb/impawards.py Normal file

@@ -0,0 +1,89 @@
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
import re
from oxutils.cache import getUrlUnicode
from oxutils.html import stripTags
from oxutils.text import findRe
import imdb
def getMovieData(title = '', director = '', imdbId = ''):
data = {'posterUrls': []}
if not imdbId:
imdbId = imdb.getMovieId(title, director)
print imdbId
html = getUrlUnicode('http://impawards.com/archives/latest.html', timeout = 0)
pages = int(findRe(html, '<a href = page(.*?).html>'))
for page in range(pages + 1, 0, -1):
print page
if page <= pages:
html = getUrlUnicode('http://impawards.com/archives/page%s.html' % page, timeout = -1)
urls = parseArchivePage(html)
print urls
for url in urls:
html = getUrlUnicode(url)
d = parseMoviePage(html)
print d
if d['imdbId'] == imdbId:
data['posterUrls'].append(d['posterUrl'])
print d['posterUrl']
data['posterUrls'].sort()
return data
def parseArchivePage(html):
urls = []
results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
for result in results:
urls.append('http://impawards.com/%s' % result)
return urls
def parseMoviePage(html):
data = {}
data['imdbId'] = findRe(html, 'imdb.com/title/tt(.*?) ')
data['title'] = stripTags(findRe(html, '<table WIDTH="400" BGCOLOR="#222222">(.*?) \(<a href="eligible.html">'))
data['year'] = findRe(html, '\(<a href="eligible.html">(.*?)</a>\)')
result = findRe(html, '<a href = (\w*?_xlg.html) target= _blank>')
if result:
url = 'http://impawards.com/%s/%s' % (data['year'], result)
html = getUrlUnicode(url, timeout = -1)
d = parsePosterPage(html, data['year'])
data['posterUrl'] = d['posterUrl']
else:
data['posterUrl'] = 'http://impawards.com/%s/%s' % (data['year'], findRe(html, '<td align=center><br><img SRC="(.*?)"'))
return data
def parsePosterPage(html, year):
data = {}
data['posterUrl'] = 'http://impawards.com/%s/%s' % (year, findRe(html, '<img SRC="(.*?)"'))
return data
def archivePosters():
import os
from oxutils.net import getUrl
pathname = '/Volumes/Rolux Home/Desktop/Data/impawards.com'
html = getUrlUnicode('http://impawards.com/archives/latest.html', timeout = 0)
pages = int(findRe(html, '<a href = page(.*?).html>'))
for page in range(pages + 1, 0, -1):
if page <= pages:
html = getUrlUnicode('http://impawards.com/archives/page%s.html' % page, timeout = -1)
urls = parseArchivePage(html)
print urls
for url in urls:
html = getUrlUnicode(url)
data = parseMoviePage(html)
dirname = '%s/%s/%s' % (pathname, data['imdbId'][:4], data['imdbId'])
filename = '%s/%s' % (dirname, os.path.split(data['posterUrl'])[1])
if not os.path.exists(filename):
jpg = getUrl(data['posterUrl'])
if not os.path.exists(dirname):
os.makedirs(dirname)
f = open(filename, 'w')
f.write(jpg)
f.close()
if __name__ == '__main__':
archivePosters()
getMovieData('Brick', 'Rian Johnson')

187
oxweb/itunes.py Normal file

@@ -0,0 +1,187 @@
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
import re
import urllib
from oxutils.cache import getUrl
from oxutils.html import decodeHtml, stripTags
from oxutils.text import findRe
from oxutils.text import findString
# to sniff itunes traffic, use something like
# sudo tcpdump -i en1 -Avs 8192 host appleglobal.112.2o7.net
# http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?media=music&songTerm=&genreIndex=1&flavor=0&mediaType=2&composerTerm=&allArtistNames=Arcadia&ringtone=0&searchButton=submit
# http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?media=movie&movieTerm=The%20Matrix&descriptionTerm=&ratingIndex=1&mediaType=3&directorProducerName=Andy%20Wachowski&flavor=0&releaseYearTerm=1999&closedCaption=0&actorTerm=&searchButton=submit
ITUNES_HEADERS = {
'X-Apple-Tz': '0',
'X-Apple-Storefront': '143441-1',
'User-Agent': 'iTunes/7.6.2 (Macintosh; U; Intel Mac OS X 10.5.2)',
'Accept-Language': 'en-us, en;q=0.50',
'Accept-Encoding': 'gzip',
'Connection': 'close',
}
def composeUrl(request, parameters):
if request == 'advancedSearch':
url = 'http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?'
if parameters['media'] == 'music':
url += urllib.urlencode({
'albumTerm': parameters['title'],
'allArtistNames': parameters['artist'],
'composerTerm': '',
'flavor': 0,
'genreIndex': 1,
'media': 'music',
'mediaType': 2,
'ringtone': 0,
'searchButton': 'submit',
'songTerm': ''
})
elif parameters['media'] == 'movie':
url += urllib.urlencode({
'actorTerm': '',
'closedCaption': 0,
'descriptionTerm': '',
'directorProducerName': parameters['director'],
'flavor': 0,
'media': 'movie',
'mediaType': 3,
'movieTerm': parameters['title'],
'ratingIndex': 1,
'releaseYearTerm': '',
'searchButton': 'submit'
})
elif request == 'viewAlbum':
url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewAlbum?id=%s' % parameters['id']
elif request == 'viewMovie':
url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewMovie?id=%s&prvw=1' % parameters['id']
return url
def parseXmlDict(xml):
values = {}
strings = xml.split('<key>')
for string in strings:
if string.find('</key>') != -1:
key = findRe(string, '(.*?)</key>')
type = findRe(string, '</key><(.*?)>')
if type == 'true/':
value = True
else:
value = findRe(string, '<%s>(.*?)</%s>' % (type, type))
if type == 'integer':
value = int(value)
elif type == 'string':
value = decodeHtml(value)
values[key] = value
return values
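# illustrative input/output for parseXmlDict (assumed from the parsing above):
#   '<key>songName</key><string>Election Day</string><key>discNumber</key><integer>1</integer>'
#   -> {'songName': u'Election Day', 'discNumber': 1}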
def parseCast(xml, title):
list = []
try:
strings = findRe(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
strings.pop()
for string in strings:
list.append(findRe(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
return list
except:
return list
def parseMovies(xml, title):
list = []
try:
strings = findRe(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
strings.pop()
for string in strings:
list.append({
'id': findRe(string, 'viewMovie\?id=(.*?)&'),
'title': findRe(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
})
return list
except:
return list
class ItunesAlbum:
def __init__(self, id = '', title = '', artist = ''):
self.id = id
self.title = title
self.artist = artist
if not id:
self.id = self.getId()
def getId(self):
url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
xml = getUrl(url, headers = ITUNES_HEADERS)
id = findRe(xml, 'viewAlbum\?id=(.*?)&')
return id
def getData(self):
data = {'id': self.id}
url = composeUrl('viewAlbum', {'id': self.id})
xml = getUrl(url, None, ITUNES_HEADERS)
data['albumName'] = findRe(xml, '<B>(.*?)</B>')
data['artistName'] = findRe(xml, '<b>(.*?)</b>')
data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
data['genre'] = findRe(xml, 'Genre:(.*?)<')
data['releaseDate'] = findRe(xml, 'Released(.*?)<')
data['review'] = stripTags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
data['tracks'] = []
strings = findRe(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
for string in strings:
data['tracks'].append(parseXmlDict(string))
data['type'] = findRe(xml, '<key>listType</key><string>(.*?)<')
return data
class ItunesMovie:
def __init__(self, id = '', title = '', director = ''):
self.id = id
self.title = title
self.director = director
if not id:
self.id = self.getId()
def getId(self):
url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
xml = getUrl(url, headers = ITUNES_HEADERS)
id = findRe(xml, 'viewMovie\?id=(.*?)&')
return id
def getData(self):
data = {'id': self.id}
url = composeUrl('viewMovie', {'id': self.id})
xml = getUrl(url, None, ITUNES_HEADERS)
f = open('/Users/rolux/Desktop/iTunesData.xml', 'w')
f.write(xml)
f.close()
data['actors'] = parseCast(xml, 'actors')
string = findRe(xml, 'Average Rating:(.*?)</HBoxView>')
data['averageRating'] = string.count('rating_star_000033.png') + string.count('&#189;') * 0.5
data['directors'] = parseCast(xml, 'directors')
data['format'] = findRe(xml, 'Format:(.*?)<')
data['genre'] = decodeHtml(findRe(xml, 'Genre:(.*?)<'))
data['plotSummary'] = decodeHtml(findRe(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
data['posterUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
data['producers'] = parseCast(xml, 'producers')
data['rated'] = findRe(xml, 'Rated(.*?)<')
data['relatedMovies'] = parseMovies(xml, 'related movies')
data['releaseDate'] = findRe(xml, 'Released(.*?)<')
data['runTime'] = findRe(xml, 'Run Time:(.*?)<')
data['screenwriters'] = parseCast(xml, 'screenwriters')
data['soundtrackId'] = findRe(xml, 'viewAlbum\?id=(.*?)&')
data['trailerUrl'] = findRe(xml, 'autoplay="." url="(.*?)"')
return data
if __name__ == '__main__':
import simplejson
data = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').getData()
print simplejson.dumps(data, sort_keys = True, indent = 4)
data = ItunesMovie(title = 'The Matrix', director = 'Wachowski').getData()
print simplejson.dumps(data, sort_keys = True, indent = 4)
for v in data['relatedMovies']:
data = ItunesMovie(id = v['id']).getData()
print simplejson.dumps(data, sort_keys = True, indent = 4)
data = ItunesMovie(id='272960052').getData()
print simplejson.dumps(data, sort_keys = True, indent = 4)

21
oxweb/lyricsfly.py Normal file

@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from oxutils.cache import getUrl
from oxutils.html import decodeHtml
from oxutils.text import findRe
def getLyrics(title, artist):
html = getUrl('http://lyricsfly.com/api/')
key = findRe(html, '<font color=green><b>(.*?)</b></font>')
url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
xml = getUrl(url)
lyrics = findRe(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
lyrics = lyrics.replace('\n', '').replace('\r', '')
lyrics = lyrics.replace('[br]', '\n').strip()
    lyrics = lyrics.replace('\n\n\n', '\n\n')
lyrics = decodeHtml(lyrics.replace('&amp;', '&'))
return lyrics
if __name__ == '__main__':
print getLyrics('Election Day', 'Arcadia')

126
oxweb/mininova.py Normal file

@@ -0,0 +1,126 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import socket
from urllib import quote
from oxutils.cache import getUrl, getUrlUnicode
from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
from oxutils.normalize import normalizeImdbId
import oxutils
from torrent import Torrent
def _parseResultsPage(data, max_results=10):
results=[]
regexp = '''<tr><td>(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>'''
for row in re.compile(regexp, re.DOTALL).findall(data):
torrentDate = row[0]
torrentExtra = row[1]
torrentId = row[2]
torrentTitle = decodeHtml(row[3]).strip()
torrentLink = "http://www.mininova.org/tor/" + torrentId
privateTracker = 'priv.gif' in torrentExtra
if not privateTracker:
results.append((torrentTitle, torrentLink, ''))
return results
def findMovie(query, max_results=10):
'''search for torrents on mininova
'''
url = "http://www.mininova.org/search/%s/seeds" % quote(query)
data = getUrlUnicode(url)
return _parseResultsPage(data, max_results)
def findMovieByImdb(imdbId):
'''find torrents on mininova for a given imdb id
'''
results = []
imdbId = normalizeImdbId(imdbId)
data = getUrlUnicode("http://www.mininova.org/imdb/?imdb=%s" % imdbId)
return _parseResultsPage(data)
def getId(mininovaId):
mininovaId = unicode(mininovaId)
d = findRe(mininovaId, "/(\d+)")
if d:
return d
mininovaId = mininovaId.split('/')
if len(mininovaId) == 1:
return mininovaId[0]
else:
return mininovaId[-1]
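# getId accepts several forms (illustrative):
#   '123456', 'tor/123456' and 'http://www.mininova.org/tor/123456' all return '123456'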
def exists(mininovaId):
mininovaId = getId(mininovaId)
data = oxutils.net.getUrl("http://www.mininova.org/tor/%s" % mininovaId)
if not data or 'Torrent not found...' in data:
return False
if 'tracker</a> of this torrent requires registration.' in data:
return False
return True
def getData(mininovaId):
_key_map = {
'by': u'uploader',
}
mininovaId = getId(mininovaId)
torrent = dict()
torrent[u'id'] = mininovaId
torrent[u'domain'] = 'mininova.org'
torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId
data = getUrlUnicode(torrent['comment_link']) + getUrlUnicode(torrent['details_link'])
if '<h1>Torrent not found...</h1>' in data:
return None
for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decodeHtml(stripTags(d[1].strip()))
torrent[key] = value
torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
if torrent['description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = getUrl(torrent[u'torrent_link'])
torrent[u'torrent_info'] = getTorrentInfo(t)
return torrent
class Mininova(Torrent):
'''
>>> Mininova('123')
{}
>>> Mininova('1072195')['infohash']
'72dfa59d2338e4a48c78cec9de25964cddb64104'
'''
def __init__(self, mininovaId):
self.data = getData(mininovaId)
if not self.data:
return
Torrent.__init__(self)
ratio = self.data['share ratio'].split(',')
self['seeder'] = -1
self['leecher'] = -1
if len(ratio) == 2:
val = intValue(ratio[0].replace(',','').strip())
if val:
self['seeder'] = int(val)
val = intValue(ratio[1].replace(',','').strip())
if val:
self['leecher'] = int(val)
val = intValue(self.data['downloads'].replace(',','').strip())
if val:
self['downloaded'] = int(val)
else:
self['downloaded'] = -1
published = self.data['added on']
published = published.split(' +')[0]
self['published'] = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S")

41
oxweb/opensubtitles.py Normal file

@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import feedparser
from oxutils.cache import getUrl, getUrlUnicode
import oxutils
from oxutils.lang import langCode2To3, langTo3Code
def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
if len(language) == 2:
language = langCode2To3(language)
elif len(language) != 3:
language = langTo3Code(language)
url = "http://www.opensubtitles.org/en/search/"
if language:
url += "sublanguageid-%s/" % language
url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
data = getUrl(url)
if "title>opensubtitles.com - search results</title" in data:
fd = feedparser.parse(data)
opensubtitleId = None
if fd.entries:
link = fd.entries[0]['links'][0]['href']
opensubtitleId = re.compile('subtitles/(.*?)/').findall(link)
if opensubtitleId:
opensubtitleId = opensubtitleId[0]
else:
opensubtitleId = oxutils.findRe(data, '/en/subtitles/(.*?)/')
return opensubtitleId
def downloadSubtitleById(opensubtitle_id):
srts = {}
data = getUrl('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
for f in re.compile(reg_exp, re.DOTALL).findall(data):
name = oxutils.stripTags(f[1]).split('\n')[0]
url = "http://www.opensubtitles.com%s" % f[0]
srts[name] = getUrlUnicode(url)
return srts
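A minimal usage sketch (an editor's illustration; result markup and language codes are assumed to behave as above):

if __name__ == '__main__':
    # find and download English subtitles for The Matrix (tt0133093)
    opensubtitleId = findSubtitlesByImdb('0133093', parts=1, language='en')
    if opensubtitleId:
        srts = downloadSubtitleById(opensubtitleId)
        print srts.keys()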

293
oxweb/spiegel.py Normal file

@@ -0,0 +1,293 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import time
from BeautifulSoup import BeautifulSoup
import oxutils.cache
from oxutils.html import decodeHtml, stripTags
import oxutils.net
def getNews(year, month, day):
sections = [
'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
]
dt = datetime(year, month, day)
day = int(dt.strftime('%j'))
date = dt.strftime('%d.%m.%Y')
news = []
for section in sections:
url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day)
if date == time.strftime('%d.%m.%Y', time.localtime()):
html = oxutils.net.getUrl(url)
else:
html = oxutils.cache.getUrl(url)
for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
dateString = stripTags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
try:
description = formatString(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
except:
description = ''
try:
imageUrl = re.compile('<img src="(.*?)"').findall(item)[0]
except:
imageUrl = ''
try:
title = formatString(re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL).findall(item)[0]).replace(' : ', ': ').replace('::', ':')
except:
title = ''
if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:
new = {}
if len(dateString) == 10:
new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
else:
new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
# fix decodeHtml
# new['description'] = formatString(decodeHtml(description))
new['description'] = formatString(description)
new['imageUrl'] = imageUrl
new['section'] = formatSection(section)
new['title'] = formatString(title)
new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(formatString(re.compile('<h4>(.*?)</h4>', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf')
if new['title1'][-1:] == ':':
new['title1'] = new['title1'][0:-1]
new['title2'] = new['title'][len(new['title1']) + 2:]
new['url'] = re.compile('<a href="(.*?)"').findall(item)[0]
if new['url'][:1] == '/':
new['url'] = 'http://www.spiegel.de' + new['url']
news.append(new)
# print '%s, %s' % (new['section'], dateString)
'''
elif dateString[:10] == date and not description:
print dateString + ' - no description'
elif dateString[:10] == date and not imageUrl:
print dateString + ' - no image'
'''
return news
def splitTitle(title):
title1 = re.compile('(.*?): ').findall(title)[0]
title2 = re.compile(': (.*?)$').findall(title)[0]
return [title1, title2]
def formatString(string):
string = string.replace('<span class="spOptiBreak"> </span>', '')
    string = string.replace('\n', ' ').replace('  ', ' ').strip()
string = string.replace('&amp;', '&').replace('&apos;', '\'').replace('&quot;', '"')
return string
def formatSection(string):
return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')
def formatSubsection(string):
# SPIEGEL, SPIEGEL special
subsection = {
'abi': 'Abi - und dann?',
'formel1': 'Formel 1',
'jobundberuf': 'Job & Beruf',
'leben': 'Leben U21',
'mensch': 'Mensch & Technik',
'sonst': '',
'staedte': u'St\xc3dte',
'ussports': 'US-Sports',
'wunderbar': 'wunderBAR'
}
if subsection.has_key(string):
return subsection[string].replace(u'\xc3', 'ae')
return string[:1].upper() + string[1:]
def getIssue(year, week):
coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
if not oxutils.net.exists(coverUrl):
return None
url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
contents = []
soup = BeautifulSoup(oxutils.cache.getUrl(url))
for item in soup('a', {'href': re.compile('http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')}):
item = str(item)
page = int(re.compile('&amp;SE=(.*?)"').findall(item)[0])
title = stripTags(item).strip()
contents.append({'title': title, 'page': page})
pageUrl = {}
pages = page + 2
for page in range(1, pages + 10):
url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
if oxutils.cache.exists(url):
pageUrl[page] = url
else:
pageUrl[page] = ''
return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}
def archiveIssues():
'''
this is just an example of an archiving application
'''
p = {}
import os
import simplejson
import time
archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Der Spiegel'
localtime = time.localtime()
year = int(time.strftime('%Y', localtime))
week = int(time.strftime('%W', localtime))
for y in range(year, 1993, -1):
if y == year:
wMax = week + 1
else:
wMax = 53
for w in range(wMax, 0, -1):
print 'getIssue(%d, %d)' % (y, w)
issue = getIssue(y, w)
if issue:
dirname = '%s/%d/%02d' % (archivePath, y, w)
if not os.path.exists(dirname):
os.makedirs(dirname)
filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w)
if not os.path.exists(filename):
data = simplejson.dumps(issue, ensure_ascii = False)
f = open(filename, 'w')
f.write(data)
f.close()
filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w)
if not os.path.exists(filename):
data = []
for item in issue['contents']:
data.append('%3d %s' % (item['page'], item['title']))
data = '\n'.join(data)
f = open(filename, 'w')
f.write(data)
f.close()
filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
if not os.path.exists(filename):
data = oxutils.cache.getUrl(issue['coverUrl'])
f = open(filename, 'w')
f.write(data)
f.close()
for page in issue['pageUrl']:
url = issue['pageUrl'][page]
if url:
filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
if not os.path.exists(filename):
data = oxutils.cache.getUrl(url)
f = open(filename, 'w')
f.write(data)
f.close()
if not p:
p = {'num': 1, 'sum': issue['pages'], 'min': issue['pages'], 'max': issue['pages']}
else:
p['num'] += 1
p['sum'] += issue['pages']
if issue['pages'] < p['min']:
p['min'] = issue['pages']
if issue['pages'] > p['max']:
p['max'] = issue['pages']
print p['min'], p['sum'] / p['num'], p['max']
def archiveNews():
'''
this is just an example of an archiving application
'''
import os
import simplejson
import time
count = {}
colon = []
archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'
days = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
localtime = time.localtime()
year = int(time.strftime('%Y', localtime))
month = int(time.strftime('%m', localtime))
day = int(time.strftime('%d', localtime)) - 1
for y in range(year, 1999, -1):
if y == year:
mMax = month
else:
mMax = 12
for m in range(mMax, 0, -1):
if y == year and m == month:
dMax = day
            elif m == 2 and y % 4 == 0 and (y % 100 != 0 or y % 400 == 0):
dMax = days[m] + 1
else:
dMax = days[m]
for d in range(dMax, 0, -1):
print 'getNews(%d, %d, %d)' % (y, m, d)
                news = getNews(y, m, d)
for new in news:
dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
if not os.path.exists(dirname):
os.makedirs(dirname)
if new['url'][-5:] == '.html':
filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
else:
filename = dirname + '/' + new['url'] + '.json'
if not os.path.exists(filename) or True:
data = simplejson.dumps(new, ensure_ascii = False)
f = open(filename, 'w')
f.write(data)
f.close()
filename = filename[:-5] + '.txt'
if not os.path.exists(filename) or True:
data = splitTitle(new['title'])
data.append(new['description'])
data = '\n'.join(data)
f = open(filename, 'w')
f.write(data)
f.close()
filename = dirname + '/' + new['imageUrl'].split('/')[-1]
if not os.path.exists(filename):
data = oxutils.cache.getUrl(new['imageUrl'])
f = open(filename, 'w')
f.write(data)
f.close()
strings = new['url'].split('/')
string = strings[3]
if len(strings) == 6:
string += '/' + strings[4]
if not count.has_key(string):
count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
else:
count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
strings = splitTitle(new['title'])
if strings[0] != new['title1'] or strings[1] != new['title2']:
colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2']))
for key in sortDictByKey(count):
print '%6d %-24s %s' % (count[key]['count'], key, count[key]['string'])
for value in colon:
print value
def sortDictByKey(d):
keys = d.keys()
keys.sort()
return keys
if __name__ == '__main__':
# spiegel = Spiegel(2008, 8)
# print spiegel.getContents()
# news = News(2001, 9, 10)
# output(news.getNews())
'''
x = []
for d in range(10, 30):
print '2/%d' % d
news = getNews(2008, 2, d)
for new in news:
strings = new['url'].split('/')
string = formatSection(strings[3])
if len(strings) == 6:
string += '/' + formatSubsection(strings[4])
if not string in x:
x.append(string)
print x
'''
# archiveIssues()
archiveNews()

118
oxweb/thepiratebay.py Normal file

@@ -0,0 +1,118 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import socket
from urllib import quote, urlencode
from urllib2 import URLError
from oxutils.cache import getUrl, getUrlUnicode
from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
from oxutils.normalize import normalizeImdbId
import oxutils
from torrent import Torrent
season_episode = re.compile("S..E..", re.IGNORECASE)
def _getUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout):
headers = cache.DEFAULT_HEADERS
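    # pin the page language so the scraping regexps below keep matching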
headers['Cookie'] = 'language=en_EN'
return cache.getUrl(url, data, headers, timeout)
def _getUrlUnicode(url):
return cache.getUrlUnicode(url, _getUrl=_getUrl)
def findMovies(query, max_results=10):
results = []
next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
page_count = 1
while next and page_count < 4:
page_count += 1
url = next[0]
if not url.startswith('http'):
if not url.startswith('/'):
url = "/" + url
url = "http://thepiratebay.org" + url
data = _getUrlUnicode(url)
regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/tor/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
for row in re.compile(regexp, re.DOTALL).findall(data):
torrentType = row[0]
torrentLink = "http://thepiratebay.org" + row[1]
torrentTitle = decodeHtml(row[2])
# 201 = Movies , 202 = Movie DVDR, 205 TV Shows
if torrentType in ['201']:
results.append((torrentTitle, torrentLink, ''))
if len(results) >= max_results:
return results
next = re.compile('<a.*?href="(.*?)".*?>.*?next.gif.*?</a>').findall(data)
return results
def findMovieByImdb(imdb):
return findMovies("tt" + normalizeImdbId(imdb))
def getId(piratebayId):
if piratebayId.startswith('http://torrents.thepiratebay.org/'):
piratebayId = piratebayId.split('org/')[1]
d = findRe(piratebayId, "tor/(\d+)")
if d:
piratebayId = d
return piratebayId
def exists(piratebayId):
piratebayId = getId(piratebayId)
return oxutils.net.exists("http://thepiratebay.org/tor/%s" % piratebayId)
def getData(piratebayId):
_key_map = {
'spoken language(s)': u'language',
'texted language(s)': u'subtitle language',
'by': u'uploader',
'leechers': 'leecher',
'seeders': 'seeder',
}
piratebayId = getId(piratebayId)
torrent = dict()
torrent[u'id'] = piratebayId
torrent[u'domain'] = 'thepiratebay.org'
torrent[u'comment_link'] = 'http://thepiratebay.org/tor/%s' % piratebayId
data = _getUrlUnicode(torrent['comment_link'])
torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
if not torrent[u'title']:
return None
torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
title = quote(torrent['title'].encode('utf-8'))
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decodeHtml(stripTags(d[1].strip()))
torrent[key] = value
torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
if torrent[u'description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = _getUrl(torrent[u'torrent_link'])
torrent[u'torrent_info'] = getTorrentInfo(t)
return torrent
class Thepiratebay(Torrent):
'''
>>> Thepiratebay('123')
{}
>>> Thepiratebay('3951349')['infohash']
'4e84415d36ed7b54066160c05a0b0f061898d12b'
'''
def __init__(self, piratebayId):
self.data = getData(piratebayId)
if not self.data:
return
Torrent.__init__(self)
published = self.data['uploaded']
published = published.replace(' GMT', '').split(' +')[0]
self['published'] = datetime.strptime(published, "%Y-%m-%d %H:%M:%S")

37
oxweb/torrent.py Normal file

@@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from oxutils import intValue
class Torrent(dict):
'''
>>> Torrent()
{'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1}
'''
_string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link',
'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language')
_int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files')
_dict_keys = ('torrent_info', )
_list_keys = ()
data = {'torrent_info': {}}
def __init__(self):
for key in self._string_keys:
self[key] = self.data.get(key, u'')
for key in self._dict_keys:
self[key] = self.data.get(key, {})
for key in self._list_keys:
self[key] = self.data.get(key, [])
for key in self._int_keys:
value = self.data.get(key, -1)
if not isinstance(value, int):
value = int(intValue(value))
self[key] = value
self['infohash'] = self.data['torrent_info'].get('hash', '')
self['size'] = self.data['torrent_info'].get('size', -1)
self['announce'] = self.data['torrent_info'].get('announce', '')
if 'files' in self.data['torrent_info']:
self['files'] = len(self.data['torrent_info']['files'])
else:
self['files'] = 1
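A minimal subclass sketch (hypothetical; the real subclasses are Mininova and Thepiratebay above) to show the contract: fill in self.data, then call Torrent.__init__:

class ExampleTorrent(Torrent):
    def __init__(self, data):
        # self.data must provide at least a 'torrent_info' dict, as getData() returns
        self.data = data
        Torrent.__init__(self)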

72
oxweb/wikipedia.py Normal file

@@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from urllib import urlencode
import simplejson
from oxutils.cache import getUrl, getUrlUnicode
from oxutils import findRe, decodeHtml
def getMovieId(title, director='', year=''):
query = '"%s" film %s %s' % (title, director, year)
result = find(query, 1)
if result:
return result[0][1]
return ''
def getUrlByImdb(imdbId):
query = '"imdb_id = %s"'% imdbId
result = find(query)
if result:
url = result[0][1]
return url
if str(imdbId).startswith('0'):
imdbId = imdbId[1:]
return getUrlByImdb(imdbId)
def getUrlByAmgId(amg_id):
query = '"amg_id = %s"'% amg_id
result = find(query)
if result:
url = result[0][1]
return url
return ''
def getWikiData(wikipediaUrl):
title = wikipediaUrl.replace('http://en.wikipedia.org/wiki/', '')
url = "http://en.wikipedia.org/w/index.php?title=%s&action=edit" % title
html = getUrlUnicode(url)
data = decodeHtml(findRe(html, "<textarea.*?>(.*?)</textarea>"))
return data
def getMovieData(wikipediaUrl):
data = getWikiData(wikipediaUrl)
filmbox_data = findRe(data, '''\{\{Infobox Film(.*?)\}\}''')
filmbox = {}
for row in filmbox_data.strip().split('|'):
d = row.split('=')
if len(d) == 2:
key = d[0].strip()
value = d[1].strip()
filmbox[key] = value
return filmbox
def getAmgId(wikipediaUrl):
data = getMovieData(wikipediaUrl)
return data.get('amg_id', '')
def find(query, max_results=10):
query = {'action': 'query', 'list':'search', 'format': 'json',
'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
url = "http://en.wikipedia.org/w/api.php?" + urlencode(query)
data = getUrl(url)
if not data:
data = getUrl(url, timeout=0)
result = simplejson.loads(data)
results = []
for r in result['query']['search']:
title = r['title']
url = "http://en.wikipedia.org/wiki/%s" % title.replace(' ', '_')
results.append((title, url, ''))
return results
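A minimal usage sketch (an editor's illustration, assuming the MediaWiki search API responds as above):

if __name__ == '__main__':
    # resolve an imdb id to its wikipedia page, then parse the film infobox
    url = getUrlByImdb('0133093')
    print url
    print getMovieData(url)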

56
oxweb/youtube.py Normal file

@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from urllib import quote
import xml.etree.ElementTree as ET
import feedparser
from oxutils.cache import getUrl
from oxutils import findString
def getVideoUrl(youtubeId, format='mp4'):
url = 'http://www.youtube.com/api2_rest?method=youtube.videos.get_video_token&video_id=' + youtubeId
data = getUrl(url)
xml = ET.fromstring(data)
youtubeKey = xml.find('t').text
if format == 'mp4':
fmt=18
url = "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s"%(youtubeId, youtubeKey, fmt)
else:
url = "http://youtube.com/get_video.php?video_id=%s&t=%s"%(youtubeId, youtubeKey)
return url
def getMovieInfo(youtubeId):
    url = "http://gdata.youtube.com/feeds/api/videos/%s" % youtubeId
data = getUrl(url)
fd = feedparser.parse(data)
return getInfoFromAtom(fd.entries[0])
def getInfoFromAtom(entry):
info = dict()
info['title'] = entry['title']
info['description'] = entry['description']
info['author'] = entry['author']
info['published'] = entry['published_parsed']
info['keywords'] = entry['media_keywords'].split(', ')
info['url'] = entry['links'][0]['href']
info['id'] = findString(info['url'], "/watch?v=")
info['thumbnail'] = "http://img.youtube.com/vi/%s/0.jpg" % info['id']
info['flv'] = getVideoUrl(info['id'], 'flv')
info['mp4'] = getVideoUrl(info['id'], 'mp4')
info['embed'] = '''<object width="425" height="355"><param name="movie" value="http://www.youtube.com/v/%s&hl=en"></param><param name="wmode" value="transparent"></param><embed src="http://www.youtube.com/v/%s&hl=en" type="application/x-shockwave-flash" wmode="transparent" width="425" height="355"></embed></object>''' % (info['id'], info['id'])
return info
def find(query, max_results=10, offset=1, orderBy='relevance'):
query = quote(query)
url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s"%(query, orderBy, offset, max_results)
data = getUrl(url)
fd = feedparser.parse(data)
videos = []
for entry in fd.entries:
v = getInfoFromAtom(entry)
videos.append(v)
if len(videos) >= max_results:
return videos
return videos
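A minimal usage sketch (an editor's illustration; the old GData endpoints are assumed to still respond):

if __name__ == '__main__':
    # search for videos and print title and flv url for each match
    for video in find(u'priere pour refuznik', 2):
        print video['title'], video['flv']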