cleanup

2010-08-24 19:08:03 +02:00 · 2010-08-24 19:08:03 +02:00 · ef20041185
commit ef20041185
parent 942cb819c7
4 changed files with 26 additions and 199 deletions
--- a/oxweb/aaaarg.py
+++ b/oxweb/aaaarg.py
@@ -1,61 +0,0 @@
-# -*- coding: utf-8 -*-
-# vi:si:et:sw=4:sts=4:ts=4
-import re
-import os
-import string
-
-from oxlib import cache
-from oxlib.html import stripTags, decodeHtml
-from oxlib.text import findRe
-from oxlib.normalize import canonicalName
-import auth
-
-
-def readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
-    headers = headers.copy()
-    headers["Cookie"] = auth.get("aaaarg.cookie")
-    return cache.readUrl(url, data, headers, timeout)
-
-def readUrlUnicode(url, timeout=cache.cache_timeout):
-   return cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)
-
-def downloadText(id, filename=None):
-    #FIXME, what about the cache, this keeps all pdfs in oxcache...
-    url='http://a.aaaarg.org/node/%d/download' % id
-    data = readUrl(url, timeout=-1)
-    headers = cache.getHeaders(url, timeout=-1)
-    if filename:
-        with open(filename, "w") as f:
-            f.write(data)
-        return
-    return data
-
-def getTextByLetter(letter):
-    texts = []
-    url = 'http://a.aaaarg.org/library/%s' % letter
-    data = readUrlUnicode(url)
-    txts = re.compile('<li class="author">(.*?)</li><li class="title"><a href="(.*?)">(.*?)</a></li>').findall(data)
-    author = 'Unknown Author'
-    for r in txts:
-        if r[0] != '&nbsp;':
-            author = r[0]
-        link = r[1]
-        id = findRe(link, '/(\d+)')
-        title = decodeHtml(r[2])
-        author_foder =  canonicalName(author)
-        author_foder = os.path.join(author_foder[0], author_foder)
-        filename = os.path.join(author_foder, '%s (aaarg %s).pdf' %  (title.replace('/', '_'), id))
-        texts.append({
-            'author': author,
-            'title': title,
-            'id': id,
-            'filename': filename,
-         })
-    return texts
-
-def getTexts():
-    texts = []
-    for letter in string.letters[:26]:
-        texts += getTextByLetter(letter)
-    return texts
-
--- a/oxweb/auth.py
+++ b/oxweb/auth.py
@@ -2,7 +2,7 @@
 # vi:si:et:sw=4:sts=4:ts=4
 # GPL 2009
 import os
-import simplejson
+import simplejson as json


 def get(key):
@@ -12,9 +12,22 @@ def get(key):
        f = open(user_auth, "r")
        data = f.read()
        f.close()
-        auth = simplejson.loads(data)
+        auth = json.loads(data)
    if key in auth:
        return auth[key]
    print "please add key %s to json file '%s'" % (key, user_auth)
    return ""

+def update(key, value):
+    user_auth = os.environ.get('oxAUTH', os.path.expanduser('~/.ox/auth.json'))
+    auth = {}
+    if os.path.exists(user_auth):
+        f = open(user_auth, "r")
+        data = f.read()
+        f.close()
+        auth = json.loads(data)
+    auth[key] = value
+    f = open(user_auth, "w")
+    f.write(json.dumps(auth, indent=2))
+    f.close()
+    
--- a/oxweb/imdb.py
+++ b/oxweb/imdb.py
@@ -368,18 +368,21 @@ def getMoviePosters(imdbId):
  
 def getMovieTrivia(imdbId):
    url = "%strivia" % getUrlBase(imdbId)
-    data = readUrlUnicode(url)
-    data = findRe(data, '<ul class="trivia">(.*?)</ul>')
+    data_ = readUrlUnicode(url)
+    data = findRe(data_, '<ul class="trivia">(.*?)</ul>')
    trivia = re.compile('<li>(.*?)</li>', re.DOTALL).findall(data)
    def clean(t):
        t = decodeHtml(t)
-        t = t.replace(u'', '"')
-        if t.endswith('<br><br>'):
-            t = t[:-8]
-        if t.endswith('<br>\n<br>'):
-            t = t[:-len('<br>\n<br>')]
+        t = t.replace(u'', '"').strip()
+        for s in ('<br><br>', '<br>\n<br>', '<br>'):
+            if t.endswith(s):
+                t = t[:-len(s)].strip()
        return t.strip()
+    if len(trivia) == 0:
+        trivia = re.compile('<div class="sodatext">(.*?)<span', re.DOTALL).findall(data_)
+
    trivia = [clean(t) for t in trivia]
+
    return trivia

 def getMovieConnections(imdbId):
--- a/oxweb/karagarga.py
+++ b/oxweb/karagarga.py
@@ -1,128 +0,0 @@
-# -*- coding: utf-8 -*-
-# vi:si:et:sw=4:sts=4:ts=4
-import re
-from oxlib import cache
-from oxlib.html import stripTags
-from oxlib.text import findRe
-
-import auth
-
-
-def readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
-    headers = headers.copy()
-    headers["Cookie"] = auth.get("karagarga.cookie")
-    return cache.readUrl(url, data, headers, timeout)
-
-def readUrlUnicode(url, timeout=cache.cache_timeout):
-   return cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)
-
-def getData(id):
-    data = {
-        "url": getUrl(id)
-    }
-    html = readUrlUnicode("%s%s" % (data["url"], "&filelist=1"))
-    if 'No torrent with ID' in html:
-        return False
-    data['added'] = stripTags(parseTable(html, 'Added'))
-    data['country'] = findRe(html, 'title="([\w ]*?)" border="0" width="32" height="20"')
-    # data['description'] = parseTable(html, 'Description')
-    data['director'] = stripTags(parseTable(html, 'Director / Artist'))
-    data['files'] = []
-    result = findRe(html, '<table class=main border="1" cellspacing=0 cellpadding="5">(.*?)</table>')
-    results = re.compile('<td>(.*?)</td><td align="right">(.*?)</td>', re.DOTALL).findall(result)
-    for name, size in results:
-        data['files'].append({
-            'name': name,
-            'size': '%s %s' % (size[:-2], size[-2:].strip().upper())
-        })
-    data['format'] = ''
-    if html.find('genreimages/dvdr.png') != -1:
-        data['format'] = 'DVD'
-    elif html.find('genreimages/hdrip.png') != -1:
-        data['format'] = 'HD'
-    data['genre'] = []
-    result = parseTable(html, 'Genres')
-    for string in result.split('\n'):
-        string = stripTags(findRe(string, '<a href="browse.php\?genre=.*?">(.*?)</a>'))
-        if string:
-            data['genre'].append(string)
-    data['id'] = id
-    data['imdbId'] = findRe(html, 'imdb.com/title/tt(\d{7})')
-    data['language'] = stripTags(parseTable(html, 'Language'))
-    data['leechers'] = int(findRe(html, 'seeder\(s\), (.*?) leecher\(s\)'))
-    data['link'] = stripTags(parseTable(html, 'Internet Link'))
-    data['links'] = []
-    results = re.compile('<a href="(.*?)">(.*?)</a>', re.DOTALL).findall(parseTable(html, 'Description'))
-    for (url, title) in results:
-        if url.find('javascript') == -1:
-            data['links'].append({
-                'title': title,
-                'url': url.replace('http://anonym.to/?', '')
-            })
-    data['people'] = 0
-    result = stripTags(findRe(html, '<a href="top10others.php.*?>(.*?) people')).strip()
-    if result:
-        data['people'] = int(result)
-    data['posters'] = []
-    results = re.compile('<img border=0 src="(http://.*?)"', re.DOTALL).findall(html)
-    for result in results:
-        data['posters'].append(result)
-    data['seeders'] = int(findRe(html, '#seeders" class="sublink".*?colspan=2>(.*?) seeder\(s\)'))
-    data['size'] = int(findRe(parseTable(html, 'Size'), '\((.*?) ').replace(',', ''))
-    data['snatched'] = int(findRe(html, '<a name="snatchers">.*?colspan=2>(.*?) '))
-    data['subtitle'] = findRe(parseTable(html, 'Subtitles'), '>(.*?)<hr>').replace('included: ', '')
-    data['subtitles'] = []
-    results = re.compile('<a href="(.*?)">(.*?)</a>', re.DOTALL).findall(parseTable(html, 'Subtitles'))
-    for (url, language) in results:
-        data['subtitles'].append({
-            'language': language.replace('click here for ', ''),
-            'url': url
-        })
-    data['torrent'] = 'http://karagarga.net/%s' % findRe(html, '(down.php/.*?)"')
-    data['year'] = stripTags(parseTable(html, 'Year'))
-    data['title'] = stripTags(findRe(html, '<h1>(.*?)</h1>')).strip()
-    data['title'] = re.sub('^%s - ' % re.escape(data['director']), '', data['title'])
-    data['title'] = re.sub(' \(%s\)$' % re.escape(data['year']), '', data['title'])    
-    return data
-
-def getId(url):
-    return url.split("=")[-1]
-
-def getTorrent(id):
-    return readUrl(getData(id)['torrent'])
-
-def getIds(lastId = 20):
-    lastId = '%s' % lastId
-    ids = []
-    page = 0
-    while True:
-        for id in getIdsByPage(page):
-            if not id in ids:
-                ids.append(id)
-        if lastId in ids:
-            break
-        page += 1
-    return map(lambda id: str(id), sorted(map(lambda id: int(id), set(ids))))
-
-def getIdsByPage(page):
-    ids = []
-    url = 'http://karagarga.net/browse.php?page=%s&cat=1&sort=added&d=DESC' % page
-    html = readUrlUnicode(url, timeout = 23*60*60) #get new ids once per day
-    strings = html.split('<td width="42" style="padding:0px;">')
-    strings.pop(0)
-    for string in strings:
-        ids.append(findRe(string, '"details.php\?id=(.*?)"'))
-    return ids
-
-def getUrl(id):
-    return "http://karagarga.net/details.php?id=%s" % id
-
-def parseTable(html, title):
-    if title == 'Genres':
-        return findRe(html, '<td class="heading" [\w=" ]*?>%s</td>(.*?)</table>' % title)
-    else:
-        return findRe(html, '<td class="heading" [\w=" ]*?>%s</td>(.*?)</td>' % title)
-
-if __name__ == "__main__":
-    print getIds("79317")
-    print getData("79317")