add aaarg

2010-02-15 12:38:42 +05:30 · 2010-02-15 12:38:42 +05:30 · 686bf450e4
commit 686bf450e4
parent 9899cf89a1
2 changed files with 63 additions and 0 deletions
--- a/oxweb/aaaarg.py
+++ b/oxweb/aaaarg.py
@@ -0,0 +1,61 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
 import re
 import os
 import string
 from oxlib import cache
 from oxlib.html import stripTags, decodeHtml
 from oxlib.text import findRe
 from oxlib.normalize import canonicalName
 import auth
 def readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
     """Fetch url via cache.readUrl with the stored aaaarg cookie attached.

     The given headers mapping is copied before the "Cookie" entry is set,
     so the caller's mapping (including the shared default) is left
     untouched. The cookie value comes from auth.get("aaaarg.cookie").

     NOTE(review): valid is accepted but never forwarded to cache.readUrl;
     confirm whether it should be.
     """
     request_headers = headers.copy()
     request_headers["Cookie"] = auth.get("aaaarg.cookie")
     return cache.readUrl(url, data, request_headers, timeout)
 def readUrlUnicode(url, timeout=cache.cache_timeout):
     """Return cache.readUrlUnicode(url), fetching through this module's readUrl.

     Passing readUrl as _readUrl means the request carries the aaaarg
     cookie that readUrl adds.
     """
     text = cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)
     return text
 def downloadText(id, filename=None):
     """Download the file behind aaaarg node id.

     id may be an int or a numeric string; it is coerced with int() so that
     ids extracted from page links by a regex (presumably strings) can be
     passed in directly.

     If filename is given, the downloaded data is written to that path in
     binary mode and None is returned. Otherwise the data is returned.
     """
     #FIXME, what about the cache, this keeps all pdfs in oxcache...
     url = 'http://a.aaaarg.org/node/%d/download' % int(id)
     data = readUrl(url, timeout=-1)
     # NOTE(review): the headers returned here were never used; the call is
     # kept only in case it has a caching side effect - confirm and remove.
     cache.getHeaders(url, timeout=-1)
     if filename:
         # The payload is a binary document, so it must not go through
         # text-mode newline translation or encoding.
         with open(filename, "wb") as f:
             f.write(data)
         return
     return data
 def getTextByLetter(letter):
     """List the texts found on the aaaarg library page for letter.

     Returns a list of dicts with the keys 'author', 'title', 'id' and
     'filename'. A row whose author cell is '&nbsp;' reuses the author of
     the previous row; 'Unknown Author' is used until a real author is seen.

     'filename' is a relative path of the form
     <first char of canonical author>/<canonical author>/<title> (aaarg <id>).pdf
     Path separators inside the title are replaced with '_' in the filename
     only, so a title cannot introduce extra directory levels; the 'title'
     value itself is returned unchanged.
     """
     url = 'http://a.aaaarg.org/library/%s' % letter
     data = readUrlUnicode(url)
     rows = re.compile(r'<li class="author">(.*?)</li><li class="title"><a href="(.*?)">(.*?)</a></li>').findall(data)
     texts = []
     author = 'Unknown Author'
     for author_cell, link, raw_title in rows:
         if author_cell != '&nbsp;':
             author = author_cell
         text_id = findRe(link, r'/(\d+)')
         title = decodeHtml(raw_title)
         author_folder = canonicalName(author)
         # [:1] rather than [0]: an empty canonical name must not raise.
         author_folder = os.path.join(author_folder[:1], author_folder)
         safe_title = title.replace('/', '_').replace(os.sep, '_')
         filename = os.path.join(author_folder, '%s (aaarg %s).pdf' % (safe_title, text_id))
         texts.append({
             'author': author,
             'title': title,
             'id': text_id,
             'filename': filename,
         })
     return texts
 def getTexts():
     """Collect the texts of every library letter page, 'a' through 'z'.

     Calls getTextByLetter once per lowercase ASCII letter and returns the
     concatenation of the resulting lists, in alphabetical page order.
     """
     texts = []
     # string.ascii_lowercase is fixed; string.letters[:26] depended on the
     # current locale and does not exist on Python 3.
     for letter in string.ascii_lowercase:
         texts.extend(getTextByLetter(letter))
     return texts
--- a/oxweb/karagarga.py
+++ b/oxweb/karagarga.py
@@ -1,3 +1,5 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
 import re
 from oxlib import cache
 from oxlib.html import stripTags