diff --git a/ox/web/aaaarg.py b/ox/web/aaaarg.py index 43f9d55..77c193c 100644 --- a/ox/web/aaaarg.py +++ b/ox/web/aaaarg.py @@ -21,7 +21,7 @@ def readUrlUnicode(url, timeout=cache.cache_timeout): def downloadText(id, filename=None): #FIXME, what about the cache, this keeps all pdfs in oxcache... - url='http://a.aaaarg.org/node/%d/download' % id + url='http://aaaaarg.org/node/%d/download' % id data = readUrl(url, timeout=-1) headers = cache.getHeaders(url, timeout=-1) if filename: @@ -32,7 +32,7 @@ def downloadText(id, filename=None): def getTextByLetter(letter): texts = [] - url = 'http://a.aaaarg.org/library/%s' % letter + url = 'http://aaaaarg.org/library/%s' % letter data = readUrlUnicode(url) txts = re.compile('
  • (.*?)
  • (.*?)
  • ').findall(data) author = 'Unknown Author' @@ -53,9 +53,25 @@ def getTextByLetter(letter): }) return texts +def getData(id): + url = "http://aaaaarg.org/node/%s"%id + data=readUrlUnicode(url) + + title = findRe(data, '

    (.*?)

    ') + author = findRe(data, '
    written by (.*?)
    ') + links = re.compile('').findall(data) + + return { + 'aaaaarg': id, + 'links': links, + 'title': title, + 'author': author + } + def getTexts(): texts = [] for letter in string.letters[:26]: texts += getTextByLetter(letter) + texts += getTextByLetter('date') return texts