# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import os
import string

from ox import cache
from ox.html import stripTags, decodeHtml
from ox.text import findRe
from ox.normalize import canonicalName

import auth


def readUrl(url, data=None, headers=cache.DEFAULT_HEADERS,
            timeout=cache.cache_timeout, valid=None):
    """Fetch *url* through ``cache.readUrl`` with the aaaarg cookie attached.

    The given *headers* are copied before the ``Cookie`` entry is set, so the
    caller's mapping (and the shared ``cache.DEFAULT_HEADERS`` default) is
    never modified.

    url     -- address to fetch
    data    -- passed through to ``cache.readUrl`` unchanged
    headers -- base request headers; a copy gets the ``Cookie`` header added
    timeout -- passed through to ``cache.readUrl`` unchanged
    valid   -- accepted for signature compatibility but currently not used
               NOTE(review): confirm whether it should be forwarded to
               ``cache.readUrl``.

    Returns whatever ``cache.readUrl`` returns.
    """
    headers = headers.copy()
    headers["Cookie"] = auth.get("aaaarg.cookie")
    return cache.readUrl(url, data, headers, timeout)


def readUrlUnicode(url, timeout=cache.cache_timeout):
    """Return the result of ``cache.readUrlUnicode`` for *url*.

    This module's cookie-aware ``readUrl`` is handed over as ``_readUrl`` so
    the request is made with the aaaarg cookie.
    """
    return cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)


def downloadText(id, filename=None):
    """Download the file attached to node *id*.

    id       -- integer node id, formatted into the download URL with ``%d``
    filename -- optional path; when given, the downloaded data is written
                there and ``None`` is returned

    Returns the downloaded data when *filename* is not given, else ``None``.
    """
    #FIXME, what about the cache, this keeps all pdfs in oxcache...
    url = 'http://a.aaaarg.org/node/%d/download' % id
    # timeout=-1 presumably means "cached copy never expires" -- TODO confirm
    # against ox.cache.
    data = readUrl(url, timeout=-1)
    # NOTE(review): the result is never used; the call is kept in case it
    # matters for the cache -- confirm and remove if it does not.
    headers = cache.getHeaders(url, timeout=-1)
    if filename:
        # Binary mode: the payload is raw downloaded data (e.g. a pdf) and
        # must reach the disk without newline translation or text encoding.
        with open(filename, "wb") as f:
            f.write(data)
        return
    return data


# NOTE(review): the definition of getTextByLetter is cut off in the reviewed
# text (it stops in the middle of a string literal). The visible fragment is
# kept below, unchanged apart from line breaks, until the rest is restored:
#
# def getTextByLetter(letter):
#     texts = []
#     url = 'http://a.aaaarg.org/library/%s' % letter
#     data = readUrlUnicode(url)
#     txts = re.compile('