# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
"""Helpers for listing and downloading texts from a.aaaarg.org."""
import re
import os
import string

from ox import cache
from ox.html import stripTags, decodeHtml
from ox.text import findRe
from ox.normalize import canonicalName

import auth


def readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
    """Fetch *url* through ``cache.readUrl`` with the aaaarg cookie attached.

    The caller's *headers* mapping is copied before the ``Cookie`` entry is
    set, so the shared default headers are never mutated.

    NOTE(review): *valid* is accepted but not forwarded to ``cache.readUrl``;
    confirm whether it should be passed through.
    """
    headers = headers.copy()
    headers["Cookie"] = auth.get("aaaarg.cookie")
    return cache.readUrl(url, data, headers, timeout)


def readUrlUnicode(url, timeout=cache.cache_timeout):
    """Fetch *url* via ``cache.readUrlUnicode``, using the cookie-aware ``readUrl``."""
    return cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)


def downloadText(id, filename=None):
    """Download the file attached to node *id*.

    *id* may be an integer or a string of digits (``getTextByLetter`` yields
    string ids). If *filename* is given the data is written to that path and
    ``None`` is returned; otherwise the downloaded data is returned.
    """
    # FIXME, what about the cache, this keeps all pdfs in oxcache...
    url = 'http://a.aaaarg.org/node/%d/download' % int(id)
    data = readUrl(url, timeout=-1)
    # NOTE(review): the returned headers are not used anywhere; the call is
    # kept in case it is needed for its effect on the cache -- confirm.
    cache.getHeaders(url, timeout=-1)
    if filename:
        # Binary mode: the payload is a PDF, not text.
        with open(filename, "wb") as f:
            f.write(data)
        return
    return data


def getTextByLetter(letter):
    """Return the texts listed on the library page for *letter*.

    Each entry is a dict with the keys ``author``, ``title``, ``id`` and
    ``filename``. ``filename`` is a relative path of the form
    ``<first character of canonical author>/<canonical author>/<title> (aaarg <id>).pdf``
    with any ``/`` in the title replaced by ``_``.
    """
    url = 'http://a.aaaarg.org/library/%s' % letter
    data = readUrlUnicode(url)
    # NOTE(review): this pattern appears to have lost its HTML markup -- the
    # loop below reads three groups (author, link, title) but only two are
    # present. It is kept as found; restore the real list/anchor markup
    # before relying on this function.
    entry_re = re.compile('\n  • (.*?)\n  • (.*?)\n  • ')
    texts = []
    author = 'Unknown Author'
    for match in entry_re.findall(data):
        # A single-space author cell means "same author as the previous entry".
        if match[0] != ' ':
            author = match[0]
        link = match[1]
        text_id = findRe(link, r'/(\d+)')
        title = decodeHtml(match[2])
        author_folder = canonicalName(author)
        # Shard author folders by the first character of the canonical name.
        author_folder = os.path.join(author_folder[0], author_folder)
        filename = os.path.join(
            author_folder,
            '%s (aaarg %s).pdf' % (title.replace('/', '_'), text_id))
        texts.append({
            'author': author,
            'title': title,
            'id': text_id,
            'filename': filename,
        })
    return texts


def getTexts():
    """Return the combined ``getTextByLetter`` results for the letters a-z."""
    texts = []
    # string.letters is locale-dependent (and gone in Python 3);
    # ascii_lowercase is always exactly 'a'..'z'.
    for letter in string.ascii_lowercase:
        texts.extend(getTextByLetter(letter))
    return texts