From 01c368d493ba803bcc42009651c860595d7d329b Mon Sep 17 00:00:00 2001
From: j <0x006A@0x2620.org>
Date: Fri, 19 Nov 2010 19:57:55 +0100
Subject: [PATCH] cleanup

---
 ox/web/aaaarg.py | 77 ------------------------------------------------
 1 file changed, 77 deletions(-)
 delete mode 100644 ox/web/aaaarg.py

diff --git a/ox/web/aaaarg.py b/ox/web/aaaarg.py
deleted file mode 100644
index 77c193c..0000000
--- a/ox/web/aaaarg.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# -*- coding: utf-8 -*-
-# vi:si:et:sw=4:sts=4:ts=4
-import re
-import os
-import string
-
-from ox import cache
-from ox.html import stripTags, decodeHtml
-from ox.text import findRe
-from ox.normalize import canonicalName
-import auth
-
-
-def readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
-    headers = headers.copy()
-    headers["Cookie"] = auth.get("aaaarg.cookie")
-    return cache.readUrl(url, data, headers, timeout)
-
-def readUrlUnicode(url, timeout=cache.cache_timeout):
-    return cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)
-
-def downloadText(id, filename=None):
-    #FIXME, what about the cache, this keeps all pdfs in oxcache...
-    url='http://aaaaarg.org/node/%d/download' % id
-    data = readUrl(url, timeout=-1)
-    headers = cache.getHeaders(url, timeout=-1)
-    if filename:
-        with open(filename, "w") as f:
-            f.write(data)
-            return
-    return data
-
-def getTextByLetter(letter):
-    texts = []
-    url = 'http://aaaaarg.org/library/%s' % letter
-    data = readUrlUnicode(url)
-    txts = re.compile('<li class="author">(.*?)</li><li class="title"><a href="(.*?)">(.*?)</a></li>').findall(data)
-    author = 'Unknown Author'
-    for r in txts:
-        if r[0] != ' ':
-            author = r[0]
-        link = r[1]
-        id = findRe(link, '/(\d+)')
-        title = decodeHtml(r[2])
-        author_foder = canonicalName(author)
-        author_foder = os.path.join(author_foder[0], author_foder)
-        filename = os.path.join(author_foder, '%s (aaarg %s).pdf' % (title.replace('/', '_'), id))
-        texts.append({
-            'author': author,
-            'title': title,
-            'id': id,
-            'filename': filename,
-        })
-    return texts
-
-def getData(id):
-    url = "http://aaaaarg.org/node/%s"%id
-    data=readUrlUnicode(url)
-
-    title = findRe(data, '<div class="title">(.*?)</div>')
-    author = findRe(data, '<div class="author">written by (.*?)</div>')
-    links = re.compile('<a href="(.*?)">').findall(data)
-
-    return {
-        'aaaaarg': id,
-        'links': links,
-        'title': title,
-        'author': author
-    }
-
-def getTexts():
-    texts = []
-    for letter in string.letters[:26]:
-        texts += getTextByLetter(letter)
-    texts += getTextByLetter('date')
-    return texts
-
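
For reference, the removed module leans on one idiom from ox.cache that is easy to miss in the diff: the site-specific readUrl injects the aaaarg.cookie header, and readUrlUnicode routes the generic cache layer back through that wrapper via the _readUrl keyword, so authenticated fetches still land in the shared cache. Below is a minimal self-contained Python 3 sketch of the same pluggable-reader pattern; the read_url/read_url_with_cookie names and the in-memory _CACHE dict are illustrative stand-ins, not part of ox.cache.

    import time
    import urllib.request

    _CACHE = {}  # url -> (fetched_at, bytes); stand-in for ox's on-disk cache

    def read_url(url, headers=None, timeout=3600, _reader=None):
        # Serve from cache while the entry is younger than `timeout` seconds;
        # a negative timeout means "never expire", as in the timeout=-1 calls
        # in the deleted module.
        entry = _CACHE.get(url)
        if entry and (timeout < 0 or time.time() - entry[0] < timeout):
            return entry[1]
        if _reader is None:
            def _reader(u):
                req = urllib.request.Request(u, headers=headers or {})
                return urllib.request.urlopen(req).read()
        data = _reader(url)
        _CACHE[url] = (time.time(), data)
        return data

    def read_url_with_cookie(url, cookie, timeout=3600):
        # Site-specific wrapper: same cache, but cache misses send the cookie.
        def reader(u):
            req = urllib.request.Request(u, headers={"Cookie": cookie})
            return urllib.request.urlopen(req).read()
        return read_url(url, timeout=timeout, _reader=reader)

The point of the injection is that expiry, storage, and deduplication stay in one place; a caller that needs authentication swaps in a different reader rather than a different cache.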