# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
"""Helpers for listing and downloading texts from a.aaaarg.org.

Every request goes through ``oxlib.cache`` and carries the value stored
under the ``aaaarg.cookie`` key of the local ``auth`` module as its
``Cookie`` header.
"""
import re
import os
import string

from oxlib import cache
from oxlib.html import stripTags, decodeHtml
from oxlib.text import findRe
from oxlib.normalize import canonicalName
import auth


# One library row, captured as (author cell, link to the text, title).
# NOTE(review): the HTML markup inside this pattern was stripped from the
# text under review -- only the capture groups survived, and the literal was
# broken across lines. The markup below is a best-guess reconstruction that
# yields the three groups the code indexes; confirm it against the real
# library page / upstream repository before relying on it.
_LIBRARY_ROW_RE = re.compile(
    '<li class="author">(.*?)</li>'
    '<li class="title"><a href="(.*?)">(.*?)</a></li>')

# Author-cell values that mean "same author as the previous row".
# NOTE(review): the reviewed text showed a single blank character here, which
# may be a rendered non-breaking space; the raw entity and U+00A0 are accepted
# as well -- confirm which form the page really uses.
_SAME_AUTHOR_MARKERS = (' ', u'\xa0', '&nbsp;')


def readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
    """Fetch ``url`` via ``cache.readUrl`` with the aaaarg cookie attached.

    ``headers`` is copied before the ``Cookie`` entry is set, so the mapping
    passed in (including the shared ``cache.DEFAULT_HEADERS`` default) is
    never modified.

    NOTE(review): ``valid`` is accepted but not passed on to
    ``cache.readUrl``; this is unchanged from the original -- confirm whether
    it should be forwarded.
    """
    headers = headers.copy()
    headers["Cookie"] = auth.get("aaaarg.cookie")
    return cache.readUrl(url, data, headers, timeout)


def readUrlUnicode(url, timeout=cache.cache_timeout):
    """Return ``cache.readUrlUnicode(url)`` using the cookie-aware readUrl."""
    return cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)


def downloadText(id, filename=None):
    """Download the file behind ``/node/<id>/download``.

    If ``filename`` is given the data is written to that path and ``None`` is
    returned; otherwise the downloaded data itself is returned.
    """
    #FIXME, what about the cache, this keeps all pdfs in oxcache...
    # %s instead of %d: the ids collected by getTextByLetter() come out of a
    # regex match (presumably strings), and %d raises TypeError for a str.
    # Integer ids format exactly as before.
    url = 'http://a.aaaarg.org/node/%s/download' % id
    data = readUrl(url, timeout=-1)
    if filename:
        # Binary mode: the payload is a PDF, and text mode can corrupt it
        # (newline translation) on some platforms.
        with open(filename, "wb") as f:
            f.write(data)
        return
    return data


def getTextByLetter(letter):
    """List the texts on the library page for ``letter``.

    Returns a list of dicts with the keys ``author``, ``title``, ``id`` and
    ``filename``. ``filename`` is a relative path of the form
    ``<first char of canonical author>/<canonical author>/<title> (aaarg <id>).pdf``.
    A row whose author cell is only a placeholder inherits the author of the
    previous row; until a real author is seen, 'Unknown Author' is used.
    """
    texts = []
    url = 'http://a.aaaarg.org/library/%s' % letter
    data = readUrlUnicode(url)
    author = 'Unknown Author'
    for author_cell, link, raw_title in _LIBRARY_ROW_RE.findall(data):
        if author_cell not in _SAME_AUTHOR_MARKERS:
            author = author_cell
        text_id = findRe(link, r'/(\d+)')
        title = decodeHtml(raw_title)
        author_folder = canonicalName(author)
        # [:1] rather than [0] so an empty canonical name cannot raise
        # IndexError; for non-empty names the result is identical.
        author_folder = os.path.join(author_folder[:1], author_folder)
        filename = os.path.join(author_folder, '%s (aaarg %s).pdf' % (title, text_id))
        texts.append({
            'author': author,
            'title': title,
            'id': text_id,
            'filename': filename,
        })
    return texts


def getTexts():
    """Return the combined getTextByLetter() results for the letters a-z."""
    texts = []
    # string.ascii_lowercase is locale-independent, unlike string.letters[:26].
    for letter in string.ascii_lowercase:
        texts += getTextByLetter(letter)
    return texts


# NOTE(review): the reviewed patch also carried a second hunk, for
# oxweb/karagarga.py (index 8f49656..e976d47), which only prepends the
# "# -*- coding: utf-8 -*-" and "# vi:si:et:sw=4:sts=4:ts=4" header comments
# to that file. It is not part of this module.