python-oxweb/oxweb/aaaarg.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import os
import string

from oxlib import cache
from oxlib.html import stripTags, decodeHtml
from oxlib.text import findRe
from oxlib.normalize import canonicalName
import auth


def readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
    """Fetch ``url`` via ``cache.readUrl`` with the aaaarg session cookie set.

    ``headers`` is copied before the ``Cookie`` entry is added, so the
    mapping passed in (including the shared default) is left untouched.
    The cookie value comes from ``auth.get("aaaarg.cookie")``.

    Note: ``valid`` is accepted but is not forwarded to ``cache.readUrl``.
    """
    # Work on a private copy so the caller's / default headers stay clean.
    request_headers = headers.copy()
    request_headers["Cookie"] = auth.get("aaaarg.cookie")
    return cache.readUrl(url, data, request_headers, timeout)

def readUrlUnicode(url, timeout=cache.cache_timeout):
    """Return the result of ``cache.readUrlUnicode`` for ``url``.

    The cookie-aware ``readUrl`` of this module is handed over as the
    ``_readUrl`` callback, and ``timeout`` is passed through unchanged.
    """
    text = cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)
    return text

def downloadText(id, filename=None):
    """Download the file attached to the aaaarg node ``id``.

    ``id`` may be an integer or a numeric string (the ids extracted from
    library links are strings); it is converted with ``int()`` before being
    placed into the download URL, so a non-numeric value raises
    ``ValueError``.

    If ``filename`` is given, the downloaded data is written to that path
    and ``None`` is returned; otherwise the data itself is returned.
    """
    #FIXME, what about the cache, this keeps all pdfs in oxcache...
    url = 'http://a.aaaarg.org/node/%d/download' % int(id)
    data = readUrl(url, timeout=-1)
    if not filename:
        return data
    # The payload is a binary document: write it in binary mode so no
    # newline translation or text encoding is applied to it.
    with open(filename, "wb") as f:
        f.write(data)

def getTextByLetter(letter):
    """Return the texts listed on the aaaarg library page for ``letter``.

    The page ``http://a.aaaarg.org/library/<letter>`` is fetched and scanned
    for author/title list items.  An author cell containing ``&nbsp;`` means
    the entry belongs to the most recently seen author; until an author has
    been seen, ``'Unknown Author'`` is used.

    Each result is a dict with the keys:

    * ``author``   -- author text as found on the page
    * ``title``    -- title with HTML entities decoded
    * ``id``       -- the digits found after a ``/`` in the title link
    * ``filename`` -- relative path
      ``<first char of canonical author>/<canonical author>/<title> (aaarg <id>).pdf``
      with ``/`` in the title replaced by ``_``
    """
    url = 'http://a.aaaarg.org/library/%s' % letter
    data = readUrlUnicode(url)
    entry_re = re.compile(
        r'<li class="author">(.*?)</li><li class="title"><a href="(.*?)">(.*?)</a></li>')

    texts = []
    author = 'Unknown Author'
    for author_cell, link, raw_title in entry_re.findall(data):
        # A blank (&nbsp;) author cell continues the previous author's list.
        if author_cell != '&nbsp;':
            author = author_cell
        text_id = findRe(link, r'/(\d+)')
        title = decodeHtml(raw_title)

        # Files are grouped as <initial>/<canonical author name>/.
        canonical_author = canonicalName(author)
        author_folder = os.path.join(canonical_author[0], canonical_author)
        # A '/' in the title would otherwise create an extra directory level.
        pdf_name = '%s (aaarg %s).pdf' % (title.replace('/', '_'), text_id)
        filename = os.path.join(author_folder, pdf_name)

        texts.append({
            'author': author,
            'title': title,
            'id': text_id,
            'filename': filename,
        })
    return texts

def getTexts():
    """Return the texts of all library pages from ``a`` to ``z``.

    Calls ``getTextByLetter`` once per lowercase ASCII letter and
    concatenates the results in alphabetical page order.
    """
    texts = []
    # ascii_lowercase is locale-independent, unlike string.letters, whose
    # ordering (and therefore its first 26 characters) can change once a
    # locale has been set.
    for letter in string.ascii_lowercase:
        texts.extend(getTextByLetter(letter))
    return texts
add aaarg 2010-02-15 07:08:42 +00:00			`# -*- coding: utf-8 -*-`
			`# vi:si:et:sw=4:sts=4:ts=4`
			`import re`
			`import os`
			`import string`

			`from oxlib import cache`
			`from oxlib.html import stripTags, decodeHtml`
			`from oxlib.text import findRe`
			`from oxlib.normalize import canonicalName`
			`import auth`


			`def readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):`
			`headers = headers.copy()`
			`headers["Cookie"] = auth.get("aaaarg.cookie")`
			`return cache.readUrl(url, data, headers, timeout)`

			`def readUrlUnicode(url, timeout=cache.cache_timeout):`
			`return cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)`

			`def downloadText(id, filename=None):`
			`#FIXME, what about the cache, this keeps all pdfs in oxcache...`
			`url='http://a.aaaarg.org/node/%d/download' % id`
			`data = readUrl(url, timeout=-1)`
			`headers = cache.getHeaders(url, timeout=-1)`
			`if filename:`
			`with open(filename, "w") as f:`
			`f.write(data)`
			`return`
			`return data`

			`def getTextByLetter(letter):`
			`texts = []`
			`url = 'http://a.aaaarg.org/library/%s' % letter`
			`data = readUrlUnicode(url)`
			`txts = re.compile('<li class="author">(.*?)</li><li class="title"><a href="(.*?)">(.*?)</a></li>').findall(data)`
			`author = 'Unknown Author'`
			`for r in txts:`
			`if r[0] != '&nbsp;':`
			`author = r[0]`
			`link = r[1]`
			`id = findRe(link, '/(\d+)')`
			`title = decodeHtml(r[2])`
			`author_foder = canonicalName(author)`
			`author_foder = os.path.join(author_foder[0], author_foder)`
no / in filename 2010-02-27 11:48:49 +00:00			`filename = os.path.join(author_foder, '%s (aaarg %s).pdf' % (title.replace('/', '_'), id))`
add aaarg 2010-02-15 07:08:42 +00:00			`texts.append({`
			`'author': author,`
			`'title': title,`
			`'id': id,`
			`'filename': filename,`
			`})`
			`return texts`

			`def getTexts():`
			`texts = []`
			`for letter in string.letters[:26]:`
			`texts += getTextByLetter(letter)`
			`return texts`