add aaarg
This commit is contained in:
parent
9899cf89a1
commit
686bf450e4
2 changed files with 63 additions and 0 deletions
61
oxweb/aaaarg.py
Normal file
61
oxweb/aaaarg.py
Normal file
|
@ -0,0 +1,61 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
import string
|
||||||
|
|
||||||
|
from oxlib import cache
|
||||||
|
from oxlib.html import stripTags, decodeHtml
|
||||||
|
from oxlib.text import findRe
|
||||||
|
from oxlib.normalize import canonicalName
|
||||||
|
import auth
|
||||||
|
|
||||||
|
|
||||||
|
def readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
    """Fetch `url` through the oxlib cache with the aaaarg cookie attached.

    url     - url to load
    data    - passed through to cache.readUrl unchanged
    headers - base request headers; copied, never modified in place
    timeout - passed through to cache.readUrl unchanged
    valid   - accepted but not forwarded to cache.readUrl
              NOTE(review): looks like an omission -- confirm whether
              cache.readUrl takes a `valid` argument before wiring it up.
    """
    # Work on a copy so the caller's dict (or the shared default) stays clean.
    cookie_headers = headers.copy()
    # presumably the stored session cookie for aaaarg.org; see the auth module
    session_cookie = auth.get("aaaarg.cookie")
    cookie_headers["Cookie"] = session_cookie
    return cache.readUrl(url, data, cookie_headers, timeout)
|
||||||
|
|
||||||
|
def readUrlUnicode(url, timeout=cache.cache_timeout):
    """Unicode variant of readUrl.

    Delegates to cache.readUrlUnicode, handing it this module's readUrl as
    the fetch function so the request carries the aaaarg cookie.
    """
    fetch_unicode = cache.readUrlUnicode
    return fetch_unicode(url, timeout=timeout, _readUrl=readUrl)
|
||||||
|
|
||||||
|
def downloadText(id, filename=None):
    """Download the file attached to an aaaarg.org node.

    id       - node id, either an int or a numeric string (such as the 'id'
               values produced by getTextByLetter)
    filename - optional path; when given, the downloaded data is written to
               it and None is returned

    Returns the downloaded data when no filename is given.
    Raises ValueError if `id` cannot be converted to an integer.
    """
    #FIXME, what about the cache, this keeps all pdfs in oxcache...
    # int() lets callers pass the id as a string; a bare '%d' rejects str.
    url = 'http://a.aaaarg.org/node/%d/download' % int(id)
    data = readUrl(url, timeout=-1)
    # NOTE(review): the returned headers were never used by this function.
    # The call is kept in case it is relied on for its effect on the cache;
    # confirm and drop it if it is not needed.
    cache.getHeaders(url, timeout=-1)
    if filename:
        # Binary mode: the payload is a PDF, and text mode would translate
        # newline bytes on some platforms and corrupt the file.
        with open(filename, "wb") as f:
            f.write(data)
        return
    return data
|
||||||
|
|
||||||
|
def getTextByLetter(letter):
    """Return the texts listed on the aaaarg.org library page for `letter`.

    letter - value inserted into the library url (getTexts passes a-z)

    Returns a list of dicts with the keys 'author', 'title', 'id' and
    'filename'. 'filename' is a relative path of the form
    <initial>/<canonical author>/<title> (aaarg <id>).pdf
    """
    texts = []
    url = 'http://a.aaaarg.org/library/%s' % letter
    data = readUrlUnicode(url)
    # Raw strings keep the regex escapes from being read as string escapes.
    row_re = re.compile(r'<li class="author">(.*?)</li><li class="title"><a href="(.*?)">(.*?)</a></li>')
    author = 'Unknown Author'
    for author_cell, link, raw_title in row_re.findall(data):
        # A blank author cell (empty, whitespace or &nbsp;) keeps the author
        # of the previous row; anything else starts a new author.
        if author_cell.replace('&nbsp;', ' ').strip():
            author = author_cell
        text_id = findRe(link, r'/(\d+)')
        title = decodeHtml(raw_title)
        # The strings come from scraped HTML: a '/' in them must not be able
        # to add extra path components to the filename.
        author_folder = canonicalName(author).replace('/', '_')
        # [:1] instead of [0] so an empty canonical name cannot raise.
        author_folder = os.path.join(author_folder[:1], author_folder)
        safe_title = title.replace('/', '_')
        filename = os.path.join(author_folder, '%s (aaarg %s).pdf' % (safe_title, text_id))
        texts.append({
            'author': author,
            'title': title,
            'id': text_id,
            'filename': filename,
        })
    return texts
|
||||||
|
|
||||||
|
def getTexts():
    """Return the combined getTextByLetter listings for the letters a-z."""
    texts = []
    # string.letters is locale-dependent (and absent in Python 3), so its
    # first 26 characters are not guaranteed to be a-z; ascii_lowercase is.
    for letter in string.ascii_lowercase:
        texts += getTextByLetter(letter)
    return texts
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
import re
|
import re
|
||||||
from oxlib import cache
|
from oxlib import cache
|
||||||
from oxlib.html import stripTags
|
from oxlib.html import stripTags
|
||||||
|
|
Loading…
Reference in a new issue