# -*- coding: utf-8 -*- # vi:si:et:sw=4:sts=4:ts=4 import re import lxml.html import ox from ox.cache import read_url, cache_timeout def get_id(url): if url.startswith('http://gutenberg.spiegel.de/buch/'): return url.split('/')[-2] def get_url(id): return 'http://gutenberg.spiegel.de/buch/%s/1' % id def get_page(url): data = ox.cache.read_url(url) doc = lxml.html.document_fromstring(data) gutenb = doc.body.get_element_by_id('gutenb') html = lxml.html.tostring(gutenb) return html.replace(' id="gutenb"', '') def get_images(page, html, b=False): img = [] base = '' if '.*?') for i in re.compile('(.*?)').findall(info): key, value = i.split('') data[ox.strip_tags(key)] = ox.strip_tags(value) links = re.compile('').findall(html) while links: for l in links: l = 'http://gutenberg.spiegel.de' + l html = ox.cache.read_url(l) links = re.compile('').findall(html) page = get_page(l) pages.append(page) data['images'] += get_images(page, html) data['pages'] = pages return data def get_authors(): url = "http://gutenberg.spiegel.de/autor" data = ox.cache.read_url(url, unicode=True) authors = {} for l,a in re.compile('(.*?)').findall(data): authors[l] = a return authors def get_books(author_id=None): books = {} if not author_id: url = "http://gutenberg.spiegel.de/buch" data = ox.cache.read_url(url, unicode=True) for l, t in re.compile('(.*?)').findall(data): l = l.split('/')[-2] books[l] = t else: url = "http://gutenberg.spiegel.de/autor/%s" % author_id data = ox.cache.read_url(url, unicode=True) for l,t in re.compile('(.*?)').findall(data): while l.startswith('../'): l = l[3:] l = 'http://gutenberg.spiegel.de/' + l books[l] = t return books def get_ids(): return get_books().keys()
#
# NOTE(review): this module is damaged and needs to be restored from version
# control before it can be used or safely refactored.
#
# 1. All of the code sits on the single physical line above, which begins
#    with '#'. Python therefore treats the whole line as a comment: nothing is
#    imported and none of the functions are defined.
#
# 2. Text is missing between "if '" and ".*?')". The remainder of get_images()
#    and the header and opening statements of the following function (the one
#    that fills `data` and `pages` and returns `data`) are gone; `info`,
#    `data` and `pages` are used there without any visible assignment.
#
# 3. Several string literals look stripped (presumably they held HTML markup;
#    confirm against the original file):
#      - re.compile('') appears twice for `links`; an empty pattern matches at
#        every position, so it cannot select link targets.
#      - i.split('') uses an empty separator, which str.split rejects with
#        ValueError.
#      - re.compile('(.*?)') has one group, so findall() yields plain strings,
#        yet get_authors() and get_books() unpack each result into two names.
#
# What the intact parts visibly do:
#   get_id(url)      -- for URLs starting with
#                       'http://gutenberg.spiegel.de/buch/', returns the
#                       second-to-last '/'-separated component; otherwise
#                       falls through and returns None.
#   get_url(id)      -- formats 'http://gutenberg.spiegel.de/buch/%s/1' with id.
#   get_page(url)    -- reads url via ox.cache.read_url, parses it with
#                       lxml.html, serialises the element whose id is 'gutenb'
#                       and removes the ' id="gutenb"' attribute text.
#   get_authors()    -- reads .../autor and fills a dict from regex matches.
#   get_books(author_id=None)
#                    -- without author_id reads .../buch and keys the result on
#                       the second-to-last path component of each link; with
#                       author_id reads .../autor/<author_id>, strips leading
#                       '../' from each link and prefixes the site root.
#   get_ids()        -- returns get_books().keys().
#
# TODO(review): `read_url` and `cache_timeout` are imported by name, but the
# visible code only calls ox.cache.read_url; check whether the missing span
# used them before removing anything.