# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import lxml.html
import ox
from ox.cache import read_url, cache_timeout
def get_id(url):
    """Extract the book id from a gutenberg.spiegel.de book URL.

    The id is the path segment that directly follows
    ``http://gutenberg.spiegel.de/buch/``, so the result no longer depends
    on how many segments (page number, trailing slash, nothing at all)
    come after it.

    Returns the id string, or ``None`` when *url* is not a book URL or
    carries no id after the prefix.
    """
    prefix = 'http://gutenberg.spiegel.de/buch/'
    if not url.startswith(prefix):
        return None
    # Taking the first segment after the prefix (instead of the
    # second-to-last segment of the whole URL) keeps '.../buch/<id>' from
    # being reported as 'buch'.
    book_id = url[len(prefix):].split('/')[0]
    return book_id or None
def get_url(id):
    """Build the ``http://gutenberg.spiegel.de/buch/<id>/1`` URL for *id*."""
    template = 'http://gutenberg.spiegel.de/buch/%s/1'
    return template % id
def get_page(url):
    """Fetch *url* and return the serialized markup of its ``gutenb`` element.

    The document is read through ``ox.cache.read_url`` and parsed with
    lxml; the element whose id is ``gutenb`` is serialized back to HTML and
    every occurrence of the text `` id="gutenb"`` is removed from the
    result.
    """
    document = lxml.html.document_fromstring(ox.cache.read_url(url))
    container = document.body.get_element_by_id('gutenb')
    markup = lxml.html.tostring(container)
    return markup.replace(' id="gutenb"', '')
# NOTE(review): this span looks damaged and should be restored from the
# upstream file before any code change is attempted:
#   - the `if '.*?')` line below is not valid Python (unbalanced
#     parenthesis, no colon);
#   - `info`, `data` and `pages` are used but never assigned in the
#     visible lines;
#   - several pattern/separator literals are empty strings or are split
#     across two lines, so their intended text cannot be read from here;
#   - the span starts as `get_images` but ends with `return data` and itself
#     calls `get_images(page, html)`, so the end of `get_images` and the
#     header of a second function are presumably missing -- TODO confirm.
# Indentation is also absent, so the loop nesting shown is unknown.
def get_images(page, html, b=False):
img = []
base = ''
if '.*?')
for i in re.compile('
(.*?)').findall(info):
key, value = i.split('')
data[ox.strip_tags(key)] = ox.strip_tags(value)
# Each link found is prefixed with the site root, fetched, searched for
# further links, and its page content and images are collected.
links = re.compile('').findall(html)
while links:
for l in links:
l = 'http://gutenberg.spiegel.de' + l
html = ox.cache.read_url(l)
links = re.compile('').findall(html)
page = get_page(l)
pages.append(page)
data['images'] += get_images(page, html)
data['pages'] = pages
return data
def get_authors():
    """Return a dict built from the matches on the ``/autor`` listing page.

    The page is fetched via ``ox.cache.read_url`` (as unicode); every regex
    match is unpacked into a pair whose first item becomes the key and
    whose second item becomes the value.
    """
    # NOTE(review): the pattern has a single capture group, yet each match
    # is unpacked as a pair -- part of the pattern text appears to be
    # missing; confirm against the upstream file.
    matcher = re.compile('(.*?)')
    listing = ox.cache.read_url("http://gutenberg.spiegel.de/autor", unicode=True)
    return {link: name for link, name in matcher.findall(listing)}
def get_books(author_id=None):
    """Return a dict of books scraped from gutenberg.spiegel.de.

    Without *author_id* the ``/buch`` listing is read and each matched
    link is reduced to its second-to-last ``/``-separated segment, which
    becomes the key.  With *author_id* the ``/autor/<author_id>`` page is
    read; each matched link has its leading ``../`` prefixes removed and
    is prefixed with the site root to form the key.  In both cases the
    value is the second item of the match.
    """
    # NOTE(review): both patterns below have a single capture group, yet
    # each match is unpacked as a pair -- part of the pattern text appears
    # to be missing; confirm against the upstream file.
    if not author_id:
        listing = ox.cache.read_url("http://gutenberg.spiegel.de/buch", unicode=True)
        return {
            link.split('/')[-2]: title
            for link, title in re.compile('(.*?)').findall(listing)
        }

    author_url = "http://gutenberg.spiegel.de/autor/%s" % author_id
    listing = ox.cache.read_url(author_url, unicode=True)
    found = {}
    for link, title in re.compile('(.*?)').findall(listing):
        # Drop every leading '../' so the link can be made absolute.
        while link.startswith('../'):
            link = link[3:]
        found['http://gutenberg.spiegel.de/' + link] = title
    return found
def get_ids():
    """Return the keys of the dict produced by ``get_books()`` called without an author."""
    catalogue = get_books()
    return catalogue.keys()
|