92 lines
2.8 KiB
Python
92 lines
2.8 KiB
Python
# -*- coding: utf-8 -*-
|
|
# vi:si:et:sw=4:sts=4:ts=4
|
|
import re
|
|
|
|
import lxml.html
|
|
|
|
import ox
|
|
from ox.cache import read_url, cache_timeout
|
|
|
|
|
|
def get_id(url):
    """Extract the book id from a gutenberg.spiegel.de book URL.

    Book URLs have the form ``http://gutenberg.spiegel.de/buch/<id>/<page>``.
    The id is the second-to-last path segment; when the URL carries no page
    segment (``.../buch/<id>``) the single segment after ``/buch/`` is used.

    Returns None when ``url`` is not a book URL or contains no id.
    """
    prefix = 'http://gutenberg.spiegel.de/buch/'
    if not url.startswith(prefix):
        return None
    # Only look at what follows the prefix so that 'buch' itself can never be
    # mistaken for the id when the page segment is missing.
    segments = url[len(prefix):].split('/')
    if len(segments) >= 2:
        return segments[-2]
    return segments[0] or None
|
|
|
|
def get_url(id):
    """Return the URL of page 1 of the book identified by ``id``."""
    first_page_template = 'http://gutenberg.spiegel.de/buch/%s/1'
    return first_page_template % id
|
|
|
|
def get_page(url):
    """Fetch ``url`` and return the markup of its ``gutenb`` element.

    The page is read through ``ox.cache.read_url`` and parsed with lxml.
    The element whose id is ``gutenb`` is serialized back to HTML, and the
    text `` id="gutenb"`` is removed from the serialized result.
    """
    document = lxml.html.document_fromstring(ox.cache.read_url(url))
    content = document.body.get_element_by_id('gutenb')
    markup = lxml.html.tostring(content)
    return markup.replace(' id="gutenb"', '')
|
|
|
|
def get_images(page, html, b=False):
    """Return the image URLs referenced by ``page``.

    ``page`` is searched for ``<img ... src="...">`` tags. When it contains
    any ``<img``, the base is taken from the ``<base href="...">`` lookup in
    ``html`` (via ``ox.find_re``) and prepended to every ``src`` value.

    Returns the list of URLs, or the tuple ``(base, urls)`` when ``b`` is
    true. Without images the list is empty and the base is ``''``.
    """
    base = ''
    img = []
    if '<img' in page:
        base = ox.find_re(html, '<base href="(.*?)"')
        img = [base + src for src in re.findall('<img.*?src="(.*?)"', page)]
    if not b:
        return img
    return base, img
|
|
|
|
def get_book(id):
    """Collect metadata, page markup and image URLs for one book.

    ``id`` is either a book id, which is expanded with ``get_url``, or a
    string starting with ``http``, which is used as the URL unchanged.

    Returns a dict containing:

    - ``url``: the URL of the first page;
    - ``base`` and ``images``: the pair returned by ``get_images`` for the
      first page, with the images of every following page appended;
    - ``pages``: the ``get_page`` result of every page, in fetch order;
    - one entry per two-cell row of the ``<table>...</table>`` match found
      in the first page, with tags stripped from both key and value.
    """
    next_link = re.compile('<a style="float: right;" href="(/buch/.*?)">')
    table_row = re.compile('<tr.*?>(.*?)</tr>')

    if isinstance(id, basestring) and id.startswith('http'):
        url = id
    else:
        url = get_url(id)
    html = ox.cache.read_url(url, unicode=True)
    data = {'url': url}
    page = get_page(url)
    pages = [page]
    data['base'], data['images'] = get_images(page, html, True)

    info = ox.find_re(html, '<table>.*?</table>')
    for row in table_row.findall(info):
        cells = row.split('</td><td>')
        if len(cells) != 2:
            # Rows that are not a plain key/value pair (header rows, merged
            # or extra cells) carry no metadata entry; skip them instead of
            # failing the whole book with a ValueError on unpacking.
            continue
        key, value = cells
        data[ox.strip_tags(key)] = ox.strip_tags(value)

    # Follow the right-floated /buch/ links page by page until a fetched
    # page no longer contains one.
    links = next_link.findall(html)
    while links:
        for link in links:
            link = 'http://gutenberg.spiegel.de' + link
            html = ox.cache.read_url(link)
            # Rebinding ``links`` does not disturb the running ``for``; it
            # only decides what the enclosing ``while`` sees next.
            links = next_link.findall(html)
            page = get_page(link)
            pages.append(page)
            data['images'] += get_images(page, html)
    data['pages'] = pages
    return data
|
|
|
|
def get_authors():
    """Return the authors listed on the gutenberg.spiegel.de author index.

    The index page is read through ``ox.cache.read_url``. The result maps
    each numeric author id (as a string of digits taken from the
    ``/autor/<digits>`` link) to the text of that link. When an id occurs
    more than once, the last occurrence wins.
    """
    url = "http://gutenberg.spiegel.de/autor"
    data = ox.cache.read_url(url, unicode=True)
    # Raw string: ``\d`` must reach the regex engine untouched and is an
    # invalid escape sequence in a plain string literal.
    author_link = re.compile(r'<a href="/autor/(\d+)">(.*?)</a>')
    return dict(author_link.findall(data))
|
|
|
|
def get_books(author_id=None):
    """Return a dict of books, either for the whole site or for one author.

    Without ``author_id`` (or with a falsy one) the ``/buch`` index page is
    read; keys are the second-to-last path segment of every ``/buch...``
    link and values are the link texts.

    With ``author_id`` the ``/autor/<author_id>`` page is read; keys are the
    absolute ``http://gutenberg.spiegel.de/...`` URLs of every link ending
    in ``.xml`` (leading ``../`` components removed) and values are the link
    texts.
    """
    books = {}
    if not author_id:
        url = "http://gutenberg.spiegel.de/buch"
        data = ox.cache.read_url(url, unicode=True)
        for link, title in re.findall('<a href="(/buch.+?)">(.*?)</a>', data):
            books[link.split('/')[-2]] = title
    else:
        url = "http://gutenberg.spiegel.de/autor/%s" % author_id
        data = ox.cache.read_url(url, unicode=True)
        # ``[^"]+`` keeps the match inside a single href attribute and the
        # escaped dot requires a literal ".xml" suffix; a greedy ``.+.xml``
        # could swallow several anchors that share one line.
        xml_link = re.compile(r'<a href="([^"]+\.xml)">(.*?)</a>')
        for link, title in xml_link.findall(data):
            while link.startswith('../'):
                link = link[3:]
            books['http://gutenberg.spiegel.de/' + link] = title
    return books
|
|
|
|
def get_ids():
    """Return the keys of the mapping produced by ``get_books()``."""
    all_books = get_books()
    return all_books.keys()
|