openmedialibrary_platform/Shared/lib/python2.7/site-packages/ox/web/gutenbergde.py
2014-05-16 01:20:41 +02:00

92 lines
2.8 KiB
Python

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import lxml.html
import ox
from ox.cache import read_url, cache_timeout
def get_id(url):
    """Extract the book id from a gutenberg.spiegel.de book URL.

    For URLs of the form ``http://gutenberg.spiegel.de/buch/<id>/<page>``
    the second-to-last path segment is returned. A URL that carries only
    the id (no page segment) yields that id, and a trailing slash is
    ignored.

    Returns None when ``url`` does not start with the book prefix or has
    no path segment after it.
    """
    prefix = 'http://gutenberg.spiegel.de/buch/'
    if not url.startswith(prefix):
        return None
    # Drop empty segments so a trailing slash does not shift the index.
    segments = [part for part in url[len(prefix):].split('/') if part]
    if not segments:
        return None
    if len(segments) == 1:
        # No page segment present: the single segment is the id itself,
        # not the literal 'buch' that a blind [-2] lookup would return.
        return segments[0]
    return segments[-2]
def get_url(id):
    """Build the URL of the first page ('/1') of the book with this id."""
    first_page_template = 'http://gutenberg.spiegel.de/buch/%s/1'
    return first_page_template % id
def get_page(url):
    """Fetch ``url`` and return the markup of its 'gutenb' element.

    The document is read through ox.cache, parsed with lxml, and the
    element whose id is 'gutenb' is serialized back to HTML. The
    ' id="gutenb"' attribute text is removed from the serialized result.
    """
    raw = ox.cache.read_url(url)
    body = lxml.html.document_fromstring(raw).body
    markup = lxml.html.tostring(body.get_element_by_id('gutenb'))
    return markup.replace(' id="gutenb"', '')
def get_images(page, html, b=False):
    """Collect the image URLs referenced by a page fragment.

    page -- HTML fragment that is searched for ``<img ... src="...">`` tags.
    html -- HTML of the full document; its ``<base href="...">`` value is
            used as the prefix for relative image sources.
    b    -- when true, the base is returned together with the image list.

    Returns a list of image URLs, or a ``(base, images)`` tuple when ``b``
    is true. When ``page`` contains no ``<img`` the base is '' and the
    list is empty.
    """
    images = []
    base = ''
    if '<img' in page:
        base = ox.find_re(html, '<base href="(.*?)"')
        # A src that carries its own scheme, or is protocol-relative, is
        # already complete; prefixing the base would produce a broken URL.
        is_absolute = re.compile(r'(?:[a-zA-Z][a-zA-Z0-9+.-]*:|//)').match
        for src in re.compile('<img.*?src="(.*?)"').findall(page):
            images.append(src if is_absolute(src) else base + src)
    if b:
        return base, images
    return images
def get_book(id):
    """Fetch a book page by page and return its content and metadata.

    ``id`` is either a book id, which is turned into a URL with get_url(),
    or a string starting with 'http', which is used as the URL directly.

    Returns a dict with these entries:
      'url'    -- URL of the first page
      'base'   -- base href reported by get_images() for the first page
      'images' -- image URLs collected from all pages
      'pages'  -- list with the get_page() markup of every page
    plus one entry per two-cell row of the first ``<table>`` in the first
    page, keyed by the tag-stripped first cell.
    """
    site = 'http://gutenberg.spiegel.de'
    # Anchors floated to the right that point below /buch/ -- presumably
    # the "next page" navigation link (not verifiable from this file).
    next_link = re.compile('<a style="float: right;" href="(/buch/.*?)">')

    if isinstance(id, basestring) and id.startswith('http'):
        url = id
    else:
        url = get_url(id)
    html = ox.cache.read_url(url, unicode=True)

    data = {}
    data['url'] = url
    page = get_page(url)
    pages = [page]
    data['base'], data['images'] = get_images(page, html, True)

    info = ox.find_re(html, '<table>.*?</table>')
    for row in re.compile('<tr.*?>(.*?)</tr>').findall(info):
        cells = row.split('</td><td>')
        if len(cells) != 2:
            # Not a plain key/value row; unpacking it would raise
            # ValueError and abort the whole book.
            continue
        key, value = cells
        data[ox.strip_tags(key)] = ox.strip_tags(value)

    # Remember fetched URLs so that a link repeated on a page is not
    # stored twice and a link cycle cannot keep the loop running forever.
    visited = set([url])
    links = next_link.findall(html)
    while links:
        following = []
        for link in links:
            link = site + link
            if link in visited:
                continue
            visited.add(link)
            html = ox.cache.read_url(link)
            # Only the links of the most recently fetched page are followed.
            following = next_link.findall(html)
            page = get_page(link)
            pages.append(page)
            data['images'] += get_images(page, html)
        links = following
    data['pages'] = pages
    return data
def get_authors():
    """Return the authors linked from the site's author index page.

    The result maps the numeric author id (as a string of digits taken
    from the ``/autor/<id>`` href) to the text of the corresponding link.
    If an id occurs more than once, the last link text wins.
    """
    url = "http://gutenberg.spiegel.de/autor"
    data = ox.cache.read_url(url, unicode=True)
    # Raw string: '\d' in a plain literal is an invalid string escape.
    author_link = re.compile(r'<a href="/autor/(\d+)">(.*?)</a>')
    return dict(author_link.findall(data))
def get_books(author_id=None):
    """Return a dict of books mapped to their link text.

    Without ``author_id`` the general book index is read; keys are the
    second-to-last path segment of each ``/buch...`` href.

    With ``author_id`` that author's page is read; keys are absolute
    URLs built from the ``.xml`` hrefs found there, with any leading
    '../' components removed. Note that the key format therefore differs
    between the two modes.
    """
    books = {}
    if not author_id:
        url = "http://gutenberg.spiegel.de/buch"
        data = ox.cache.read_url(url, unicode=True)
        book_link = re.compile('<a href="(/buch.+?)">(.*?)</a>')
        for link, title in book_link.findall(data):
            books[link.split('/')[-2]] = title
    else:
        url = "http://gutenberg.spiegel.de/autor/%s" % author_id
        data = ox.cache.read_url(url, unicode=True)
        # [^"]+ keeps the match inside a single href value (a greedy '.+'
        # can swallow several anchors on one line), and the dot before
        # 'xml' is escaped so it only matches a literal '.'.
        xml_link = re.compile(r'<a href="([^"]+\.xml)">(.*?)</a>')
        for link, title in xml_link.findall(data):
            while link.startswith('../'):
                link = link[3:]
            books['http://gutenberg.spiegel.de/' + link] = title
    return books
def get_ids():
    """Return the keys of the full book listing produced by get_books()."""
    all_books = get_books()
    return all_books.keys()