openmedialibrary_platform/Shared/lib/python2.7/site-packages/ox/web/gutenbergde.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re

import lxml.html

import ox
from ox.cache import read_url, cache_timeout


def get_id(url):
    """Extract the book id from a gutenberg.spiegel.de book URL.

    The id is the second-to-last '/'-separated segment of ``url``.
    Returns None when ``url`` does not start with
    'http://gutenberg.spiegel.de/buch/'.
    """
    prefix = 'http://gutenberg.spiegel.de/buch/'
    if not url.startswith(prefix):
        return None
    segments = url.split('/')
    return segments[-2]

def get_url(id):
    """Build the book URL 'http://gutenberg.spiegel.de/buch/<id>/1' for ``id``."""
    template = 'http://gutenberg.spiegel.de/buch/%s/1'
    return template % id

def get_page(url):
    """Return the serialized 'gutenb' element of the page at ``url``.

    The page is read through ox.cache.read_url, parsed with lxml, and the
    element whose id is 'gutenb' is serialized with lxml.html.tostring.
    Every occurrence of the text ' id="gutenb"' is removed from the result.
    """
    document = lxml.html.document_fromstring(ox.cache.read_url(url))
    content = document.body.get_element_by_id('gutenb')
    markup = lxml.html.tostring(content)
    return markup.replace(' id="gutenb"', '')

def get_images(page, html, b=False):
    """Collect the image URLs referenced by <img> tags in ``page``.

    When ``page`` contains '<img', the <base href="..."> value is looked up
    in ``html`` with ox.find_re and prepended to every img src found in
    ``page``. Otherwise the base is '' and the image list is empty.

    Returns the list of image URLs, or a ``(base, images)`` tuple when
    ``b`` is true.
    """
    base = ''
    images = []
    if '<img' in page:
        base = ox.find_re(html, '<base href="(.*?)"')
        images = [base + src
                  for src in re.findall('<img.*?src="(.*?)"', page)]
    return (base, images) if b else images

def get_book(id):
    """Fetch a book and all of its pages from gutenberg.spiegel.de.

    ``id`` is either a book id (turned into a URL with get_url) or a string
    starting with 'http', which is used directly as the URL of the first
    page.

    Returns a dict containing:
      'url'    -- URL of the first page
      'base'   -- base href returned by get_images for the first page
      'images' -- image URLs collected from every visited page
      'pages'  -- page HTML as returned by get_page, in visit order
    plus one entry per two-cell row of the first <table> found on the first
    page, with key and value passed through ox.strip_tags.
    """
    if isinstance(id, basestring) and id.startswith('http'):
        url = id
    else:
        url = get_url(id)
    html = ox.cache.read_url(url, unicode=True)
    data = {}
    data['url'] = url
    page = get_page(url)
    pages = [page]
    data['base'], data['images'] = get_images(page, html, True)

    info = ox.find_re(html, '<table>.*?</table>')
    for row in re.compile('<tr.*?>(.*?)</tr>').findall(info):
        cells = row.split('</td><td>')
        # A row that is not exactly "key</td><td>value" would raise
        # ValueError on unpacking and abort the whole download; skip it.
        if len(cells) != 2:
            continue
        key, value = cells
        data[ox.strip_tags(key)] = ox.strip_tags(value)

    next_link = re.compile('<a style="float: right;" href="(/buch/.*?)">')
    # Each link is visited at most once: a next-page link repeated on one
    # page no longer yields duplicate pages, and a cycle of links cannot
    # keep the loop running forever.
    seen = {url}
    pending = next_link.findall(html)
    while pending:
        link = 'http://gutenberg.spiegel.de' + pending.pop(0)
        if link in seen:
            continue
        seen.add(link)
        html = ox.cache.read_url(link)
        pending.extend(next_link.findall(html))
        page = get_page(link)
        pages.append(page)
        data['images'] += get_images(page, html)
    data['pages'] = pages
    return data

def get_authors():
    """Return a dict mapping author id to author name.

    Reads http://gutenberg.spiegel.de/autor and collects every
    '<a href="/autor/<digits>">...</a>' link. Keys are the digit strings
    taken from the href; values are the raw link contents (tags are not
    stripped). If an id occurs more than once, the last occurrence wins.
    """
    url = "http://gutenberg.spiegel.de/autor"
    data = ox.cache.read_url(url, unicode=True)
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    author_link = re.compile(r'<a href="/autor/(\d+)">(.*?)</a>')
    # findall yields (id, name) pairs, which dict() consumes directly.
    return dict(author_link.findall(data))

def get_books(author_id=None):
    """Return a dict of books, keyed differently depending on ``author_id``.

    Without ``author_id`` (or with a falsy one) the book index at
    http://gutenberg.spiegel.de/buch is read; keys are the second-to-last
    path segment of each '/buch...' link and values are the link contents.

    With ``author_id`` the page http://gutenberg.spiegel.de/autor/<author_id>
    is read; keys are the hrefs ending in '.xml', with leading '../'
    segments removed and 'http://gutenberg.spiegel.de/' prepended, and
    values are the link contents.
    """
    books = {}
    if not author_id:
        url = "http://gutenberg.spiegel.de/buch"
        data = ox.cache.read_url(url, unicode=True)
        book_link = re.compile('<a href="(/buch.+?)">(.*?)</a>')
        for link, title in book_link.findall(data):
            books[link.split('/')[-2]] = title
    else:
        url = "http://gutenberg.spiegel.de/autor/%s" % author_id
        data = ox.cache.read_url(url, unicode=True)
        # [^"]+ keeps the match inside a single href attribute (a greedy
        # '.+' could swallow several anchors on one line), and the dot
        # before 'xml' is escaped so only a literal '.xml' suffix matches.
        xml_link = re.compile(r'<a href="([^"]+\.xml)">(.*?)</a>')
        for link, title in xml_link.findall(data):
            while link.startswith('../'):
                link = link[3:]
            books['http://gutenberg.spiegel.de/' + link] = title
    return books

def get_ids():
    """Return the keys of the dict that get_books() builds without an author id."""
    books = get_books()
    return books.keys()