openmedialibrary_platform/Shared/lib/python2.7/site-packages/ox/web/gutenbergde.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re

import lxml.html

import ox
from ox.cache import read_url, cache_timeout


def get_id(url):
    """Extract the book id from a gutenberg.spiegel.de book URL.

    The id is taken to be the second-to-last '/'-separated segment of
    the URL.  Returns None when the URL does not start with the book
    URL prefix.
    """
    prefix = 'http://gutenberg.spiegel.de/buch/'
    if not url.startswith(prefix):
        return None
    segments = url.split('/')
    return segments[-2]

def get_url(id):
    """Return the URL of page 1 of the book with the given id."""
    template = 'http://gutenberg.spiegel.de/buch/%s/1'
    return template % id

def get_page(url):
    """Fetch ``url`` and return the markup of its 'gutenb' element.

    The page is read through ``ox.cache``, parsed with lxml, and the
    element whose id is 'gutenb' is serialized back to HTML.  The
    ``id="gutenb"`` attribute text is removed from the result.
    """
    document = lxml.html.document_fromstring(ox.cache.read_url(url))
    content = document.body.get_element_by_id('gutenb')
    markup = lxml.html.tostring(content)
    return markup.replace(' id="gutenb"', '')

def get_images(page, html, b=False):
    """Collect the image URLs referenced by ``page``.

    ``page`` is the markup searched for ``<img ... src="...">`` tags.
    ``html`` is the document searched for a ``<base href="...">`` value,
    which is prepended to every image source.  The base is only looked
    up when ``page`` contains '<img'; otherwise it stays ''.

    Returns the list of image URLs, or a ``(base, images)`` tuple when
    ``b`` is true.
    """
    base = ''
    images = []
    if '<img' in page:
        base = ox.find_re(html, '<base href="(.*?)"')
        sources = re.findall('<img.*?src="(.*?)"', page)
        images = [base + source for source in sources]
    return (base, images) if b else images

def get_book(id):
    """Fetch a whole book and return its metadata, pages and images.

    ``id`` is either a book id (turned into a URL with ``get_url``) or a
    full URL starting with 'http'.

    Returns a dict containing:
      'url'    -- URL of the first page
      'base'   -- base href reported by ``get_images`` for the first page
      'images' -- image URLs collected from every page
      'pages'  -- markup of every page, in the order they were fetched
    plus one entry per two-cell row of the first ``<table>`` found on
    the first page (tag-stripped key -> tag-stripped value).
    """
    if isinstance(id, basestring) and id.startswith('http'):
        url = id
    else:
        url = get_url(id)
    html = ox.cache.read_url(url, unicode=True)
    data = {}
    data['url'] = url
    page = get_page(url)
    pages = [page]
    data['base'], data['images'] = get_images(page, html, True)

    info = ox.find_re(html, '<table>.*?</table>')
    for row in re.compile('<tr.*?>(.*?)</tr>').findall(info):
        # partition() instead of a two-way unpack of split(): a row
        # without the cell separator is skipped instead of raising
        # ValueError, and extra cells end up in the value.
        key, separator, value = row.partition('</td><td>')
        if not separator:
            continue
        data[ox.strip_tags(key)] = ox.strip_tags(value)

    next_link = re.compile('<a style="float: right;" href="(/buch/.*?)">')
    # Remember every fetched URL so that a "next" link that appears more
    # than once on a page does not add the page twice, and a link
    # pointing back to an earlier page cannot loop forever.
    visited = set([url])
    links = next_link.findall(html)
    while links:
        found, links = links, []
        for path in found:
            link = 'http://gutenberg.spiegel.de' + path
            if link in visited:
                continue
            visited.add(link)
            html = ox.cache.read_url(link)
            # The links of the most recently fetched page drive the
            # next round of the outer loop.
            links = next_link.findall(html)
            page = get_page(link)
            pages.append(page)
            data['images'] += get_images(page, html)
    data['pages'] = pages
    return data

def get_authors():
    """Return a dict mapping author id to author link text.

    The author index page is fetched through ``ox.cache`` and scanned
    for anchors of the form ``<a href="/autor/<digits>">text</a>``.
    Keys are the digit strings, values the anchor contents.
    """
    url = "http://gutenberg.spiegel.de/autor"
    data = ox.cache.read_url(url, unicode=True)
    # Raw string: '\d' is a regex escape, not a Python string escape.
    author_link = re.compile(r'<a href="/autor/(\d+)">(.*?)</a>')
    # findall() yields (id, name) pairs; as in a plain assignment loop,
    # a later pair with the same id replaces an earlier one.
    return dict(author_link.findall(data))

def get_books(author_id=None):
    """Return a dict of books and their link texts.

    Without ``author_id`` the book index page is scanned and the result
    maps the second-to-last path segment of each '/buch...' link to the
    anchor text.

    With ``author_id`` that author's page is scanned for links ending in
    '.xml'; leading '../' segments are removed and the result maps the
    absolute URL of each link to its anchor text.
    """
    books = {}
    if not author_id:
        url = "http://gutenberg.spiegel.de/buch"
        data = ox.cache.read_url(url, unicode=True)
        book_link = re.compile(r'<a href="(/buch.+?)">(.*?)</a>')
        for link, title in book_link.findall(data):
            books[link.split('/')[-2]] = title
    else:
        url = "http://gutenberg.spiegel.de/autor/%s" % author_id
        data = ox.cache.read_url(url, unicode=True)
        # [^"]+ keeps the match inside a single href attribute (a greedy
        # '.+' would swallow everything up to the last '.xml">' on the
        # line), and the dot before 'xml' is escaped so it only matches
        # a literal '.'.
        xml_link = re.compile(r'<a href="([^"]+\.xml)">(.*?)</a>')
        for link, title in xml_link.findall(data):
            while link.startswith('../'):
                link = link[3:]
            books['http://gutenberg.spiegel.de/' + link] = title
    return books

def get_ids():
    """Return the keys of ``get_books()`` called without an author id."""
    books = get_books()
    return books.keys()
Open Media Library Platform 2013-10-11 17:28:32 +00:00			`# -*- coding: utf-8 -*-`
			`# vi:si:et:sw=4:sts=4:ts=4`
			`import re`

			`import lxml.html`

			`import ox`
			`from ox.cache import read_url, cache_timeout`


			`def get_id(url):`
			`if url.startswith('http://gutenberg.spiegel.de/buch/'):`
			`return url.split('/')[-2]`

			`def get_url(id):`
			`return 'http://gutenberg.spiegel.de/buch/%s/1' % id`

			`def get_page(url):`
			`data = ox.cache.read_url(url)`
			`doc = lxml.html.document_fromstring(data)`
			`gutenb = doc.body.get_element_by_id('gutenb')`
			`html = lxml.html.tostring(gutenb)`
			`return html.replace(' id="gutenb"', '')`

			`def get_images(page, html, b=False):`
			`img = []`
			`base = ''`
			`if '<img' in page:`
			`base = ox.find_re(html, '<base href="(.*?)"')`
			`for url in re.compile('<img.*?src="(.*?)"').findall(page):`
			`url = base + url`
			`img.append(url)`
			`if b:`
			`return base, img`
			`return img`

			`def get_book(id):`
			`if isinstance(id, basestring) and id.startswith('http'):`
			`url = id`
			`else:`
			`url = get_url(id)`
			`html = ox.cache.read_url(url, unicode=True)`
			`data = {}`
			`data['url'] = url`
			`pages = []`
			`page = get_page(url)`
			`pages.append(page)`
			`data['base'], data['images'] = get_images(page, html, True)`
			`info = ox.find_re(html, '<table>.*?</table>')`
			`for i in re.compile('<tr.*?>(.*?)</tr>').findall(info):`
			`key, value = i.split('</td><td>')`
			`data[ox.strip_tags(key)] = ox.strip_tags(value)`
			`links = re.compile('<a style="float: right;" href="(/buch/.*?)">').findall(html)`
			`while links:`
			`for l in links:`
			`l = 'http://gutenberg.spiegel.de' + l`
			`html = ox.cache.read_url(l)`
			`links = re.compile('<a style="float: right;" href="(/buch/.*?)">').findall(html)`
			`page = get_page(l)`
			`pages.append(page)`
			`data['images'] += get_images(page, html)`
			`data['pages'] = pages`
			`return data`

			`def get_authors():`
			`url = "http://gutenberg.spiegel.de/autor"`
			`data = ox.cache.read_url(url, unicode=True)`
			`authors = {}`
			`for l,a in re.compile('<a href="/autor/(\d+)">(.*?)</a>').findall(data):`
			`authors[l] = a`
			`return authors`

			`def get_books(author_id=None):`
			`books = {}`
			`if not author_id:`
			`url = "http://gutenberg.spiegel.de/buch"`
			`data = ox.cache.read_url(url, unicode=True)`
			`for l, t in re.compile('<a href="(/buch.+?)">(.*?)</a>').findall(data):`
			`l = l.split('/')[-2]`
			`books[l] = t`
			`else:`
			`url = "http://gutenberg.spiegel.de/autor/%s" % author_id`
			`data = ox.cache.read_url(url, unicode=True)`
			`for l,t in re.compile('<a href="(.+.xml)">(.*?)</a>').findall(data):`
			`while l.startswith('../'):`
			`l = l[3:]`
			`l = 'http://gutenberg.spiegel.de/' + l`
			`books[l] = t`
			`return books`

			`def get_ids():`
			`return get_books().keys()`