Open Media Library Platform

2013-10-11 19:28:32 +02:00 · 2013-10-11 19:28:32 +02:00 · 411ad5b16f
commit 411ad5b16f
5849 changed files with 1778641 additions and 0 deletions
--- a/Shared/lib/python2.7/site-packages/ox/web/gutenbergde.py
+++ b/Shared/lib/python2.7/site-packages/ox/web/gutenbergde.py
@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=4:sts=4:ts=4
+import re
+
+import lxml.html
+
+import ox
+from ox.cache import read_url, cache_timeout
+
+
+def get_id(url):
+    if url.startswith('http://gutenberg.spiegel.de/buch/'):
+        return url.split('/')[-2]
+
+def get_url(id):
+    return 'http://gutenberg.spiegel.de/buch/%s/1' % id
+
+def get_page(url):
+    data = ox.cache.read_url(url)
+    doc = lxml.html.document_fromstring(data)
+    gutenb = doc.body.get_element_by_id('gutenb')
+    html = lxml.html.tostring(gutenb)
+    return html.replace(' id="gutenb"', '')
+
+def get_images(page, html, b=False):
+    img = []
+    base = ''
+    if '<img' in page:
+        base = ox.find_re(html, '<base href="(.*?)"')
+        for url in re.compile('<img.*?src="(.*?)"').findall(page):
+            url = base + url
+            img.append(url)
+    if b:
+        return base, img
+    return img
+
+def get_book(id):
+    if isinstance(id, basestring) and id.startswith('http'):
+        url = id
+    else:
+        url = get_url(id)
+    html = ox.cache.read_url(url, unicode=True)
+    data = {}
+    data['url'] = url
+    pages = []
+    page = get_page(url)
+    pages.append(page)
+    data['base'], data['images'] = get_images(page, html, True)
+    info = ox.find_re(html, '<table>.*?</table>')
+    for i in re.compile('<tr.*?>(.*?)</tr>').findall(info):
+        key, value = i.split('</td><td>')
+        data[ox.strip_tags(key)] = ox.strip_tags(value)
+    links = re.compile('<a style="float: right;" href="(/buch/.*?)">').findall(html)
+    while links:
+        for l in links:
+            l = 'http://gutenberg.spiegel.de' + l
+            html = ox.cache.read_url(l)
+            links = re.compile('<a style="float: right;" href="(/buch/.*?)">').findall(html)
+            page = get_page(l)
+            pages.append(page)
+            data['images'] += get_images(page, html)
+    data['pages'] = pages
+    return data
+
+def get_authors():
+    url = "http://gutenberg.spiegel.de/autor"
+    data = ox.cache.read_url(url, unicode=True)
+    authors = {}
+    for l,a in re.compile('<a href="/autor/(\d+)">(.*?)</a>').findall(data):
+        authors[l] = a
+    return authors
+
+def get_books(author_id=None):
+    books = {}
+    if not author_id:
+        url = "http://gutenberg.spiegel.de/buch"
+        data = ox.cache.read_url(url, unicode=True)
+        for l, t in re.compile('<a href="(/buch.+?)">(.*?)</a>').findall(data):
+            l = l.split('/')[-2]
+            books[l] = t
+    else:
+        url = "http://gutenberg.spiegel.de/autor/%s" % author_id
+        data = ox.cache.read_url(url, unicode=True)
+        for l,t in re.compile('<a href="(.+.xml)">(.*?)</a>').findall(data):
+            while l.startswith('../'):
+                l = l[3:]
+            l = 'http://gutenberg.spiegel.de/' + l
+            books[l] = t
+    return books
+
+def get_ids():
+    return get_books().keys()