Open Media Library Platform
This commit is contained in:
commit
411ad5b16f
5849 changed files with 1778641 additions and 0 deletions
92
Shared/lib/python2.7/site-packages/ox/web/gutenbergde.py
Normal file
92
Shared/lib/python2.7/site-packages/ox/web/gutenbergde.py
Normal file
|
|
@ -0,0 +1,92 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
|
||||
import lxml.html
|
||||
|
||||
import ox
|
||||
from ox.cache import read_url, cache_timeout
|
||||
|
||||
|
||||
def get_id(url):
    """Return the book id embedded in a Gutenberg-DE book URL.

    Book URLs have the form ``http://gutenberg.spiegel.de/buch/<id>/<page>``
    (the shape ``get_url`` builds); the id is the path segment that
    directly follows ``/buch/``.

    Returns ``None`` when ``url`` is not a book URL or carries no id.
    """
    prefix = 'http://gutenberg.spiegel.de/buch/'
    if not url.startswith(prefix):
        return None
    # Take the segment right after the prefix instead of counting from the
    # end, so URLs without a page number or with a trailing slash still
    # yield the id rather than 'buch' or the page number.
    book_id = url[len(prefix):].split('/')[0]
    return book_id or None
|
||||
|
||||
def get_url(id):
    """Build the URL of the first page of the book identified by ``id``."""
    first_page = 'http://gutenberg.spiegel.de/buch/%s/1'
    return first_page % id
|
||||
|
||||
def get_page(url):
    """Fetch ``url`` and return the markup of its ``gutenb`` element.

    The page is read through ``ox.cache.read_url`` and parsed with lxml; the
    element whose id is ``gutenb`` is serialized back to HTML and the
    ``id="gutenb"`` attribute text is removed from the result.
    """
    document = lxml.html.document_fromstring(ox.cache.read_url(url))
    content = document.body.get_element_by_id('gutenb')
    serialized = lxml.html.tostring(content)
    return serialized.replace(' id="gutenb"', '')
|
||||
|
||||
def get_images(page, html, b=False):
    """Collect the image URLs referenced by a page fragment.

    page: markup that is searched for ``<img ... src="...">`` tags.
    html: full document markup; its ``<base href="...">`` value (looked up
        with ``ox.find_re``) is prepended to relative image URLs. It is
        only consulted when ``page`` contains an ``<img`` tag.
    b: when true, return ``(base, images)`` instead of just ``images``.

    Returns the list of image URLs in document order, or a ``(base, list)``
    tuple when ``b`` is true. ``base`` is ``''`` if ``page`` has no images.
    """
    base = ''
    images = []
    if '<img' in page:
        base = ox.find_re(html, '<base href="(.*?)"')
        for src in re.findall('<img.*?src="(.*?)"', page):
            # Absolute and protocol-relative sources are already complete;
            # gluing the base in front of them would produce a broken URL.
            if not src.startswith(('http://', 'https://', '//')):
                src = base + src
            images.append(src)
    if b:
        return base, images
    return images
|
||||
|
||||
def get_book(id):
    """Download a complete book and return its pages, images and metadata.

    id: either a book id (expanded with ``get_url``) or a full URL starting
        with ``http``, which is used as is.

    Returns a dict with:
      'url'    - URL of the first page,
      'base'   - base value returned by ``get_images`` for the first page,
      'images' - image URLs collected from every page,
      'pages'  - content markup of every page (``get_page``), in the order
                 the pages were visited,
    plus one entry per two-cell row of the first ``<table>`` on the first
    page; key and value are the two cells passed through ``ox.strip_tags``.
    """
    next_link = re.compile('<a style="float: right;" href="(/buch/.*?)">')

    if isinstance(id, basestring) and id.startswith('http'):
        url = id
    else:
        url = get_url(id)
    html = ox.cache.read_url(url, unicode=True)
    page = get_page(url)
    data = {'url': url}
    pages = [page]
    data['base'], data['images'] = get_images(page, html, True)

    info = ox.find_re(html, '<table>.*?</table>')
    for row in re.compile('<tr.*?>(.*?)</tr>').findall(info):
        cells = row.split('</td><td>')
        # Only two-cell key/value rows carry metadata; skip header or
        # multi-column rows instead of failing on the unpack.
        if len(cells) != 2:
            continue
        key, value = cells
        data[ox.strip_tags(key)] = ox.strip_tags(value)

    # Follow the "next page" links. The same link can appear more than once
    # on a page, and a page could link back to an earlier one, so every URL
    # is fetched at most once to avoid duplicate pages and endless loops.
    seen = set([url])
    pending = next_link.findall(html)
    while pending:
        link = 'http://gutenberg.spiegel.de' + pending.pop(0)
        if link in seen:
            continue
        seen.add(link)
        html = ox.cache.read_url(link)
        pending.extend(next_link.findall(html))
        page = get_page(link)
        pages.append(page)
        data['images'] += get_images(page, html)
    data['pages'] = pages
    return data
|
||||
|
||||
def get_authors():
    """Fetch the author index and return a dict mapping author id to name.

    Keys are the digit strings taken from ``/autor/<id>`` links on the
    index page; values are the corresponding link texts.
    """
    url = "http://gutenberg.spiegel.de/autor"
    data = ox.cache.read_url(url, unicode=True)
    # Raw string: '\d' in a plain literal is an invalid escape sequence that
    # newer Python versions warn about.
    author_link = re.compile(r'<a href="/autor/(\d+)">(.*?)</a>')
    # findall yields (id, name) pairs; a later duplicate id overwrites an
    # earlier one, exactly as a plain assignment loop would.
    return dict(author_link.findall(data))
|
||||
|
||||
def get_books(author_id=None):
    """Return the books listed on the site, optionally for a single author.

    author_id: when empty/None, the global book index is read and the result
        maps book id -> link text. Otherwise the author's page is read and
        the result maps the absolute URL of each linked ``.xml`` document
        -> link text.
    """
    books = {}
    if not author_id:
        url = "http://gutenberg.spiegel.de/buch"
        data = ox.cache.read_url(url, unicode=True)
        book_link = re.compile('<a href="(/buch.+?)">(.*?)</a>')
        for link, title in book_link.findall(data):
            # The key is the second-to-last path segment of the href
            # (presumably /buch/<id>/<page>, the shape get_url builds).
            books[link.split('/')[-2]] = title
    else:
        url = "http://gutenberg.spiegel.de/autor/%s" % author_id
        data = ox.cache.read_url(url, unicode=True)
        # [^"]+ keeps the match inside a single href (a greedy '.+' runs
        # across several links on one line) and '\.' requires a literal dot
        # before the 'xml' extension.
        xml_link = re.compile(r'<a href="([^"]+\.xml)">(.*?)</a>')
        for link, title in xml_link.findall(data):
            # Drop leading '../' segments so the path can be made absolute.
            while link.startswith('../'):
                link = link[3:]
            books['http://gutenberg.spiegel.de/' + link] = title
    return books
|
||||
|
||||
def get_ids():
    """Return the keys of the mapping ``get_books()`` yields with no author."""
    all_books = get_books()
    return all_books.keys()
|
||||
Loading…
Add table
Add a link
Reference in a new issue