diff --git a/ox/spiegel.py b/ox/spiegel.py index 65831b5..aae86e1 100644 --- a/ox/spiegel.py +++ b/ox/spiegel.py @@ -1,42 +1,154 @@ +from datetime import datetime import re -from time import gmtime, strftime from BeautifulSoup import BeautifulSoup from oxutils.cache import getUrl from oxutils.html import stripTags +import oxutils.net + +def output(news): + for new in news: + print '\n'.join([new['section'] + ', ' + new['dateString'], new['title0'] + ': ' + new['title1'], new['description'], '']) + +def getNews(year, month, day): + sections = [ + 'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt', + 'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto' + ] + dt = datetime(year, month, day) + day = int(dt.strftime('%j')) + date = dt.strftime('%d.%m.%Y') + news = [] + for section in sections: + url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day) + print url + html = getUrl(url) + for item in re.compile('
(.*?)
', re.DOTALL).findall(item)[0]).strip() + try: + description = re.compile('

(.*?)<', re.DOTALL).findall(item)[0].strip() + except: + description = '' + try: + imageUrl = re.compile('', re.DOTALL).findall(item)[0]) + new['title0'] = re.compile('(.*?): ').findall(new['title'])[0] + new['title1'] = re.compile(': (.*?)$').findall(new['title'])[0] + new['url'] = 'http://www.spiegel.de' + re.compile('