from datetime import datetime import re from BeautifulSoup import BeautifulSoup from oxutils.cache import getUrl from oxutils.html import stripTags import oxutils.net def output(news): for new in news: print '\n'.join([new['section'] + ', ' + new['dateString'], new['title0'] + ': ' + new['title1'], new['description'], '']) def getNews(year, month, day): sections = [ 'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt', 'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto' ] dt = datetime(year, month, day) day = int(dt.strftime('%j')) date = dt.strftime('%d.%m.%Y') news = [] for section in sections: url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day) print url html = getUrl(url) for item in re.compile('
(.*?)
', re.DOTALL).findall(item)[0]).strip() try: description = re.compile('

(.*?)<', re.DOTALL).findall(item)[0].strip() except: description = '' try: imageUrl = re.compile('', re.DOTALL).findall(item)[0]) new['title0'] = re.compile('(.*?): ').findall(new['title'])[0] new['title1'] = re.compile(': (.*?)$').findall(new['title'])[0] new['url'] = 'http://www.spiegel.de' + re.compile('