from datetime import datetime import re import time from BeautifulSoup import BeautifulSoup import oxutils.cache from oxutils.html import decodeHtml, stripTags import oxutils.net def getNews(year, month, day): sections = [ 'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt', 'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto' ] dt = datetime(year, month, day) day = int(dt.strftime('%j')) date = dt.strftime('%d.%m.%Y') news = [] for section in sections: url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day) if date == time.strftime('%d.%m.%Y', time.localtime()): html = oxutils.net.getUrl(url) else: html = oxutils.cache.getUrl(url) for item in re.compile('
(.*?)
', re.DOTALL).findall(item)[0]).strip() try: description = re.compile('

(.*?)<', re.DOTALL).findall(item)[0].strip() except: description = '' try: imageUrl = re.compile('', re.DOTALL).findall(item)[0]) except: title = '' if dateString[:10] == date and description and imageUrl and title.find(': ') != -1: new = {} if len(dateString) == 10: new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2]) else: new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17]) # fix decodeHtml # new['description'] = formatString(decodeHtml(description)) new['description'] = formatString(description) new['imageUrl'] = imageUrl new['section'] = formatSection(section) new['title'] = formatString(title) new['url'] = 'http://www.spiegel.de' + re.compile('