diff --git a/ox/spiegel.py b/ox/spiegel.py index aae86e1..fe0bc72 100644 --- a/ox/spiegel.py +++ b/ox/spiegel.py @@ -1,16 +1,13 @@ from datetime import datetime import re +import time from BeautifulSoup import BeautifulSoup -from oxutils.cache import getUrl -from oxutils.html import stripTags +import oxutils.cache +from oxutils.html import decodeHtml, stripTags import oxutils.net -def output(news): - for new in news: - print '\n'.join([new['section'] + ', ' + new['dateString'], new['title0'] + ': ' + new['title1'], new['description'], '']) - def getNews(year, month, day): sections = [ 'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt', @@ -22,8 +19,10 @@ def getNews(year, month, day): news = [] for section in sections: url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day) - print url - html = getUrl(url) + if date == time.strftime('%d.%m.%Y', time.localtime()): + html = oxutils.net.getUrl(url) + else: + html = oxutils.cache.getUrl(url) for item in re.compile('
(.*?)
', re.DOTALL).findall(item)[0]).strip() try: @@ -34,31 +33,40 @@ def getNews(year, month, day): imageUrl = re.compile('', re.DOTALL).findall(item)[0]) + except: + title = '' + if dateString[:10] == date and description and imageUrl and title.find(': ') != -1: new = {} if len(dateString) == 10: new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2]) else: new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17]) - new['dateString'] = dateString + # fix decodeHtml + # new['description'] = formatString(decodeHtml(description)) new['description'] = formatString(description) new['imageUrl'] = imageUrl new['section'] = formatSection(section) - new['title'] = formatString(re.compile('title=[\'|"](.*?)[\'|"] />', re.DOTALL).findall(item)[0]) - new['title0'] = re.compile('(.*?): ').findall(new['title'])[0] - new['title1'] = re.compile(': (.*?)$').findall(new['title'])[0] + new['title'] = formatString(title) new['url'] = 'http://www.spiegel.de' + re.compile('