diff --git a/ox/spiegel.py b/ox/spiegel.py index 65831b5..49481ef 100644 --- a/ox/spiegel.py +++ b/ox/spiegel.py @@ -1,42 +1,239 @@ +from datetime import datetime import re -from time import gmtime, strftime +import time from BeautifulSoup import BeautifulSoup -from oxutils.cache import getUrl -from oxutils.html import stripTags +import oxutils.cache +from oxutils.html import decodeHtml, stripTags +import oxutils.net -class Spiegel: - def __init__(self, year, week): - # fixme: simply check if cover exists - thisYear = int(strftime('%Y', gmtime())) - thisWeek = int(strftime('%W', gmtime())) - years = range(1994, thisYear + 1) - if year == thisYear: - weeks = range(1, thisWeek + 2) - elif year in [1998, 2004]: - weeks = range(1, 54) - else: - weeks = range(1, 53) - if year not in years or week not in weeks: - return None - # end fixme - self.year = year - self.week = week - self.coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d0%02d0001-312.jpg' % (self.year, self.week, self.year, self.week) - self.contentsUrl = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (self.year, self.week) +def getNews(year, month, day): + sections = [ + 'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt', + 'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto' + ] + dt = datetime(year, month, day) + day = int(dt.strftime('%j')) + date = dt.strftime('%d.%m.%Y') + news = [] + for section in sections: + url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day) + if date == time.strftime('%d.%m.%Y', time.localtime()): + html = oxutils.net.getUrl(url) + else: + html = oxutils.cache.getUrl(url) + for item in re.compile('
(.*?)
', re.DOTALL).findall(item)[0]).strip() + try: + description = re.compile('

(.*?)<', re.DOTALL).findall(item)[0].strip() + except: + description = '' + try: + imageUrl = re.compile('', re.DOTALL).findall(item)[0]) + except: + title = '' + if dateString[:10] == date and description and imageUrl and title.find(': ') != -1: + new = {} + if len(dateString) == 10: + new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2]) + else: + new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17]) + # fix decodeHtml + # new['description'] = formatString(decodeHtml(description)) + new['description'] = formatString(description) + new['imageUrl'] = imageUrl + new['section'] = formatSection(section) + new['title'] = formatString(title) + new['url'] = 'http://www.spiegel.de' + re.compile('