import re
from time import gmtime, strftime

from BeautifulSoup import BeautifulSoup

from oxutils.cache import getUrl
from oxutils.html import stripTags
from oxutils.text import findRegexp


class Spiegel:
    """One issue of Der Spiegel, identified by year and week number.

    After successful construction the instance exposes ``year``, ``week``,
    ``coverUrl`` and ``contentsUrl``.  ``getContents()`` additionally sets
    ``contents``.
    """

    # Anchors whose href matches this pattern are treated as table-of-contents
    # entries.  Compiled once here instead of on every getContents() call.
    _CONTENT_LINK_RE = re.compile(
        r'http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')
    # Captures whatever follows "&SE=" up to the next double quote in the
    # anchor's serialized markup.
    _PAGE_RE = re.compile(r'&SE=(.*?)"')

    def __init__(self, year, week):
        """Build the cover and contents URLs for the given issue.

        year -- issue year; accepted from 1994 up to the current year
        week -- issue week; 1..52 (1..53 for 1998 and 2004, and
                1..current week + 1 for the current year)

        NOTE: when year/week is outside the accepted range the constructor
        returns early and the instance is left WITHOUT the ``year``,
        ``week``, ``coverUrl`` and ``contentsUrl`` attributes; calling
        getContents() on such an instance raises AttributeError.  This
        behaviour is kept for backward compatibility.
        """
        # fixme: simply check if cover exists
        now = gmtime()
        thisYear = int(strftime('%Y', now))
        thisWeek = int(strftime('%W', now))
        years = range(1994, thisYear + 1)
        if year == thisYear:
            weeks = range(1, thisWeek + 2)
        elif year in [1998, 2004]:
            # these years are given a 53rd week
            weeks = range(1, 54)
        else:
            weeks = range(1, 53)
        if year not in years or week not in weeks:
            return
        # end fixme
        self.year = year
        self.week = week
        self.coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d0%02d0001-312.jpg' % (self.year, self.week, self.year, self.week)
        self.contentsUrl = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (self.year, self.week)

    def getContents(self):
        """Fetch and parse the table of contents of this issue.

        Downloads ``contentsUrl`` via ``getUrl``, collects every ``<a>`` tag
        whose href matches the epaper link pattern and records, for each one,
        its tag-stripped text as ``title`` and the ``&SE=`` fragment of its
        markup as ``page`` (a string).

        Anchors that carry no ``&SE=...\"`` fragment are skipped instead of
        aborting the whole scrape with an IndexError.

        Returns the list of ``{'title': ..., 'page': ...}`` dicts, which is
        also stored on ``self.contents``.
        """
        self.contents = []
        soup = BeautifulSoup(getUrl(self.contentsUrl))
        for link in soup('a', {'href': self._CONTENT_LINK_RE}):
            markup = str(link)
            match = self._PAGE_RE.search(markup)
            if match is None:
                # matching link without a page number: not a contents entry
                continue
            self.contents.append({
                'title': stripTags(markup).strip(),
                'page': match.group(1),
            })
        return self.contents


if __name__ == '__main__':
    spiegel = Spiegel(2008, 8)
    spiegel.getContents()
    # Parenthesized single-argument form prints identically on Python 2 and 3.
    print(spiegel.contents)