# Provenance (taken from the git patch this module was extracted from):
#   commit  025297a2316f613cc4f1086a01ed00f58783c81e
#   From:   Rolux
#   Date:   Wed, 30 Apr 2008 14:24:33 +0200
#   Subject: [PATCH] adding spiegel.py
#   Path:   ox/spiegel.py (new file)
"""Build the cover and table-of-contents URLs for one Spiegel e-paper issue
and scrape the table of contents into a list of title/page entries."""

import re
from time import gmtime, strftime

from BeautifulSoup import BeautifulSoup

from oxutils.cache import getUrl
from oxutils.html import stripTags
from oxutils.text import findRegexp

# Anchors whose href matches this pattern are treated as contents entries.
# Raw string: '\?' is a regex escape, not a Python string escape.
_CONTENTS_LINK_RE = re.compile(r'http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')
# Captures the value of the SE parameter up to the closing quote of the href.
_PAGE_RE = re.compile(r'&SE=(.*?)"')


class Spiegel:
    """One issue, identified by ``year`` and ``week``.

    Attributes:
        year: the year passed to the constructor.
        week: the week (issue number) passed to the constructor.
        coverUrl: URL of the cover image, or None if the issue is rejected
            by the range check in ``__init__``.
        contentsUrl: URL of the contents page, or None if the issue is
            rejected by the range check in ``__init__``.
        contents: list of ``{'title': ..., 'page': ...}`` dicts; only set
            once ``getContents()`` has been called.
    """

    # Earliest year accepted by the range check.
    firstYear = 1994
    # Years for which week 53 is accepted.
    longYears = (1998, 2004)

    def __init__(self, year, week):
        """Store the issue and, if it passes the range check, its URLs.

        An out-of-range issue does not raise: ``coverUrl`` and
        ``contentsUrl`` are left as None so callers can test for it.
        """
        self.year = year
        self.week = week
        self.coverUrl = None
        self.contentsUrl = None
        # fixme: simply check if cover exists
        if not self._inRange(year, week):
            return
        # end fixme
        self.coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d0%02d0001-312.jpg' % (self.year, self.week, self.year, self.week)
        self.contentsUrl = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (self.year, self.week)

    def _inRange(self, year, week):
        """Return True if (year, week) lies in the accepted range."""
        now = gmtime()
        thisYear = int(strftime('%Y', now))
        # %W: week of the year, counting from the first Monday.
        thisWeek = int(strftime('%W', now))
        if year not in range(self.firstYear, thisYear + 1):
            return False
        if year == thisYear:
            # Accepts up to thisWeek + 1 -- presumably to allow an issue
            # that is numbered ahead of the calendar week; TODO confirm.
            weeks = range(1, thisWeek + 2)
        elif year in self.longYears:
            weeks = range(1, 54)
        else:
            weeks = range(1, 53)
        return week in weeks

    def getContents(self):
        """Fetch the contents page and return its entries.

        Returns:
            A list of dicts with keys ``'title'`` (the anchor text with tags
            stripped) and ``'page'`` (the SE parameter of the link, as a
            string). The same list is stored on ``self.contents``. Empty
            when the issue has no contents URL.
        """
        self.contents = []
        if self.contentsUrl is None:
            return self.contents
        soup = BeautifulSoup(getUrl(self.contentsUrl))
        for item in soup('a', {'href': _CONTENTS_LINK_RE}):
            item = str(item)
            page = _PAGE_RE.search(item)
            if page is None:
                # Link without an SE parameter: nothing to point at, and
                # one such link must not abort the whole parse.
                continue
            title = stripTags(item).strip()
            self.contents.append({'title': title, 'page': page.group(1)})
        return self.contents


if __name__ == '__main__':
    spiegel = Spiegel(2008, 8)
    spiegel.getContents()
    # Single-argument call form prints identically on Python 2 and 3.
    print(spiegel.contents)