python-oxweb/ox/spiegel.py
2008-04-30 14:24:33 +02:00

44 lines
No EOL
1.5 KiB
Python

import re
from time import gmtime, strftime
from BeautifulSoup import BeautifulSoup
from oxutils.cache import getUrl
from oxutils.html import stripTags
from oxutils.text import findRegexp
class Spiegel:
    """One weekly issue of Der Spiegel, addressed by year and week number.

    Attributes:
        year: The year passed to the constructor.
        week: The week number passed to the constructor.
        coverUrl: URL of the cover image, or None when the year/week pair
            fails the range check performed in __init__.
        contentsUrl: URL of the e-paper table of contents, or None under
            the same condition.
        contents: List of {'title': ..., 'page': ...} dicts filled in by
            getContents(); empty until that method has been called.
    """

    # Matches the href of anchors that point at the e-paper servlet.
    _EPAPER_HREF = re.compile(r'http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')
    # Captures the value of the SE query parameter, up to the closing quote
    # of the rendered attribute.
    _PAGE_PARAM = re.compile(r'&SE=(.*?)"')

    def __init__(self, year, week):
        """Store the issue coordinates and build its URLs.

        Args:
            year: Issue year; accepted from 1994 up to the current year.
            week: Issue week; 1-52 for most years, 1-53 for 1998 and 2004,
                and 1 up to (current week + 1) for the current year.

        An out-of-range year/week does not raise: the instance is still
        created, but coverUrl and contentsUrl stay None and getContents()
        returns an empty list.
        """
        # Define every attribute up front so a rejected issue is still a
        # usable object instead of one that fails with AttributeError.
        self.year = year
        self.week = week
        self.coverUrl = None
        self.contentsUrl = None
        self.contents = []

        # fixme: simply check if cover exists
        # Read the clock once so year and week come from the same instant.
        now = gmtime()
        thisYear = int(strftime('%Y', now))
        thisWeek = int(strftime('%W', now))
        if year == thisYear:
            weeks = range(1, thisWeek + 2)
        elif year in (1998, 2004):
            weeks = range(1, 54)
        else:
            weeks = range(1, 53)
        if year not in range(1994, thisYear + 1) or week not in weeks:
            return
        # end fixme

        self.coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d0%02d0001-312.jpg' % (self.year, self.week, self.year, self.week)
        self.contentsUrl = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (self.year, self.week)

    def getContents(self):
        """Fetch the contents page and collect its e-paper links.

        Returns:
            self.contents, a list of dicts with the keys 'title' (the link
            markup with tags stripped and whitespace trimmed) and 'page'
            (the captured SE parameter, as a string). The list is empty
            when the issue was rejected by the constructor.
        """
        self.contents = []
        if self.contentsUrl is None:
            return self.contents
        soup = BeautifulSoup(getUrl(self.contentsUrl))
        for link in soup('a', {'href': self._EPAPER_HREF}):
            markup = str(link)
            pages = self._PAGE_PARAM.findall(markup)
            if not pages:
                # A servlet link without an SE parameter has no page to
                # report; skip it rather than fail on an empty match list.
                continue
            self.contents.append({'title': stripTags(markup).strip(), 'page': pages[0]})
        return self.contents
if __name__ == '__main__':
    # Manual check: fetch the contents of issue 8/2008 and print the
    # resulting list of entries.
    issue = Spiegel(2008, 8)
    issue.getContents()
    print(issue.contents)