python-oxweb/ox/spiegel.py

import re
from time import gmtime, strftime

from BeautifulSoup import BeautifulSoup

from oxutils.cache import getUrl
from oxutils.html import stripTags
from oxutils.text import findRegexp

class Spiegel:
  """One issue of the Spiegel e-paper, addressed by year and week number.

  After a successful construction the instance exposes ``year``, ``week``,
  ``coverUrl`` and ``contentsUrl``.  ``contents`` only exists once
  ``getContents()`` has been called.

  NOTE: if ``year``/``week`` fail validation, ``__init__`` returns early and
  none of these attributes are set; using such an instance raises
  AttributeError.  This long-standing behaviour is kept for compatibility.
  """

  # Selects the table-of-contents links on the e-paper index page.
  _contentsLinkRe = re.compile(r'http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')
  # Captures the value of the SE parameter up to the closing quote of the
  # href; the markup is matched in serialized form, hence the '&amp;'.
  _pageRe = re.compile(r'&amp;SE=(.*?)"')

  def __init__(self, year, week):
    """Validate ``year``/``week`` and build the cover and contents URLs.

    year -- issue year; accepted from 1994 up to the current (UTC) year
    week -- issue week; 1..52, or 1..53 for 1998 and 2004; for the current
            year anything up to one past the current '%W' week number
    """
    # fixme: simply check if cover exists
    # Sample the clock once so year and week cannot disagree when the call
    # straddles a year boundary.
    now = gmtime()
    thisYear = int(strftime('%Y', now))
    thisWeek = int(strftime('%W', now))
    if year == thisYear:
      # range() excludes its end, so this admits weeks 1..thisWeek + 1.
      # NOTE(review): presumably the extra week compensates for '%W' being
      # zero-based / issues appearing ahead of their week -- confirm.
      weeks = range(1, thisWeek + 2)
    elif year in [1998, 2004]:
      weeks = range(1, 54)
    else:
      weeks = range(1, 53)
    if year not in range(1994, thisYear + 1) or week not in weeks:
      # Leaves the instance without attributes (see class docstring).
      return
    # end fixme
    self.year = year
    self.week = week
    self.coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d0%02d0001-312.jpg' % (self.year, self.week, self.year, self.week)
    self.contentsUrl = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (self.year, self.week)

  def getContents(self):
    """Fetch the contents page and collect its entries.

    Stores the result in ``self.contents`` and returns it: a list of dicts
    with the keys 'title' (link text with tags stripped) and 'page' (the
    string captured from the link's SE parameter).
    """
    self.contents = []
    soup = BeautifulSoup(getUrl(self.contentsUrl))
    for link in soup('a', {'href': self._contentsLinkRe}):
      markup = str(link)
      pages = self._pageRe.findall(markup)
      if not pages:
        # A matching link without an SE parameter carries no page number;
        # skip it instead of aborting the whole parse with an IndexError.
        continue
      self.contents.append({'title': stripTags(markup).strip(), 'page': pages[0]})
    return self.contents

if __name__ == '__main__':
  # Manual smoke test: fetch and print the contents of issue 8/2008.
  # getContents() returns the same list it stores in .contents, and a
  # single-argument print(...) outputs exactly what the print statement did.
  issue = Spiegel(2008, 8)
  print(issue.getContents())
adding spiegel.py 2008-04-30 12:24:33 +00:00
			`import re`
			`from time import gmtime, strftime`

			`from BeautifulSoup import BeautifulSoup`

			`from oxutils.cache import getUrl`
			`from oxutils.html import stripTags`
			`from oxutils.text import findRegexp`

			`class Spiegel:`
			`def __init__(self, year, week):`
			`# fixme: simply check if cover exists`
			`thisYear = int(strftime('%Y', gmtime()))`
			`thisWeek = int(strftime('%W', gmtime()))`
			`years = range(1994, thisYear + 1)`
			`if year == thisYear:`
			`weeks = range(1, thisWeek + 2)`
			`elif year in [1998, 2004]:`
			`weeks = range(1, 54)`
			`else:`
			`weeks = range(1, 53)`
			`if year not in years or week not in weeks:`
			`return None`
			`# end fixme`
			`self.year = year`
			`self.week = week`
			`self.coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d0%02d0001-312.jpg' % (self.year, self.week, self.year, self.week)`
			`self.contentsUrl = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (self.year, self.week)`

			`def getContents(self):`
			`self.contents = []`
			`soup = BeautifulSoup(getUrl(self.contentsUrl))`
			`for item in soup('a', {'href': re.compile('http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')}):`
			`item = str(item)`
			`title = stripTags(item).strip()`
			`print item, '\n'`
			`page = re.compile('&amp;SE=(.*?)"').findall(item)[0]`
			`self.contents.append({'title': title, 'page': page})`
			`return self.contents`

			`if __name__ == '__main__':`
			`spiegel = Spiegel(2008, 8)`
			`spiegel.getContents()`
			`print spiegel.contents`