From a360bba9b5aa4d6f6787bdaf0813be9893fd7e7f Mon Sep 17 00:00:00 2001 From: Rolux Date: Wed, 30 Apr 2008 20:51:27 +0200 Subject: [PATCH 1/3] spiegel.py news --- ox/spiegel.py | 174 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 143 insertions(+), 31 deletions(-) diff --git a/ox/spiegel.py b/ox/spiegel.py index 65831b5..aae86e1 100644 --- a/ox/spiegel.py +++ b/ox/spiegel.py @@ -1,42 +1,154 @@ +from datetime import datetime import re -from time import gmtime, strftime from BeautifulSoup import BeautifulSoup from oxutils.cache import getUrl from oxutils.html import stripTags +import oxutils.net + +def output(news): + for new in news: + print '\n'.join([new['section'] + ', ' + new['dateString'], new['title0'] + ': ' + new['title1'], new['description'], '']) + +def getNews(year, month, day): + sections = [ + 'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt', + 'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto' + ] + dt = datetime(year, month, day) + day = int(dt.strftime('%j')) + date = dt.strftime('%d.%m.%Y') + news = [] + for section in sections: + url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day) + print url + html = getUrl(url) + for item in re.compile('
(.*?)
', re.DOTALL).findall(item)[0]).strip() + try: + description = re.compile('

(.*?)<', re.DOTALL).findall(item)[0].strip() + except: + description = '' + try: + imageUrl = re.compile('', re.DOTALL).findall(item)[0]) + new['title0'] = re.compile('(.*?): ').findall(new['title'])[0] + new['title1'] = re.compile(': (.*?)$').findall(new['title'])[0] + new['url'] = 'http://www.spiegel.de' + re.compile(' Date: Thu, 1 May 2008 01:27:50 +0200 Subject: [PATCH 2/3] spiegel.py: cleanup --- ox/spiegel.py | 170 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 119 insertions(+), 51 deletions(-) diff --git a/ox/spiegel.py b/ox/spiegel.py index aae86e1..fe0bc72 100644 --- a/ox/spiegel.py +++ b/ox/spiegel.py @@ -1,16 +1,13 @@ from datetime import datetime import re +import time from BeautifulSoup import BeautifulSoup -from oxutils.cache import getUrl -from oxutils.html import stripTags +import oxutils.cache +from oxutils.html import decodeHtml, stripTags import oxutils.net -def output(news): - for new in news: - print '\n'.join([new['section'] + ', ' + new['dateString'], new['title0'] + ': ' + new['title1'], new['description'], '']) - def getNews(year, month, day): sections = [ 'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt', @@ -22,8 +19,10 @@ def getNews(year, month, day): news = [] for section in sections: url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day) - print url - html = getUrl(url) + if date == time.strftime('%d.%m.%Y', time.localtime()): + html = oxutils.net.getUrl(url) + else: + html = oxutils.cache.getUrl(url) for item in re.compile('

(.*?)
', re.DOTALL).findall(item)[0]).strip() try: @@ -34,31 +33,40 @@ def getNews(year, month, day): imageUrl = re.compile('', re.DOTALL).findall(item)[0]) + except: + title = '' + if dateString[:10] == date and description and imageUrl and title.find(': ') != -1: new = {} if len(dateString) == 10: new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2]) else: new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17]) - new['dateString'] = dateString + # fix decodeHtml + # new['description'] = formatString(decodeHtml(description)) new['description'] = formatString(description) new['imageUrl'] = imageUrl new['section'] = formatSection(section) - new['title'] = formatString(re.compile('title=[\'|"](.*?)[\'|"] />', re.DOTALL).findall(item)[0]) - new['title0'] = re.compile('(.*?): ').findall(new['title'])[0] - new['title1'] = re.compile(': (.*?)$').findall(new['title'])[0] + new['title'] = formatString(title) new['url'] = 'http://www.spiegel.de' + re.compile(' Date: Thu, 1 May 2008 11:14:10 +0200 Subject: [PATCH 3/3] spiegel.py: get actual pages --- ox/spiegel.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/ox/spiegel.py b/ox/spiegel.py index fe0bc72..49481ef 100644 --- a/ox/spiegel.py +++ b/ox/spiegel.py @@ -88,7 +88,7 @@ def formatSubsection(string): return string[:1].upper() + string[1:] def getMagazine(year, week): - coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d0%02d0001-312.jpg' % (year, week, year, week) + coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week) if not oxutils.net.exists(coverUrl): return None url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week) @@ -99,7 +99,15 @@ def getMagazine(year, week): page = int(re.compile('&SE=(.*?)"').findall(item)[0]) title = stripTags(item).strip() contents.append({'title': title, 'page': page}) - return {'contents': contents, 'coverUrl': coverUrl} + pageUrl = {} + pages = page + 2 + for page in range(1, pages + 10): + url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page) + if oxutils.net.exists(url): + pageUrl[page] = url + else: + pageUrl[page] = '' + return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl} def archiveMagazines(): @@ -125,13 +133,13 @@ def archiveMagazines(): dirname = '%s/%d/%02d' % (archivePath, y, w) if not os.path.exists(dirname): os.makedirs(dirname) - filename = '%s/Der Spiegel %d-%02d.json' % (dirname, y, w) + filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w) if not os.path.exists(filename) or True: data = simplejson.dumps(magazine, ensure_ascii = False) f = open(filename, 'w') f.write(data) f.close() - filename = '%s/Der Spiegel %d-%02d.txt' % (dirname, y, w) + filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w) if not os.path.exists(filename) or True: data = [] for item in magazine['contents']: @@ -140,12 +148,21 @@ def archiveMagazines(): f = open(filename, 'w') f.write(data) f.close() - filename = '%s/Der Spiegel %d-%02d.jpg' % (dirname, y, w) + filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w) if not os.path.exists(filename): data = oxutils.cache.getUrl(magazine['coverUrl']) f = open(filename, 'w') f.write(data) f.close() + for page in magazine['pageUrl']: + url = magazine['pageUrl'][page] + if url: + filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page) + if not os.path.exists(filename): + data = oxutils.cache.getUrl(url) + f = open(filename, 'w') + f.write(data) + f.close() def archiveNews():