# Scrapers for spiegel.de: the Spiegel Online news archive (getNews) and the
# DER SPIEGEL e-paper issues (getIssue), plus two example archiving scripts.
#
# NOTE(review): the copy of this file that was reviewed had lost its line
# structure, and every HTML-like fragment inside string literals had been
# stripped (regular expressions in getNews, the whole of splitTitle, the head
# of formatString, the HTML entities in formatString). Those parts are a
# best-effort reconstruction derived from the surviving fragments and from
# the way archiveNews consumes the results. Each reconstructed spot carries
# its own NOTE(review); confirm them against version control.

from datetime import datetime
import re
import time

from BeautifulSoup import BeautifulSoup

import oxutils.cache
from oxutils.html import decodeHtml, stripTags
import oxutils.net

# NOTE(review): the markup inside these patterns was stripped from the
# reviewed copy. Only the '(.*?)' groups, the trailing '<' of the description
# pattern and the re.DOTALL flags of the date, description and headline
# patterns survived; everything else here is reconstructed -- TODO confirm.
_ITEM_RE = re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL)
_DATE_RE = re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL)
_DESCRIPTION_RE = re.compile('<p>(.*?)<', re.DOTALL)
_IMAGE_URL_RE = re.compile('<img src="(.*?)"')
_TITLE_RE = re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL)
_HEADLINE_RE = re.compile('<h4>(.*?)</h4>', re.DOTALL)
_LINK_RE = re.compile('<a href="(.*?)"')

# Matches the table-of-contents links of an e-paper issue.
_ISSUE_LINK_RE = re.compile(r'http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')
_ISSUE_PAGE_RE = re.compile('&SE=(.*?)"')


def _firstMatch(pattern, text):
    '''
    return the first match of the compiled pattern in text, or '' if none
    '''
    matches = pattern.findall(text)
    if matches:
        return matches[0]
    return ''


def _writeFile(filename, data, mode='w'):
    '''
    write data to filename, closing the file even if the write fails
    '''
    with open(filename, mode) as f:
        f.write(data)


def getNews(year, month, day):
    '''
    return the news teasers listed in the Spiegel Online archive for one day

    The archive page of every section is fetched for the given date; the
    cache is bypassed only when the date is today. Each returned item is a
    dict with the keys 'date', 'description', 'imageUrl', 'section', 'title',
    'title1', 'title2' and 'url'.

    Raises IndexError if a teaser has no date, or if an accepted teaser has
    no headline or link.

    NOTE(review): the body between the description lookup and the 'title1'
    computation was missing from the reviewed copy. The 'date' format
    ('YYYY-MM-DD HH:MM') is inferred from the slicing done in archiveNews;
    the acceptance condition and the title clean-up are reconstructed --
    TODO confirm.
    '''
    sections = [
        'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
        'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
    ]
    dt = datetime(year, month, day)
    dayOfYear = int(dt.strftime('%j'))
    date = dt.strftime('%d.%m.%Y')
    # today's archive pages still change, so they must not come from the cache
    isToday = date == time.strftime('%d.%m.%Y', time.localtime())
    news = []
    for section in sections:
        url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, dayOfYear)
        if isToday:
            html = oxutils.net.getUrl(url)
        else:
            html = oxutils.cache.getUrl(url)
        for item in _ITEM_RE.findall(html):
            dateString = stripTags(_DATE_RE.findall(item)[0]).strip()
            # the optional fields fall back to '' when the markup is absent
            description = formatString(_firstMatch(_DESCRIPTION_RE, item))
            imageUrl = _firstMatch(_IMAGE_URL_RE, item)
            title = formatString(_firstMatch(_TITLE_RE, item))
            title = title.replace(' : ', ': ').replace('::', ':')
            if dateString[:10] != date or not description or not imageUrl or title.find(': ') == -1:
                continue
            new = {}
            if len(dateString) == 10:
                # date without a time of day
                new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
            else:
                new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
            new['description'] = formatString(description)
            new['imageUrl'] = imageUrl
            new['section'] = formatSection(section)
            new['title'] = formatString(title)
            # title1 is the leading part of the title, as long as the headline.
            # '\xdf' is doubled while measuring and collapsed again afterwards;
            # presumably the headline spells it with two characters -- TODO confirm.
            headline = formatString(_HEADLINE_RE.findall(item)[0])
            new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(headline)].replace('\xdf\xdf', '\xdf')
            if new['title1'][-1:] == ':':
                new['title1'] = new['title1'][0:-1]
            # skip the ': ' separator
            new['title2'] = new['title'][len(new['title1']) + 2:]
            new['url'] = _LINK_RE.findall(item)[0]
            if new['url'][:1] == '/':
                new['url'] = 'http://www.spiegel.de' + new['url']
            news.append(new)
    return news


def splitTitle(title):
    '''
    split a title at its first ': ' and return [title1, title2]

    NOTE(review): this function was missing from the reviewed copy. The
    two-element list result is required by archiveNews, which appends to it
    and reads items 0 and 1; the split rule itself is reconstructed --
    TODO confirm. A title without ': ' yields [title, ''].
    '''
    title1, separator, title2 = title.partition(': ')
    return [title1, title2]


def formatString(string):
    '''
    normalise a scraped text fragment: collapse whitespace, decode entities

    NOTE(review): the first replaced literal, the double space and the three
    entity names were damaged in the reviewed copy and are reconstructed --
    TODO confirm.
    '''
    string = string.replace('<span class="spOptiBreak"> </span>', '')
    string = string.replace('\n', ' ').replace('  ', ' ').strip()
    string = string.replace('&amp;', '&').replace('&apos;', '\'').replace('&quot;', '"')
    return string


def formatSection(string):
    '''
    capitalise a section name and upper-case any 'spiegel' after position 0
    '''
    return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')


def formatSubsection(string):
    '''
    return the display name of a subsection, capitalising unknown ones
    '''
    # SPIEGEL, SPIEGEL special
    subsection = {
        'abi': 'Abi - und dann?',
        'formel1': 'Formel 1',
        'jobundberuf': 'Job & Beruf',
        'leben': 'Leben U21',
        'mensch': 'Mensch & Technik',
        'sonst': '',
        'staedte': u'St\xc3dte',
        'ussports': 'US-Sports',
        'wunderbar': 'wunderBAR'
    }
    if string in subsection:
        return subsection[string].replace(u'\xc3', 'ae')
    return string[:1].upper() + string[1:]


def getIssue(year, week):
    '''
    return the contents and page image URLs of one DER SPIEGEL issue

    Returns None when the cover image of the issue does not exist. Otherwise
    returns a dict with 'pages' (page of the last contents entry plus 2),
    'contents' (list of {'title', 'page'}), 'coverUrl' and 'pageUrl' (page
    number -> image URL, '' where the image does not exist).
    '''
    coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
    if not oxutils.net.exists(coverUrl):
        return None
    url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
    contents = []
    # stays 0 when the contents page lists nothing, instead of raising NameError
    lastPage = 0
    soup = BeautifulSoup(oxutils.cache.getUrl(url))
    for item in soup('a', {'href': _ISSUE_LINK_RE}):
        item = str(item)
        lastPage = int(_ISSUE_PAGE_RE.findall(item)[0])
        title = stripTags(item).strip()
        contents.append({'title': title, 'page': lastPage})
    pageUrl = {}
    pages = lastPage + 2
    # probe some pages beyond the estimate; missing ones are recorded as ''
    for page in range(1, pages + 10):
        url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
        if oxutils.cache.exists(url):
            pageUrl[page] = url
        else:
            pageUrl[page] = ''
    return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}


def archiveIssues():
    '''
    this is just an example of an archiving application
    '''
    p = {}
    import os
    import simplejson
    import time
    archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Der Spiegel'
    localtime = time.localtime()
    year = int(time.strftime('%Y', localtime))
    week = int(time.strftime('%W', localtime))
    for y in range(year, 1993, -1):
        if y == year:
            wMax = week + 1
        else:
            wMax = 53
        for w in range(wMax, 0, -1):
            print('getIssue(%d, %d)' % (y, w))
            issue = getIssue(y, w)
            if not issue:
                continue
            dirname = '%s/%d/%02d' % (archivePath, y, w)
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w)
            if not os.path.exists(filename):
                _writeFile(filename, simplejson.dumps(issue, ensure_ascii=False))
            filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w)
            if not os.path.exists(filename):
                lines = ['%3d %s' % (item['page'], item['title']) for item in issue['contents']]
                _writeFile(filename, '\n'.join(lines))
            filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
            if not os.path.exists(filename):
                # image data must be written in binary mode
                _writeFile(filename, oxutils.cache.getUrl(issue['coverUrl']), 'wb')
            for page in issue['pageUrl']:
                url = issue['pageUrl'][page]
                if url:
                    filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
                    if not os.path.exists(filename):
                        _writeFile(filename, oxutils.cache.getUrl(url), 'wb')
            # running statistics of the page counts: number, sum, min, max
            if not p:
                p = {'num': 1, 'sum': issue['pages'], 'min': issue['pages'], 'max': issue['pages']}
            else:
                p['num'] += 1
                p['sum'] += issue['pages']
                if issue['pages'] < p['min']:
                    p['min'] = issue['pages']
                if issue['pages'] > p['max']:
                    p['max'] = issue['pages']
            # NOTE(review): the indentation of this line was lost; it is placed
            # inside the per-issue branch, where p is always set -- TODO confirm.
            print('%d %d %d' % (p['min'], p['sum'] // p['num'], p['max']))


def archiveNews():
    '''
    this is just an example of an archiving application
    '''
    import calendar
    import os
    import simplejson
    import time

    count = {}
    colon = []
    archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'
    localtime = time.localtime()
    year = int(time.strftime('%Y', localtime))
    month = int(time.strftime('%m', localtime))
    # start with yesterday
    day = int(time.strftime('%d', localtime)) - 1
    for y in range(year, 1999, -1):
        if y == year:
            mMax = month
        else:
            mMax = 12
        for m in range(mMax, 0, -1):
            if y == year and m == month:
                dMax = day
            else:
                # monthrange applies the full leap-year rule, so 2000-02-29
                # is included
                dMax = calendar.monthrange(y, m)[1]
            for d in range(dMax, 0, -1):
                print('getNews(%d, %d, %d)' % (y, m, d))
                news = getNews(y, m, d)
                for new in news:
                    # <archivePath>/YYYY/MMDD/HHMM
                    dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)
                    if new['url'][-5:] == '.html':
                        filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
                    else:
                        # NOTE(review): this appends the whole URL, slashes
                        # included, to the directory -- TODO confirm intent.
                        filename = dirname + '/' + new['url'] + '.json'
                    # the .json and .txt files are always rewritten (the
                    # original existence checks were disabled with 'or True')
                    _writeFile(filename, simplejson.dumps(new, ensure_ascii=False))
                    filename = filename[:-5] + '.txt'
                    lines = splitTitle(new['title'])
                    lines.append(new['description'])
                    _writeFile(filename, '\n'.join(lines))
                    filename = dirname + '/' + new['imageUrl'].split('/')[-1]
                    if not os.path.exists(filename):
                        # image data must be written in binary mode
                        _writeFile(filename, oxutils.cache.getUrl(new['imageUrl']), 'wb')

                    # count the items per section[/subsection], keeping the
                    # most recently seen date in front of the first-seen
                    # date and archive URL
                    strings = new['url'].split('/')
                    string = strings[3]
                    if len(strings) == 6:
                        string += '/' + strings[4]
                    if string not in count:
                        count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
                    else:
                        # [17:] drops the previous leading date and its space
                        count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}

                    # collect titles whose scraped split disagrees with splitTitle
                    strings = splitTitle(new['title'])
                    if strings[0] != new['title1'] or strings[1] != new['title2']:
                        colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2']))
    for key in sortDictByKey(count):
        print('%6d %-24s %s' % (count[key]['count'], key, count[key]['string']))
    for value in colon:
        print(value)


def sortDictByKey(d):
    '''
    return the keys of d as a sorted list
    '''
    return sorted(d)


if __name__ == '__main__':
    # spiegel = Spiegel(2008, 8)
    # print spiegel.getContents()
    # news = News(2001, 9, 10)
    # output(news.getNews())
    '''
    x = []
    for d in range(10, 30):
        print '2/%d' % d
        news = getNews(2008, 2, d)
        for new in news:
            strings = new['url'].split('/')
            string = formatSection(strings[3])
            if len(strings) == 6:
                string += '/' + formatSubsection(strings[4])
            if not string in x:
                x.append(string)
    print x
    '''
    # archiveIssues()
    archiveNews()