spiegel.py news

This commit is contained in:
Rolux 2008-04-30 20:51:27 +02:00
parent 28d84dd3eb
commit a360bba9b5

View file

@ -1,42 +1,154 @@
from datetime import datetime
import re
from time import gmtime, strftime

from BeautifulSoup import BeautifulSoup
import oxutils.net
from oxutils.cache import getUrl
from oxutils.html import stripTags
def output(news):
    '''
    Print every news item as a small plain-text record.

    news is an iterable of dicts as built by getNews(); each must provide the
    keys 'section', 'dateString', 'title0', 'title1' and 'description'.
    Each record is three lines followed by an empty line:
        <section>, <dateString>
        <title0>: <title1>
        <description>
    '''
    for new in news:
        record = [
            new['section'] + ', ' + new['dateString'],
            new['title0'] + ': ' + new['title1'],
            new['description'],
            '',
        ]
        # Single-argument parenthesised form: identical output under the
        # Python 2 print statement and the Python 3 print function.
        print('\n'.join(record))
# Patterns used by getNews(), compiled once instead of once per teaser.
_TEASER_RE = re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL)
_DATETIME_RE = re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL)
_DESCRIPTION_RE = re.compile('<p>(.*?)<', re.DOTALL)
_IMAGE_RE = re.compile('<img src="(.*?)"')
_TITLE_RE = re.compile('title=[\'|"](.*?)[\'|"] />', re.DOTALL)
_LINK_RE = re.compile('<a href="(.*?)"')


def getNews(year, month, day):
    '''
    Scrape the spiegel.de section archive pages for one calendar day.

    For each section the archive page of the given day is fetched with
    getUrl() and every teaser block on it is inspected. A teaser is kept only
    if its date matches the requested day and it has both a description and
    an image; the reason for skipping a teaser is printed.

    Returns a list of dicts with the keys 'date' ('YYYY-MM-DD HH:MM'),
    'dateString', 'description', 'imageUrl', 'section', 'title', 'title0',
    'title1' and 'url'.

    Raises ValueError (from datetime) if year/month/day is not a valid date,
    and IndexError if a teaser lacks its date block or if an accepted teaser
    lacks a title attribute or a link.
    '''
    sections = [
        'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
        'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
    ]
    dt = datetime(year, month, day)
    # The archive URL addresses a day by its number within the year; keep it
    # in its own name rather than overwriting the 'day' parameter.
    dayOfYear = int(dt.strftime('%j'))
    date = dt.strftime('%d.%m.%Y')
    news = []
    for section in sections:
        url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, dayOfYear)
        print(url)
        html = getUrl(url)
        for item in _TEASER_RE.findall(html):
            dateString = stripTags(_DATETIME_RE.findall(item)[0]).strip()

            # A missing description or image is an expected case, so test for
            # an empty match list instead of swallowing every exception.
            description = ''
            descriptions = _DESCRIPTION_RE.findall(item)
            if descriptions:
                description = descriptions[0].strip()
            imageUrl = ''
            images = _IMAGE_RE.findall(item)
            if images:
                imageUrl = images[0]

            if dateString[:10] == date and description and imageUrl:
                new = {}
                # dateString starts with 'DD.MM.YYYY'; a longer string is
                # sliced as if 'HH:MM' began at offset 12.
                if len(dateString) == 10:
                    new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
                else:
                    new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
                new['dateString'] = dateString
                new['description'] = formatString(description)
                new['imageUrl'] = imageUrl
                new['section'] = formatSection(section)
                new['title'] = formatString(_TITLE_RE.findall(item)[0])
                # Split at the first ': '. A title without that separator
                # becomes title1 with an empty title0 instead of raising
                # IndexError.
                title0, separator, title1 = new['title'].partition(': ')
                if not separator:
                    title0, title1 = '', new['title']
                new['title0'] = title0
                new['title1'] = title1
                new['url'] = 'http://www.spiegel.de' + _LINK_RE.findall(item)[0]
                news.append(new)
                print(dateString + ' - ok')
            elif not description:
                print(dateString + ' - no description')
            elif not imageUrl:
                print(dateString + ' - no image')
    return news
def formatString(string):
    '''
    Flatten a text fragment onto one line.

    Newlines become spaces, every run of two or more spaces is collapsed to a
    single space, and leading/trailing whitespace is stripped.
    '''
    flattened = string.replace('\n', ' ')
    # The previous second step replaced a single space with a single space,
    # which did nothing; collapse the runs that newline replacement leaves.
    return re.sub(' {2,}', ' ', flattened).strip()
def formatSection(string):
    '''
    Turn a section slug into its display name.

    The first character is upper-cased; in the remainder every occurrence of
    'spiegel' is written as 'SPIEGEL' (e.g. 'unispiegel' -> 'UniSPIEGEL').
    '''
    initial = string[:1].upper()
    remainder = string[1:].replace('spiegel', 'SPIEGEL')
    return initial + remainder
# Display names for subsection slugs that are not simply capitalised.
# Built once at import time instead of on every call.
_SUBSECTION_TITLES = {
    'abi': 'Abi - und dann?',
    'formel1': 'Formel 1',
    'jobundberuf': 'Job & Beruf',
    'leben': 'Leben U21',
    'mensch': 'Mensch & Technik',
    'sonst': '',
    # u'\xe4' is LATIN SMALL LETTER A WITH DIAERESIS; the previous escape
    # \xc3 rendered the name as 'St\xc3dte' with a capital A-tilde.
    'staedte': u'St\xe4dte',
    'ussports': 'US-Sports',
    'wunderbar': 'wunderBAR'
}


def formatSubsection(string):
    '''
    Turn a subsection slug into its display name.

    Slugs listed in _SUBSECTION_TITLES map to their fixed name (which may be
    the empty string, as for 'sonst'); any other slug is returned with its
    first character upper-cased.
    '''
    # dict.get with an explicit default keeps the '' entry intact, which an
    # "or" fallback would not.
    return _SUBSECTION_TITLES.get(string, string[:1].upper() + string[1:])
class Spiegel:
    '''
    One issue of the spiegel.de e-paper, addressed by year and week.

    After construction:
        coverUrl    -- URL of the cover image, or '' if oxutils.net.exists()
                       reports that no such cover is available
        contentsUrl -- URL of the table-of-contents page, or '' if there is
                       no cover
        contents    -- list filled by getContents(); empty until then
    '''

    # Compiled once for all instances; raw strings so that '\?' is an
    # explicit regex escape rather than an unrecognised string escape.
    _contentsLinkRe = re.compile(r'http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')
    _pageRe = re.compile(r'&amp;SE=(.*?)"')

    def __init__(self, year, week):
        self.year = year
        self.week = week
        # Always defined, so getContents() is safe to call even when the
        # early return below is taken.
        self.contents = []
        self.contentsUrl = ''
        self.coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d0%02d0001-312.jpg' % (self.year, self.week, self.year, self.week)
        if not oxutils.net.exists(self.coverUrl):
            # No cover: leave both URLs empty.
            self.coverUrl = ''
            return
        self.contentsUrl = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (self.year, self.week)

    def getContents(self):
        '''
        Fetch the table-of-contents page and collect its entries.

        Stores and returns a list of {'title': <text>, 'page': <int>} dicts,
        one per link pointing into the e-paper servlet. Returns an empty list
        without fetching anything when contentsUrl is empty.

        Raises IndexError if a matching link carries no SE= parameter and
        ValueError if that parameter is not an integer.
        '''
        self.contents = []
        if not getattr(self, 'contentsUrl', ''):
            return self.contents
        soup = BeautifulSoup(getUrl(self.contentsUrl))
        for link in soup('a', {'href': self._contentsLinkRe}):
            markup = str(link)
            title = stripTags(markup).strip()
            page = int(self._pageRe.findall(markup)[0])
            self.contents.append({'title': title, 'page': page})
        return self.contents
def _writeFile(filename, data, mode='w'):
    '''Write data to filename, closing the file even if the write fails.'''
    f = open(filename, mode)
    try:
        f.write(data)
    finally:
        f.close()


def archiveNews(archivePath='/Volumes/Rolux Home/Desktop/Data/spiegel.de'):
    '''
    this is just an example of an archiving application

    Walks every day of 2007, fetches that day's items with getNews() and
    stores three files per item below
    <archivePath>/<YYYY>/<MMDD>/<HHMM>/:
        <name>.json  -- the item dict serialised with simplejson
        <name>.txt   -- title0, title1 and description, one per line
        <image>      -- the teaser image, under its original file name
    where <name> is the last path component of the item URL without its
    final five characters (the '.html' suffix, presumably).

    archivePath defaults to the location this example was written for.
    '''
    import calendar
    import os
    import simplejson
    for y in range(2007, 2008):
        for m in range(1, 13):
            # Only iterate over days that exist: looping to 31 for every
            # month made getNews() raise ValueError on e.g. 30 February.
            daysInMonth = calendar.monthrange(y, m)[1]
            for d in range(1, daysInMonth + 1):
                news = getNews(y, m, d)
                for new in news:
                    # new['date'] has the form 'YYYY-MM-DD HH:MM'.
                    stamp = new['date']
                    dirname = archivePath + '/' + stamp[0:4] + '/' + stamp[5:7] + stamp[8:10] + '/' + stamp[11:13] + stamp[14:16]
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)
                    filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
                    _writeFile(filename, simplejson.dumps(new, ensure_ascii = False))
                    # Same base name, 'json' extension swapped for 'txt'.
                    filename = filename[:-4] + 'txt'
                    _writeFile(filename, '\n'.join([new['title0'], new['title1'], new['description']]))
                    filename = dirname + '/' + new['imageUrl'].split('/')[-1]
                    # Image data is binary; text mode could corrupt it on
                    # platforms that translate line endings.
                    _writeFile(filename, getUrl(new['imageUrl']), 'wb')
def getContents(self):
    '''
    Fetch the page at self.contentsUrl and collect its contents entries.

    Stores on self.contents, and returns, a list of
    {'title': <text>, 'page': <int>} dicts, one per link pointing into the
    e-paper servlet. Returns an empty list without fetching anything when
    self.contentsUrl is missing or empty.

    Raises IndexError if a matching link carries no SE= parameter and
    ValueError if that parameter is not an integer.
    '''
    self.contents = []
    if not getattr(self, 'contentsUrl', ''):
        return self.contents
    # Compiled once per call rather than once per link; raw strings so that
    # '\?' is an explicit regex escape.
    linkRe = re.compile(r'http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')
    pageRe = re.compile(r'&amp;SE=(.*?)"')
    soup = BeautifulSoup(getUrl(self.contentsUrl))
    for link in soup('a', {'href': linkRe}):
        markup = str(link)
        title = stripTags(markup).strip()
        # Page numbers are stored as integers, not as the raw query text.
        page = int(pageRe.findall(markup)[0])
        self.contents.append({'title': title, 'page': page})
    return self.contents
if __name__ == '__main__':
    # Manual checks, disabled:
    # spiegel = Spiegel(2008, 8)
    # print spiegel.getContents()
    # output(getNews(2001, 9, 10))

    # Disabled snippet (kept as an unused string literal) that lists the
    # distinct section/subsection names seen in item URLs over a range of
    # days.
    '''
    x = []
    for d in range(10, 30):
        print '2/%d' % d
        news = getNews(2008, 2, d)
        for new in news:
            strings = new['url'].split('/')
            string = formatSection(strings[3])
            if len(strings) == 6:
                string += '/' + formatSubsection(strings[4])
            if not string in x:
                x.append(string)
    print x
    '''
    # Running the module as a script starts the example archiver.
    archiveNews()