spiegel.py news
This commit is contained in:
parent
28d84dd3eb
commit
a360bba9b5
1 changed files with 143 additions and 31 deletions
174
ox/spiegel.py
174
ox/spiegel.py
|
@ -1,42 +1,154 @@
|
|||
from datetime import datetime
|
||||
import re
|
||||
from time import gmtime, strftime
|
||||
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
|
||||
from oxutils.cache import getUrl
|
||||
from oxutils.html import stripTags
|
||||
import oxutils.net
|
||||
|
||||
def output(news):
    '''
    Print a short plain-text summary of every news item.

    For each item three lines are written -- "<section>, <dateString>",
    "<title0>: <title1>" and the description -- followed by an empty line.

    news -- iterable of dicts providing the keys 'section', 'dateString',
            'title0', 'title1' and 'description'
    '''
    for item in news:
        header = item['section'] + ', ' + item['dateString']
        headline = item['title0'] + ': ' + item['title1']
        # the trailing '' makes the joined text end with a newline, so the
        # print call leaves one blank line between items
        print('\n'.join([header, headline, item['description'], '']))
|
||||
|
||||
def getNews(year, month, day):
    '''
    Scrape the spiegel.de section archives of one day and return its news.

    For every section the archive page addressed by year and day-of-year is
    fetched with getUrl() and split into teasers.  A teaser is kept only if
    its date string starts with the requested date and it has both a
    description and an image; otherwise a short diagnostic is printed.

    year, month, day -- calendar date; datetime() raises ValueError if the
                        combination is not a valid date

    Returns a list of dicts with the keys 'date' ('YYYY-MM-DD HH:MM'),
    'dateString', 'description', 'imageUrl', 'section', 'title', 'title0'
    (text before the first ': ' of the title), 'title1' (text after it)
    and 'url'.  If the title contains no ': ', 'title0' is '' and 'title1'
    holds the whole title.
    '''
    # compiled once instead of once per teaser
    teaserPattern = re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL)
    datePattern = re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL)
    descriptionPattern = re.compile('<p>(.*?)<', re.DOTALL)
    imagePattern = re.compile('<img src="(.*?)"')
    # the quote class used to be [\'|"], which also accepted a literal '|'
    titlePattern = re.compile('title=[\'"](.*?)[\'"] />', re.DOTALL)
    linkPattern = re.compile('<a href="(.*?)"')

    sections = [
        'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
        'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
    ]
    dt = datetime(year, month, day)
    # the archive URL takes the day of the year (1..366), not the day of month
    dayOfYear = int(dt.strftime('%j'))
    date = dt.strftime('%d.%m.%Y')
    news = []
    for section in sections:
        url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, dayOfYear)
        print(url)
        html = getUrl(url)
        for item in teaserPattern.findall(html):
            dateString = stripTags(datePattern.findall(item)[0]).strip()
            # a missing match is the only expected failure here, so catch
            # IndexError only instead of swallowing every exception
            try:
                description = descriptionPattern.findall(item)[0].strip()
            except IndexError:
                description = ''
            try:
                imageUrl = imagePattern.findall(item)[0]
            except IndexError:
                imageUrl = ''
            if dateString[:10] == date and description and imageUrl:
                new = {}
                if len(dateString) == 10:
                    # date without a time of day
                    new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
                else:
                    new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
                new['dateString'] = dateString
                new['description'] = formatString(description)
                new['imageUrl'] = imageUrl
                new['section'] = formatSection(section)
                new['title'] = formatString(titlePattern.findall(item)[0])
                # split at the first ': '; a title without that separator
                # used to raise IndexError and abort the whole scrape
                title0, separator, title1 = new['title'].partition(': ')
                if not separator:
                    title0, title1 = '', new['title']
                new['title0'] = title0
                new['title1'] = title1
                new['url'] = 'http://www.spiegel.de' + linkPattern.findall(item)[0]
                news.append(new)
                print(dateString + ' - ok')
            elif not description:
                print(dateString + ' - no description')
            elif not imageUrl:
                print(dateString + ' - no image')
    return news
|
||||
|
||||
def formatString(string):
    '''
    Return string as a single line with runs of spaces collapsed.

    Newlines are turned into spaces, every run of two or more spaces is
    reduced to a single space, and leading/trailing whitespace is stripped.
    '''
    # the previous .replace(' ', ' ') replaced a space with itself and so
    # never collapsed anything
    return re.sub(' {2,}', ' ', string.replace('\n', ' ')).strip()
|
||||
|
||||
def formatSection(string):
    '''
    Return the display form of a section name.

    The first character is upper-cased; in the remainder every occurrence
    of 'spiegel' is written as 'SPIEGEL' (e.g. 'unispiegel' -> 'UniSPIEGEL').
    '''
    head, tail = string[:1], string[1:]
    return head.upper() + tail.replace('spiegel', 'SPIEGEL')
|
||||
|
||||
def formatSubsection(string):
    '''
    Return the display form of a subsection name.

    Names listed in the table below map to their fixed display string
    (which may be empty); any other name is returned with its first
    character upper-cased.
    '''
    subsection = {
        'abi': 'Abi - und dann?',
        'formel1': 'Formel 1',
        'jobundberuf': 'Job & Beruf',
        'leben': 'Leben U21',
        'mensch': 'Mensch & Technik',
        'sonst': '',
        # \xe4 is a-umlaut; the former u'St\xc3dte' rendered as "StÃdte"
        'staedte': u'St\xe4dte',
        'ussports': 'US-Sports',
        'wunderbar': 'wunderBAR'
    }
    # dict.get() replaces the deprecated has_key() lookup and still returns
    # the empty string mapped to 'sonst'
    return subsection.get(string, string[:1].upper() + string[1:])
|
||||
|
||||
|
||||
class Spiegel:
    '''
    One e-paper issue of DER SPIEGEL, addressed by year and week number.

    Attributes:
    year, week  -- the values passed to the constructor
    coverUrl    -- URL of the cover image, or '' if that URL does not exist
    contentsUrl -- URL of the table-of-contents page, or '' if the cover
                   does not exist
    contents    -- list filled by getContents()
    '''

    def __init__(self, year, week):
        '''
        Build the issue URLs and check that the cover exists.

        The class used to carry a second, earlier __init__ that validated
        year and week against hard-coded ranges; it was shadowed by this
        definition and never ran, so it has been removed.
        '''
        self.year = year
        self.week = week
        self.contents = []
        # always defined, so getContents() can test it even when the cover
        # check below fails (it used to be left unset in that case)
        self.contentsUrl = ''
        self.coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d0%02d0001-312.jpg' % (self.year, self.week, self.year, self.week)
        if not oxutils.net.exists(self.coverUrl):
            self.coverUrl = ''
            return
        self.contentsUrl = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (self.year, self.week)

    def getContents(self):
        '''
        Fetch the table of contents and return it.

        Returns (and stores in self.contents) a list of dicts with the keys
        'title' (link text with tags stripped) and 'page' (the SE parameter
        of the link, as an int).  If the issue has no contents URL the list
        is empty and nothing is fetched.
        '''
        self.contents = []
        if not self.contentsUrl:
            return self.contents
        soup = BeautifulSoup(getUrl(self.contentsUrl))
        # raw string: same pattern as before, without the invalid '\?' escape
        hrefPattern = re.compile(r'http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')
        pagePattern = re.compile('&SE=(.*?)"')
        for item in soup('a', {'href': hrefPattern}):
            item = str(item)
            title = stripTags(item).strip()
            page = int(pagePattern.findall(item)[0])
            self.contents.append({'title': title, 'page': page})
        return self.contents
|
||||
|
||||
def _writeFile(filename, data, mode='w'):
    '''
    Write data to filename, closing the file even if the write fails.
    '''
    f = open(filename, mode)
    try:
        f.write(data)
    finally:
        f.close()


def archiveNews():
    '''
    this is just an example of an archiving application

    For every day of 2007 the news returned by getNews() is stored below
    archivePath in <year>/<month><day>/<hour><minute>/ as a .json file
    (the complete item), a .txt file (title0, title1 and description, one
    per line) and the downloaded teaser image.
    '''
    import calendar
    import os
    import simplejson
    archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de'
    for y in range(2007, 2008):
        for m in range(1, 13):
            # monthrange()[1] is the number of days of the month; looping to
            # 31 for every month made getNews() raise ValueError on dates
            # such as February 29th, 2007
            for d in range(1, calendar.monthrange(y, m)[1] + 1):
                news = getNews(y, m, d)
                for new in news:
                    # new['date'] has the form 'YYYY-MM-DD HH:MM'
                    dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)
                    # [:-5] drops the last five characters of the final URL
                    # component (presumably '.html' -- TODO confirm)
                    filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
                    _writeFile(filename, simplejson.dumps(new, ensure_ascii = False))
                    # same base name, 'json' replaced by 'txt'
                    filename = filename[:-4] + 'txt'
                    _writeFile(filename, '\n'.join([new['title0'], new['title1'], new['description']]))
                    filename = dirname + '/' + new['imageUrl'].split('/')[-1]
                    # image data is binary, so it must not be written in text mode
                    _writeFile(filename, getUrl(new['imageUrl']), 'wb')
|
||||
|
||||
def getContents(self):
    '''
    Fetch the table of contents found at self.contentsUrl and return it.

    Returns (and stores in self.contents) a list of dicts with the keys
    'title' (link text with tags stripped) and 'page' (the SE parameter of
    the link, kept as the matched string).
    '''
    # NOTE(review): this looks like an older copy of Spiegel.getContents
    # that leaves 'page' unconverted -- confirm whether it is still needed
    self.contents = entries = []
    hrefPattern = re.compile('http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')
    pagePattern = re.compile('&SE=(.*?)"')
    soup = BeautifulSoup(getUrl(self.contentsUrl))
    for link in soup('a', {'href': hrefPattern}):
        markup = str(link)
        entries.append({
            'title': stripTags(markup).strip(),
            'page': pagePattern.findall(markup)[0]
        })
    return self.contents
|
||||
|
||||
if __name__ == '__main__':
    # show the table of contents of issue 8 of 2008
    spiegel = Spiegel(2008, 8)
    spiegel.getContents()
    print(spiegel.contents)
    # spiegel = Spiegel(2008, 8)
    # print spiegel.getContents()
    # news = News(2001, 9, 10)
    # output(news.getNews())
    # disabled snippet, kept as a bare string: collects the distinct
    # section/subsection names of the news from February 10th to 29th, 2008
    '''
    x = []
    for d in range(10, 30):
        print '2/%d' % d
        news = getNews(2008, 2, d)
        for new in news:
            strings = new['url'].split('/')
            string = formatSection(strings[3])
            if len(strings) == 6:
                string += '/' + formatSubsection(strings[4])
            if not string in x:
                x.append(string)
    print x
    '''
    # then run the example archiver
    archiveNews()
|
Loading…
Reference in a new issue