
This commit is contained in:
Rolux 2008-04-30 20:51:27 +02:00
parent 28d84dd3eb
commit a360bba9b5

View file

@ -1,29 +1,93 @@
from datetime import datetime
import re import re
from time import gmtime, strftime
from BeautifulSoup import BeautifulSoup from BeautifulSoup import BeautifulSoup
from oxutils.cache import getUrl from oxutils.cache import getUrl
from oxutils.html import stripTags from oxutils.html import stripTags
def output(news):
    """Print every news item as a short plain-text block.

    For each item three lines are printed -- "section, dateString",
    "title0: title1" and the description -- followed by one blank line.

    news: iterable of dicts holding the string values 'section',
        'dateString', 'title0', 'title1' and 'description'.
    """
    for new in news:
        lines = [
            new['section'] + ', ' + new['dateString'],
            new['title0'] + ': ' + new['title1'],
            new['description'],
            '',  # trailing empty entry yields the blank separator line
        ]
        # A single parenthesised argument is accepted both by the Python 2
        # print statement and by the Python 3 print function.
        print('\n'.join(lines))
# Fetch one archive page per section for the given calendar day, pick the
# teaser entries out of the HTML with regular expressions and build one dict
# per entry.
# NOTE(review): this block has lost its indentation and, apparently, several
# lines (see the notes below); it must be restored before it can run.
def getNews(year, month, day):
sections = [
'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
# NOTE(review): the closing ']' of the list above is not present.
dt = datetime(year, month, day)
# 'day' is rebound from day-of-month to day-of-year (%j); the archive URL
# below is formatted with that value.
day = int(dt.strftime('%j'))
# 'date' is DD.MM.YYYY and is compared with the first 10 characters of each
# teaser's date string further down.
date = dt.strftime('%d.%m.%Y')
news = []
for section in sections:
# NOTE(review): three arguments are supplied for two placeholders, so this
# raises TypeError as written -- the start of the template (presumably the
# host plus a %s for the section) appears to have been stripped; confirm.
url = ',1518,archiv-%d-%03d,00.html' % (section, year, day)
print url
html = getUrl(url)
for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
# findall(...)[0] raises IndexError when the teaser has no spDateTime div.
dateString = stripTags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
# NOTE(review): as shown, description and imageUrl are each assigned and then
# immediately overwritten with '', which makes the test below always false.
# Presumably the '' assignments were fallbacks inside try/except blocks whose
# try/except lines are missing -- confirm.
description = re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0].strip()
description = ''
imageUrl = re.compile('<img src="(.*?)"').findall(item)[0]
imageUrl = ''
# Keep only teasers dated on the requested day that have both a description
# and an image.
if dateString[:10] == date and description and imageUrl:
# print item
new = {}
# A 10-character date string carries no time of day, so 00:00 is used.
# NOTE(review): the two assignments below look like the branches of an
# if/else whose 'else:' line is missing -- confirm.
if len(dateString) == 10:
new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
new['dateString'] = dateString
new['description'] = formatString(description)
new['imageUrl'] = imageUrl
new['section'] = formatSection(section)
# The character class [\'|"] also accepts a literal '|' as the quote.
new['title'] = formatString(re.compile('title=[\'|"](.*?)[\'|"] />', re.DOTALL).findall(item)[0])
# title0 is the text before the first ': ', title1 the text after it; both
# [0] lookups raise IndexError when the title contains no ': '.
new['title0'] = re.compile('(.*?): ').findall(new['title'])[0]
new['title1'] = re.compile(': (.*?)$').findall(new['title'])[0]
# NOTE(review): the '' prefix looks like a stripped base URL -- confirm.
new['url'] = '' + re.compile('<a href="(.*?)"').findall(item)[0]
# NOTE(review): no news.append(new) is visible, so as shown 'news' is
# returned empty -- confirm whether that line was lost.
print dateString + ' - ok'
elif not description:
print dateString + ' - no description'
elif not imageUrl:
print dateString + ' - no image'
return news
def formatString(string):
    """Flatten *string* onto a single line and tidy its spacing.

    Newlines are turned into spaces, every run of consecutive spaces is
    reduced to one space, and leading/trailing whitespace is stripped.
    """
    # The former replace(' ', ' ') swapped a space for a space and therefore
    # collapsed nothing; the regex handles runs of any length.
    single_line = string.replace('\n', ' ')
    return re.sub(' {2,}', ' ', single_line).strip()
def formatSection(string):
    """Return the display name for a section slug.

    The first character is upper-cased and every 'spiegel' in the remainder
    is written as 'SPIEGEL', e.g. 'unispiegel' -> 'UniSPIEGEL'.  An empty
    string is returned unchanged.
    """
    head, tail = string[:1], string[1:]
    # Only the tail is searched, so a leading 's' is never part of a match.
    return head.upper() + tail.replace('spiegel', 'SPIEGEL')
def formatSubsection(string):
    """Return the display name for a subsection slug.

    Slugs with a special spelling are looked up in a fixed table; any other
    slug is returned with its first character upper-cased.  An empty string
    is returned unchanged.
    """
    subsection = {
        'abi': 'Abi - und dann?',
        'formel1': 'Formel 1',
        'jobundberuf': 'Job & Beruf',
        'leben': 'Leben U21',
        'mensch': 'Mensch & Technik',
        'sonst': '',
        # Was u'St\xc3dte' (A-tilde); the a-umlaut the slug transliterates
        # is U+00E4.
        'staedte': u'St\xe4dte',
        'ussports': 'US-Sports',
        'wunderbar': 'wunderBAR',
    }
    # Membership test instead of dict.has_key(), which Python 3 removed.
    # A truthiness test would be wrong here because 'sonst' maps to ''.
    if string in subsection:
        return subsection[string]
    return string[:1].upper() + string[1:]
class Spiegel: class Spiegel:
# Validate (year, week) against a computed range of issues, then store them
# and derive the cover and contents URLs.
# NOTE(review): several lines below carry the same statement twice on one
# line and the block has no indentation; it must be restored before it can
# run.
def __init__(self, year, week): def __init__(self, year, week):
# fixme: simply check if cover exists
thisYear = int(strftime('%Y', gmtime()))
# %W is the week number of the year, counting Monday as the first day.
thisWeek = int(strftime('%W', gmtime()))
years = range(1994, thisYear + 1)
# For the current year, weeks 1 .. thisWeek + 1 are accepted.
if year == thisYear:
weeks = range(1, thisWeek + 2)
# 1998 and 2004 are given 53 weeks, every other past year 52.
# NOTE(review): the two assignments below look like elif/else branches whose
# 'else:' line is missing; as shown the second always overwrites the first.
elif year in [1998, 2004]:
weeks = range(1, 54)
weeks = range(1, 53)
# NOTE(review): returning None from __init__ does not prevent the instance
# from being created; the caller still receives an object, just without
# year/week/coverUrl/contentsUrl set.
if year not in years or week not in weeks:
return None
# end fixme
self.year = year self.year = year
self.week = week self.week = week
# NOTE(review): '' % (four values) raises TypeError; the URL template appears
# to have been stripped from this literal and from contentsUrl below.
self.coverUrl = '' % (self.year, self.week, self.year, self.week) self.coverUrl = '' % (self.year, self.week, self.year, self.week)
# NOTE(review): the condition below is truncated; presumably it checked
# something about coverUrl before blanking it -- confirm.
if not
self.coverUrl = ''
self.contentsUrl = '' % (self.year, self.week) self.contentsUrl = '' % (self.year, self.week)
def getContents(self): def getContents(self):
@ -32,11 +96,59 @@ class Spiegel:
for item in soup('a', {'href': re.compile('\?Q=SP&JG=')}): for item in soup('a', {'href': re.compile('\?Q=SP&JG=')}):
item = str(item) item = str(item)
title = stripTags(item).strip() title = stripTags(item).strip()
page = re.compile('&amp;SE=(.*?)"').findall(item)[0] page = int(re.compile('&amp;SE=(.*?)"').findall(item)[0])
self.contents.append({'title': title, 'page': page}) self.contents.append({'title': title, 'page': page})
return self.contents return self.contents
# Example archiver: walks every day of 2007, fetches that day's news with
# getNews() and prepares, per item, a .json file, a .txt file and the image
# file inside a per-item directory.
# NOTE(review): this block has lost its indentation and apparently some lines
# (see below); it must be restored before it can run.
def archiveNews():
# NOTE(review): the next line reads like a docstring whose quotes were lost.
this is just an example of an archiving application
import os
import simplejson
archivePath = '/Volumes/Rolux Home/Desktop/Data/'
# range(2007, 2008) covers the single year 2007.
for y in range(2007, 2008):
for m in range(1, 13):
# NOTE(review): d runs 1..31 for every month; getNews() builds
# datetime(year, month, day), which raises ValueError for dates that do not
# exist (e.g. 30 February) -- no handling is visible here.
for d in range(1, 32):
news = getNews(y, m ,d)
for new in news:
# new['date'] is 'YYYY-MM-DD HH:MM' (see getNews), so the directory becomes
# <archivePath>/YYYY/MMDD/HHMM.  archivePath already ends in '/', so the
# joined path contains a double slash.
dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
# NOTE(review): nothing follows this test in the visible text; presumably
# the directory was created here -- confirm.
if not os.path.exists(dirname):
# The last URL path segment minus its final 5 characters (presumably
# '.html') is used as the base file name.
filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
data = simplejson.dumps(new, ensure_ascii = False)
# NOTE(review): each of the three files is opened, but no write() or close()
# call is visible for any of them -- confirm whether those lines were lost.
f = open(filename, 'w')
# Swap the 'json' extension for 'txt'.
filename = filename[:-4] + 'txt'
data = '\n'.join([new['title0'], new['title1'], new['description']])
f = open(filename, 'w')
# The image is stored under the last path segment of its URL.
filename = dirname + '/' + new['imageUrl'].split('/')[-1]
data = getUrl(new['imageUrl'])
f = open(filename, 'w')
# Script entry point: ad-hoc exploration of the section/subsection names that
# occur in the news URLs of 10-29 February 2008.
# NOTE(review): the first four lines carry two fused columns, so what reads
# as a trailing comment may be the current version of the line -- confirm
# which half is live before restoring the block.
if __name__ == '__main__': if __name__ == '__main__':
spiegel = Spiegel(2008, 8) # spiegel = Spiegel(2008, 8)
spiegel.getContents() # print spiegel.getContents()
print spiegel.contents # news = News(2001, 9, 10)
# output(news.getNews())
x = []
# range(10, 30) covers days 10 through 29.
for d in range(10, 30):
print '2/%d' % d
news = getNews(2008, 2, d)
for new in news:
# Split the item URL on '/'; element 3 is treated as the section and, when
# there are exactly 6 parts, element 4 as the subsection.
strings = new['url'].split('/')
string = formatSection(strings[3])
if len(strings) == 6:
string += '/' + formatSubsection(strings[4])
# NOTE(review): nothing visible ever appends to x, so as shown this test is
# always true and x stays empty -- presumably an x.append(string) line is
# missing; confirm.
if not string in x:
print x