spiegel.py: cleanup

Rolux 2008-05-01 01:27:50 +02:00
parent a360bba9b5
commit 1b04735c68

spiegel.py

@@ -1,16 +1,13 @@
 from datetime import datetime
 import re
+import time
 from BeautifulSoup import BeautifulSoup
-from oxutils.cache import getUrl
-from oxutils.html import stripTags
+import oxutils.cache
+from oxutils.html import decodeHtml, stripTags
 import oxutils.net
 
-def output(news):
-    for new in news:
-        print '\n'.join([new['section'] + ', ' + new['dateString'], new['title0'] + ': ' + new['title1'], new['description'], ''])
-
 def getNews(year, month, day):
     sections = [
         'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
@@ -22,8 +19,10 @@ def getNews(year, month, day):
     news = []
     for section in sections:
         url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day)
-        print url
-        html = getUrl(url)
+        if date == time.strftime('%d.%m.%Y', time.localtime()):
+            html = oxutils.net.getUrl(url)
+        else:
+            html = oxutils.cache.getUrl(url)
         for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
             dateString = stripTags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
             try:
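Note on the fetch change above: only today's archive page is fetched live, because it may still be changing; any older page is final, so the cached copy is safe to reuse. The sketch below shows the fetch-once-then-reuse behavior that oxutils.cache.getUrl is assumed to provide — the function name, cache directory and file naming here are made up for illustration:

    import os
    import urllib

    def cachedGetUrl(url, cacheDir='/tmp/urlcache'):
        # fetch once, then serve the stored copy on later calls;
        # a stand-in for what oxutils.cache.getUrl presumably does
        path = os.path.join(cacheDir, urllib.quote(url, safe=''))
        if os.path.exists(path):
            return open(path).read()
        data = urllib.urlopen(url).read()
        if not os.path.exists(cacheDir):
            os.makedirs(cacheDir)
        f = open(path, 'w')
        f.write(data)
        f.close()
        return data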
@@ -34,31 +33,40 @@ def getNews(year, month, day):
                 imageUrl = re.compile('<img src="(.*?)"').findall(item)[0]
             except:
                 imageUrl = ''
-            if dateString[:10] == date and description and imageUrl:
-                # print item
+            try:
+                title = formatString(re.compile('title=[\'|"](.*?)[\'|"] />', re.DOTALL).findall(item)[0])
+            except:
+                title = ''
+            if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:
                 new = {}
                 if len(dateString) == 10:
                     new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
                 else:
                     new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
-                new['dateString'] = dateString
+                # fix decodeHtml
+                # new['description'] = formatString(decodeHtml(description))
                 new['description'] = formatString(description)
                 new['imageUrl'] = imageUrl
                 new['section'] = formatSection(section)
-                new['title'] = formatString(re.compile('title=[\'|"](.*?)[\'|"] />', re.DOTALL).findall(item)[0])
-                new['title0'] = re.compile('(.*?): ').findall(new['title'])[0]
-                new['title1'] = re.compile(': (.*?)$').findall(new['title'])[0]
+                new['title'] = formatString(title)
                 new['url'] = 'http://www.spiegel.de' + re.compile('<a href="(.*?)"').findall(item)[0]
                 news.append(new)
                 print dateString + ' - ok'
-            elif not description:
+            elif dateString[:10] == date and not description:
                 print dateString + ' - no description'
-            elif not imageUrl:
+            elif dateString[:10] == date and not imageUrl:
                 print dateString + ' - no image'
     return news
 
+def splitTitle(title):
+    title0 = re.compile('(.*?): ').findall(title)[0]
+    title1 = re.compile(': (.*?)$').findall(title)[0]
+    return [title0, title1]
+
 def formatString(string):
-    return string.replace('\n', ' ').replace('  ', ' ').strip()
+    string = string.replace('\n', ' ').replace('  ', ' ').strip()
+    string = string.replace('&amp;', '&').replace('&apos;', '\'').replace('&quot;', '"')
+    return string
 
 def formatSection(string):
     return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')
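The new title handling extracts the title up front and only accepts items whose title contains ': ', which is exactly the separator splitTitle later relies on, so its two regexes always match. The entity replacements added to formatString cover the three entities the teaser markup is assumed to contain. Hypothetical values, for illustration only:

    >>> splitTitle('Fussball: Bayern bleibt vorn')
    ['Fussball', 'Bayern bleibt vorn']
    >>> formatString('Tag &amp; Nacht\n in Berlin')
    'Tag & Nacht in Berlin'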
@@ -79,26 +87,66 @@ def formatSubsection(string):
         return subsection[string]
     return string[:1].upper() + string[1:]
 
-class Spiegel:
-    def __init__(self, year, week):
-        self.year = year
-        self.week = week
-        self.coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d0%02d0001-312.jpg' % (self.year, self.week, self.year, self.week)
-        if not oxutils.net.exists(self.coverUrl):
-            self.coverUrl = ''
-            return
-        self.contentsUrl = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (self.year, self.week)
-    def getContents(self):
-        self.contents = []
-        soup = BeautifulSoup(getUrl(self.contentsUrl))
-        for item in soup('a', {'href': re.compile('http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')}):
-            item = str(item)
-            title = stripTags(item).strip()
-            page = int(re.compile('&amp;SE=(.*?)"').findall(item)[0])
-            self.contents.append({'title': title, 'page': page})
-        return self.contents
+def getMagazine(year, week):
+    coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d0%02d0001-312.jpg' % (year, week, year, week)
+    if not oxutils.net.exists(coverUrl):
+        return None
+    url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
+    contents = []
+    soup = BeautifulSoup(oxutils.cache.getUrl(url))
+    for item in soup('a', {'href': re.compile('http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')}):
+        item = str(item)
+        page = int(re.compile('&amp;SE=(.*?)"').findall(item)[0])
+        title = stripTags(item).strip()
+        contents.append({'title': title, 'page': page})
+    return {'contents': contents, 'coverUrl': coverUrl}
+
+def archiveMagazines():
+    '''
+    this is just an example of an archiving application
+    '''
+    import os
+    import simplejson
+    import time
+    archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Der Spiegel'
+    localtime = time.localtime()
+    year = int(time.strftime('%Y', localtime))
+    week = int(time.strftime('%W', localtime))
+    for y in range(year, 1993, -1):
+        if y == year:
+            wMax = week + 1
+        else:
+            wMax = 53
+        for w in range(wMax, 0, -1):
+            print '%2d/%d' % (w, y)
+            magazine = getMagazine(y, w)
+            if magazine:
+                dirname = '%s/%d/%02d' % (archivePath, y, w)
+                if not os.path.exists(dirname):
+                    os.makedirs(dirname)
+                filename = '%s/Der Spiegel %d-%02d.json' % (dirname, y, w)
+                if not os.path.exists(filename) or True:
+                    data = simplejson.dumps(magazine, ensure_ascii = False)
+                    f = open(filename, 'w')
+                    f.write(data)
+                    f.close()
+                filename = '%s/Der Spiegel %d-%02d.txt' % (dirname, y, w)
+                if not os.path.exists(filename) or True:
+                    data = []
+                    for item in magazine['contents']:
+                        data.append('%3d %s' % (item['page'], item['title']))
+                    data = '\n'.join(data)
+                    f = open(filename, 'w')
+                    f.write(data)
+                    f.close()
+                filename = '%s/Der Spiegel %d-%02d.jpg' % (dirname, y, w)
+                if not os.path.exists(filename):
+                    data = oxutils.cache.getUrl(magazine['coverUrl'])
+                    f = open(filename, 'w')
+                    f.write(data)
+                    f.close()
 
 def archiveNews():
     '''
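getMagazine now carries the data collection that used to live in the Spiegel class: it returns None when no cover exists for the given week (used as a proxy for "no such issue"), otherwise a dict with the table of contents and the cover URL. A hypothetical call, with an invented week number:

    magazine = getMagazine(2008, 18)
    if magazine:
        print magazine['coverUrl']
        for item in magazine['contents']:
            print '%3d %s' % (item['page'], item['title'])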
@@ -106,30 +154,49 @@ def archiveNews():
     '''
     import os
     import simplejson
-    archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de'
-    for y in range(2007, 2008):
-        for m in range(1, 13):
-            for d in range(1, 32):
+    import time
+    archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'
+    days = [0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
+    localtime = time.localtime()
+    year = int(time.strftime('%Y', localtime))
+    month = int(time.strftime('%m', localtime))
+    day = int(time.strftime('%d', localtime))
+    for y in range(year, 1999, -1):
+        if y == year:
+            mMax = month
+        else:
+            mMax = 12
+        for m in range(mMax, 0, -1):
+            if y == year and m == month:
+                dMax = day
+            else:
+                dMax = days[m]
+            for d in range(dMax, 0, -1):
                 news = getNews(y, m ,d)
                 for new in news:
                     dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
                     if not os.path.exists(dirname):
                         os.makedirs(dirname)
                     filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
-                    data = simplejson.dumps(new, ensure_ascii = False)
-                    f = open(filename, 'w')
-                    f.write(data)
-                    f.close()
+                    if not os.path.exists(filename) or True:
+                        data = simplejson.dumps(new, ensure_ascii = False)
+                        f = open(filename, 'w')
+                        f.write(data)
+                        f.close()
                     filename = filename[:-4] + 'txt'
-                    data = '\n'.join([new['title0'], new['title1'], new['description']])
-                    f = open(filename, 'w')
-                    f.write(data)
-                    f.close()
+                    if not os.path.exists(filename) or True:
+                        data = splitTitle(new['title'])
+                        data.append(new['description'])
+                        data = '\n'.join(data)
+                        f = open(filename, 'w')
+                        f.write(data)
+                        f.close()
                     filename = dirname + '/' + new['imageUrl'].split('/')[-1]
-                    data = getUrl(new['imageUrl'])
-                    f = open(filename, 'w')
-                    f.write(data)
-                    f.close()
+                    if not os.path.exists(filename):
+                        data = oxutils.cache.getUrl(new['imageUrl'])
+                        f = open(filename, 'w')
+                        f.write(data)
+                        f.close()
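The rewritten loop walks backwards from today instead of iterating over a hard-coded 2007 range: the current year is capped at the current month and day, earlier years get the full calendar, and the days table over-approximates February with 29 days — the assumption being that a nonexistent date simply yields no news items. The countdown in isolation, with a hypothetical handle(y, m, d) callback in place of the getNews call (both names are invented here):

    import time

    def walkDatesBackwards(handle, firstYear=2000):
        # newest first: from today down to January 1 of firstYear
        days = [0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
        localtime = time.localtime()
        year = int(time.strftime('%Y', localtime))
        month = int(time.strftime('%m', localtime))
        day = int(time.strftime('%d', localtime))
        for y in range(year, firstYear - 1, -1):
            mMax = month if y == year else 12
            for m in range(mMax, 0, -1):
                dMax = day if y == year and m == month else days[m]
                for d in range(dMax, 0, -1):
                    handle(y, m, d)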
@@ -151,4 +218,5 @@ if __name__ == '__main__':
     x.append(string)
     print x
     '''
+    archiveMagazines()
     archiveNews()