spiegel.py: cleanup

This commit is contained in:
Rolux 2008-05-01 01:27:50 +02:00
parent a360bba9b5
commit 1b04735c68
1 changed file with 119 additions and 51 deletions

View File

@ -1,16 +1,13 @@
from datetime import datetime
import re
import time
from BeautifulSoup import BeautifulSoup
from oxutils.cache import getUrl
from oxutils.html import stripTags
import oxutils.cache
from oxutils.html import decodeHtml, stripTags
import oxutils.net
def output(news):
    '''
    print each news item as a short block of text

    news is a list of dicts; every dict must provide the keys 'section',
    'dateString', 'title0', 'title1' and 'description'. For each item,
    three lines are printed (section and date string, the two title parts
    joined by ': ', the description), followed by an empty line.
    '''
    for new in news:
        heading = new['section'] + ', ' + new['dateString']
        headline = new['title0'] + ': ' + new['title1']
        # the trailing '' makes the joined text end in a newline, so that
        # print leaves one blank line between items; the parenthesised
        # single-argument form prints the same under Python 2 and 3
        print('\n'.join([heading, headline, new['description'], '']))
def getNews(year, month, day):
sections = [
'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
@ -22,8 +19,10 @@ def getNews(year, month, day):
news = []
for section in sections:
url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day)
print url
html = getUrl(url)
if date == time.strftime('%d.%m.%Y', time.localtime()):
html = oxutils.net.getUrl(url)
else:
html = oxutils.cache.getUrl(url)
for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
dateString = stripTags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
try:
@ -34,31 +33,40 @@ def getNews(year, month, day):
imageUrl = re.compile('<img src="(.*?)"').findall(item)[0]
except:
imageUrl = ''
if dateString[:10] == date and description and imageUrl:
# print item
try:
title = formatString(re.compile('title=[\'|"](.*?)[\'|"] />', re.DOTALL).findall(item)[0])
except:
title = ''
if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:
new = {}
if len(dateString) == 10:
new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
else:
new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
new['dateString'] = dateString
# fix decodeHtml
# new['description'] = formatString(decodeHtml(description))
new['description'] = formatString(description)
new['imageUrl'] = imageUrl
new['section'] = formatSection(section)
new['title'] = formatString(re.compile('title=[\'|"](.*?)[\'|"] />', re.DOTALL).findall(item)[0])
new['title0'] = re.compile('(.*?): ').findall(new['title'])[0]
new['title1'] = re.compile(': (.*?)$').findall(new['title'])[0]
new['title'] = formatString(title)
new['url'] = 'http://www.spiegel.de' + re.compile('<a href="(.*?)"').findall(item)[0]
news.append(new)
print dateString + ' - ok'
elif not description:
elif dateString[:10] == date and not description:
print dateString + ' - no description'
elif not imageUrl:
elif dateString[:10] == date and not imageUrl:
print dateString + ' - no image'
return news
def splitTitle(title):
    '''
    split a title at its first ': ' separator

    returns a two-item list [title0, title1]: the text before the first
    ': ' and the text after it. A title that cannot be split (no ': ')
    is returned as [title, ''] instead of raising IndexError.
    '''
    title0 = re.compile('(.*?): ').findall(title)
    title1 = re.compile(': (.*?)$').findall(title)
    if not title0 or not title1:
        # no separator found: keep the whole title as the first part
        return [title, '']
    return [title0[0], title1[0]]
def formatString(string):
    '''
    normalize whitespace and decode a few HTML entities

    newlines become spaces, double spaces are collapsed to single spaces
    and leading/trailing whitespace is stripped; then '&apos;', '&quot;'
    and '&amp;' are replaced by the characters they stand for.
    '''
    # NOTE(review): the second replace is assumed to collapse double
    # spaces; replacing a single space by itself would be a no-op
    string = string.replace('\n', ' ').replace('  ', ' ').strip()
    # '&amp;' is decoded last, so that an escaped entity such as
    # '&amp;quot;' is unescaped only once (to '&quot;')
    string = string.replace('&apos;', '\'').replace('&quot;', '"').replace('&amp;', '&')
    return string
def formatSection(string):
    '''
    turn a section identifier into its display name

    the first character is upper-cased; in the rest of the string every
    occurrence of 'spiegel' is replaced by 'SPIEGEL'
    '''
    initial = string[:1].upper()
    remainder = string[1:].replace('spiegel', 'SPIEGEL')
    return initial + remainder
@ -79,26 +87,66 @@ def formatSubsection(string):
return subsection[string]
return string[:1].upper() + string[1:]
def getMagazine(year, week):
    '''
    look up one issue of the print magazine by year and week number

    returns None if oxutils.net.exists() reports no cover image for the
    issue; otherwise returns a dict with 'contents' (a list of
    {'title', 'page'} dicts, one per link to the epaper servlet found on
    the issue's contents page) and 'coverUrl'
    '''
    coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d0%02d0001-312.jpg' % (year, week, year, week)
    if oxutils.net.exists(coverUrl):
        contentsUrl = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
        linkPattern = re.compile('http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')
        pagePattern = re.compile('&amp;SE=(.*?)"')
        entries = []
        for link in BeautifulSoup(oxutils.cache.getUrl(contentsUrl))('a', {'href': linkPattern}):
            markup = str(link)
            # the page number is the text between '&amp;SE=' and the next
            # double quote in the link's markup
            pageNumber = int(pagePattern.findall(markup)[0])
            entries.append({'title': stripTags(markup).strip(), 'page': pageNumber})
        return {'contents': entries, 'coverUrl': coverUrl}
    return None
class Spiegel:
    '''
    one issue of the print magazine, identified by year and week number

    after construction, coverUrl holds the URL of the cover image, or ''
    if oxutils.net.exists() reports it missing; contentsUrl holds the URL
    of the issue's contents page, or '' if the cover is missing
    '''

    def __init__(self, year, week):
        self.year = year
        self.week = week
        # always defined, so that getContents() is safe to call even
        # when the cover (and therefore the issue) does not exist
        self.contents = []
        self.contentsUrl = ''
        self.coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d0%02d0001-312.jpg' % (self.year, self.week, self.year, self.week)
        if not oxutils.net.exists(self.coverUrl):
            self.coverUrl = ''
            return
        self.contentsUrl = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (self.year, self.week)

    def getContents(self):
        '''
        fetch the contents page and return a list of {'title', 'page'}
        dicts, one per link to the epaper servlet; the list is also
        stored in self.contents. Returns an empty list if there is no
        contents URL.
        '''
        self.contents = []
        if not self.contentsUrl:
            return self.contents
        linkPattern = re.compile(r'http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')
        pagePattern = re.compile('&amp;SE=(.*?)"')
        soup = BeautifulSoup(getUrl(self.contentsUrl))
        for item in soup('a', {'href': linkPattern}):
            item = str(item)
            title = stripTags(item).strip()
            # the page number is the text between '&amp;SE=' and the next
            # double quote in the link's markup
            page = int(pagePattern.findall(item)[0])
            self.contents.append({'title': title, 'page': page})
        return self.contents
def _writeArchiveFile(filename, data, mode='w'):
    '''
    write data to filename, closing the file even if the write fails
    '''
    f = open(filename, mode)
    try:
        f.write(data)
    finally:
        f.close()

def archiveMagazines(archivePath='/Volumes/Rolux Home/Desktop/Data/spiegel.de/Der Spiegel', overwrite=True):
    '''
    this is just an example of an archiving application

    walks backwards from the current year and week down to week 1 of 1994
    and, for every issue that getMagazine() finds, writes three files to
    <archivePath>/<year>/<week>/: the issue data as .json, the table of
    contents as .txt and the cover image as .jpg

    archivePath -- root directory of the archive
    overwrite -- if true (the default), existing .json and .txt files are
        rewritten; an existing .jpg is never downloaded again
    '''
    import os
    import simplejson
    import time
    localtime = time.localtime()
    year = int(time.strftime('%Y', localtime))
    week = int(time.strftime('%W', localtime))
    for y in range(year, 1993, -1):
        if y == year:
            # %W counts from 0, so start one past the current value
            wMax = week + 1
        else:
            wMax = 53
        for w in range(wMax, 0, -1):
            print('%2d/%d' % (w, y))
            magazine = getMagazine(y, w)
            if not magazine:
                continue
            dirname = '%s/%d/%02d' % (archivePath, y, w)
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            filename = '%s/Der Spiegel %d-%02d.json' % (dirname, y, w)
            if overwrite or not os.path.exists(filename):
                data = simplejson.dumps(magazine, ensure_ascii = False)
                _writeArchiveFile(filename, data)
            filename = '%s/Der Spiegel %d-%02d.txt' % (dirname, y, w)
            if overwrite or not os.path.exists(filename):
                lines = ['%3d %s' % (item['page'], item['title']) for item in magazine['contents']]
                _writeArchiveFile(filename, '\n'.join(lines))
            filename = '%s/Der Spiegel %d-%02d.jpg' % (dirname, y, w)
            if not os.path.exists(filename):
                data = oxutils.cache.getUrl(magazine['coverUrl'])
                # image data is binary: text mode may translate newlines
                _writeArchiveFile(filename, data, 'wb')
def archiveNews():
'''
@ -106,30 +154,49 @@ def archiveNews():
'''
import os
import simplejson
archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de'
for y in range(2007, 2008):
for m in range(1, 13):
for d in range(1, 32):
import time
archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'
days = [0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
localtime = time.localtime()
year = int(time.strftime('%Y', localtime))
month = int(time.strftime('%m', localtime))
day = int(time.strftime('%d', localtime))
for y in range(year, 1999, -1):
if y == year:
mMax = month
else:
mMax = m
for m in range(mMax, 0, -1):
if y == year and m == month:
dMax = day
else:
dMax = days[m]
for d in range(dMax, 0, -1):
news = getNews(y, m ,d)
for new in news:
dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
if not os.path.exists(dirname):
os.makedirs(dirname)
filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
data = simplejson.dumps(new, ensure_ascii = False)
f = open(filename, 'w')
f.write(data)
f.close()
if not os.path.exists(filename) or True:
data = simplejson.dumps(new, ensure_ascii = False)
f = open(filename, 'w')
f.write(data)
f.close()
filename = filename[:-4] + 'txt'
data = '\n'.join([new['title0'], new['title1'], new['description']])
f = open(filename, 'w')
f.write(data)
f.close()
if not os.path.exists(filename) or True:
data = splitTitle(new['title'])
data.append(new['description'])
data = '\n'.join(data)
f = open(filename, 'w')
f.write(data)
f.close()
filename = dirname + '/' + new['imageUrl'].split('/')[-1]
data = getUrl(new['imageUrl'])
f = open(filename, 'w')
f.write(data)
f.close()
if not os.path.exists(filename):
data = oxutils.cache.getUrl(new['imageUrl'])
f = open(filename, 'w')
f.write(data)
f.close()
if __name__ == '__main__':
@ -151,4 +218,5 @@ if __name__ == '__main__':
x.append(string)
print x
'''
archiveMagazines()
archiveNews()