spiegel.py: get actual pages

Rolux 2008-05-01 11:14:10 +02:00
parent 1b04735c68
commit dd048aa4f2


@@ -88,7 +88,7 @@ def formatSubsection(string):
     return string[:1].upper() + string[1:]
 
 def getMagazine(year, week):
-    coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d0%02d0001-312.jpg' % (year, week, year, week)
+    coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
     if not oxutils.net.exists(coverUrl):
         return None
     url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
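Note on the cover URL fix: '%d0%02d' (a literal zero followed by a two-digit week) and '%d%03d' (a three-digit week) format to the same string for any week from 1 to 99, so the change appears to be for consistency with the '%03d' used by the new page URLs below. A quick illustrative check (values are arbitrary):

    year, week = 2008, 18
    old = 'ROSPANZ%d0%02d0001-312.jpg' % (year, week)  # '2008' + '0' + '18'
    new = 'ROSPANZ%d%03d0001-312.jpg' % (year, week)   # '2008' + '018'
    assert old == new  # identical for weeks 1-99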
@@ -99,7 +99,15 @@ def getMagazine(year, week):
         page = int(re.compile('&SE=(.*?)"').findall(item)[0])
         title = stripTags(item).strip()
         contents.append({'title': title, 'page': page})
-    return {'contents': contents, 'coverUrl': coverUrl}
+    pageUrl = {}
+    pages = page + 2
+    for page in range(1, pages + 10):
+        url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
+        if oxutils.net.exists(url):
+            pageUrl[page] = url
+        else:
+            pageUrl[page] = ''
+    return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}
 
 def archiveMagazines():
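The new block in getMagazine estimates the total page count from the last table-of-contents entry (plus two, presumably for the covers), then probes up to ten additional page images, storing an empty string for pages that do not exist. A self-contained sketch of the same probing idea, with plain urllib2 standing in for oxutils.net.exists (the function name and HEAD-request helper are illustrative, not part of the repo):

    import urllib2

    def probe_pages(year, week, last_toc_page, extra=10):
        # Guess the page count from the last TOC entry, then probe a few
        # pages past that guess in case it is low.
        pages = last_toc_page + 2
        pageUrl = {}
        for page in range(1, pages + extra):
            url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (
                year, week, year, week, page)
            request = urllib2.Request(url)
            request.get_method = lambda: 'HEAD'  # existence check only, skip the body
            try:
                urllib2.urlopen(request).close()
                pageUrl[page] = url
            except urllib2.HTTPError:
                pageUrl[page] = ''  # no image at this page number
        return pages, pageUrl

As in the diff, missing pages are kept as empty strings rather than dropped, so the archiving loop below can skip them with a simple truth test.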
@@ -125,13 +133,13 @@ def archiveMagazines():
                 dirname = '%s/%d/%02d' % (archivePath, y, w)
                 if not os.path.exists(dirname):
                     os.makedirs(dirname)
-                filename = '%s/Der Spiegel %d-%02d.json' % (dirname, y, w)
+                filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w)
                 if not os.path.exists(filename) or True:
                     data = simplejson.dumps(magazine, ensure_ascii = False)
                     f = open(filename, 'w')
                     f.write(data)
                     f.close()
-                filename = '%s/Der Spiegel %d-%02d.txt' % (dirname, y, w)
+                filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w)
                 if not os.path.exists(filename) or True:
                     data = []
                     for item in magazine['contents']:
@@ -140,12 +148,21 @@ def archiveMagazines():
                     f = open(filename, 'w')
                     f.write(data)
                     f.close()
-                filename = '%s/Der Spiegel %d-%02d.jpg' % (dirname, y, w)
+                filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
                 if not os.path.exists(filename):
                     data = oxutils.cache.getUrl(magazine['coverUrl'])
                     f = open(filename, 'w')
                     f.write(data)
                     f.close()
+                for page in magazine['pageUrl']:
+                    url = magazine['pageUrl'][page]
+                    if url:
+                        filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
+                        if not os.path.exists(filename):
+                            data = oxutils.cache.getUrl(url)
+                            f = open(filename, 'w')
+                            f.write(data)
+                            f.close()
 
 def archiveNews():
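The matching change in archiveMagazines mirrors every probed page image to disk next to the cover. A minimal standalone sketch of that download step, with urllib2 in place of the repo's cached oxutils.cache.getUrl (the helper name is illustrative):

    import os
    import urllib2

    def save_pages(dirname, y, w, pageUrl):
        # Store each found page as 'Der Spiegel <year> <week> <page>.jpg',
        # skipping missing pages (empty URLs) and files that already exist.
        for page in sorted(pageUrl):
            url = pageUrl[page]
            if not url:
                continue
            filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
            if not os.path.exists(filename):
                data = urllib2.urlopen(url).read()
                f = open(filename, 'wb')
                f.write(data)
                f.close()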