merge spiegel.py: get actual pages

j 2008-05-04 17:07:40 +02:00
commit a065f0650e

@@ -1,42 +1,239 @@
from datetime import datetime
import re
import time

from BeautifulSoup import BeautifulSoup

import oxutils.cache
from oxutils.html import decodeHtml, stripTags
import oxutils.net

def getNews(year, month, day):
    sections = [
        'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
        'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
    ]
    dt = datetime(year, month, day)
    day = int(dt.strftime('%j'))
    date = dt.strftime('%d.%m.%Y')
    news = []
    for section in sections:
        url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day)
        if date == time.strftime('%d.%m.%Y', time.localtime()):
            html = oxutils.net.getUrl(url)
        else:
            html = oxutils.cache.getUrl(url)
        for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
            dateString = stripTags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
            try:
                description = re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0].strip()
            except:
                description = ''
            try:
                imageUrl = re.compile('<img src="(.*?)"').findall(item)[0]
            except:
                imageUrl = ''
            try:
                title = formatString(re.compile('title=[\'|"](.*?)[\'|"] />', re.DOTALL).findall(item)[0])
            except:
                title = ''
            if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:
                new = {}
                if len(dateString) == 10:
                    new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
                else:
                    new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
                # fix decodeHtml
                # new['description'] = formatString(decodeHtml(description))
                new['description'] = formatString(description)
                new['imageUrl'] = imageUrl
                new['section'] = formatSection(section)
                new['title'] = formatString(title)
                new['url'] = 'http://www.spiegel.de' + re.compile('<a href="(.*?)"').findall(item)[0]
                news.append(new)
                print dateString + ' - ok'
            elif dateString[:10] == date and not description:
                print dateString + ' - no description'
            elif dateString[:10] == date and not imageUrl:
                print dateString + ' - no image'
    return news
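# Illustrative sketch, not actual output: each dict appended to news above
# has the shape
# {'date': '2008-05-04 12:30', 'description': '...', 'imageUrl': 'http://...',
#  'section': 'Politik', 'title': 'Ressort: Headline',
#  'url': 'http://www.spiegel.de/...'}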

def splitTitle(title):
    title0 = re.compile('(.*?): ').findall(title)[0]
    title1 = re.compile(': (.*?)$').findall(title)[0]
    return [title0, title1]
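# For example, given the 'Ressort: Headline' titles that getNews() keeps:
# splitTitle('Golf: Woods gewinnt') -> ['Golf', 'Woods gewinnt']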

def formatString(string):
    string = string.replace('\n', ' ').replace('  ', ' ').strip()
    string = string.replace('&amp;', '&').replace('&apos;', '\'').replace('&quot;', '"')
    return string

def formatSection(string):
    return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')
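# For example: formatSection('netzwelt') -> 'Netzwelt',
# formatSection('unispiegel') -> 'UniSPIEGEL'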

def formatSubsection(string):
    subsection = {
        'abi': 'Abi - und dann?',
        'formel1': 'Formel 1',
        'jobundberuf': 'Job & Beruf',
        'leben': 'Leben U21',
        'mensch': 'Mensch & Technik',
        'sonst': '',
        'staedte': u'St\xe4dte',
        'ussports': 'US-Sports',
        'wunderbar': 'wunderBAR'
    }
    if subsection.has_key(string):
        return subsection[string]
    return string[:1].upper() + string[1:]
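# For example: formatSubsection('ussports') -> 'US-Sports', while an unmapped
# key just gets capitalized: formatSubsection('reise') -> 'Reise'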

def getMagazine(year, week):
    coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
    if not oxutils.net.exists(coverUrl):
        return None
    url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
    contents = []
    soup = BeautifulSoup(oxutils.cache.getUrl(url))
    for item in soup('a', {'href': re.compile('http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')}):
        item = str(item)
        page = int(re.compile('&amp;SE=(.*?)"').findall(item)[0])
        title = stripTags(item).strip()
        contents.append({'title': title, 'page': page})
    pageUrl = {}
    pages = page + 2
    for page in range(1, pages + 10):
        url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
        if oxutils.net.exists(url):
            pageUrl[page] = url
        else:
            pageUrl[page] = ''
    return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}
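# Illustrative sketch, not actual output: getMagazine(2008, 18) returns None
# if no cover image exists, otherwise a dict roughly like
# {'pages': 148,
#  'contents': [{'title': '...', 'page': 3}, ...],
#  'coverUrl': 'http://www.spiegel.de/static/epaper/SP/2008/18/ROSPANZ20080180001-312.jpg',
#  'pageUrl': {1: 'http://...-205.jpg', 2: '', ...}}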

def archiveMagazines():
    '''
    this is just an example of an archiving application
    '''
    import os
    import simplejson
    import time
    archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Der Spiegel'
    localtime = time.localtime()
    year = int(time.strftime('%Y', localtime))
    week = int(time.strftime('%W', localtime))
    for y in range(year, 1993, -1):
        if y == year:
            wMax = week + 1
        else:
            wMax = 53
        for w in range(wMax, 0, -1):
            print '%2d/%d' % (w, y)
            magazine = getMagazine(y, w)
            if magazine:
                dirname = '%s/%d/%02d' % (archivePath, y, w)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w)
                if not os.path.exists(filename) or True:
                    data = simplejson.dumps(magazine, ensure_ascii = False)
                    f = open(filename, 'w')
                    f.write(data)
                    f.close()
                filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w)
                if not os.path.exists(filename) or True:
                    data = []
                    for item in magazine['contents']:
                        data.append('%3d %s' % (item['page'], item['title']))
                    data = '\n'.join(data)
                    f = open(filename, 'w')
                    f.write(data)
                    f.close()
                filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
                if not os.path.exists(filename):
                    data = oxutils.cache.getUrl(magazine['coverUrl'])
                    f = open(filename, 'w')
                    f.write(data)
                    f.close()
                for page in magazine['pageUrl']:
                    url = magazine['pageUrl'][page]
                    if url:
                        filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
                        if not os.path.exists(filename):
                            data = oxutils.cache.getUrl(url)
                            f = open(filename, 'w')
                            f.write(data)
                            f.close()

def archiveNews():
    '''
    this is just an example of an archiving application
    '''
    import os
    import simplejson
    import time
    archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'
    days = [0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    localtime = time.localtime()
    year = int(time.strftime('%Y', localtime))
    month = int(time.strftime('%m', localtime))
    day = int(time.strftime('%d', localtime))
    for y in range(year, 1999, -1):
        if y == year:
            mMax = month
        else:
            # past years cover all twelve months
            mMax = 12
        for m in range(mMax, 0, -1):
            if y == year and m == month:
                dMax = day
            else:
                dMax = days[m]
            for d in range(dMax, 0, -1):
                news = getNews(y, m, d)
                for new in news:
                    dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)
                    filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
                    if not os.path.exists(filename) or True:
                        data = simplejson.dumps(new, ensure_ascii = False)
                        f = open(filename, 'w')
                        f.write(data)
                        f.close()
                    filename = filename[:-4] + 'txt'
                    if not os.path.exists(filename) or True:
                        data = splitTitle(new['title'])
                        data.append(new['description'])
                        data = '\n'.join(data)
                        f = open(filename, 'w')
                        f.write(data)
                        f.close()
                    filename = dirname + '/' + new['imageUrl'].split('/')[-1]
                    if not os.path.exists(filename):
                        data = oxutils.cache.getUrl(new['imageUrl'])
                        f = open(filename, 'w')
                        f.write(data)
                        f.close()

if __name__ == '__main__':
    # spiegel = Spiegel(2008, 8)
    # print spiegel.getContents()
    # news = News(2001, 9, 10)
    # output(news.getNews())
    '''
    x = []
    for d in range(10, 30):
        print '2/%d' % d
        news = getNews(2008, 2, d)
        for new in news:
            strings = new['url'].split('/')
            string = formatSection(strings[3])
            if len(strings) == 6:
                string += '/' + formatSubsection(strings[4])
            if not string in x:
                x.append(string)
    print x
    '''
    archiveMagazines()
    archiveNews()