merge spiegel.py: get actual pages

This commit is contained in:
j 2008-05-04 17:07:40 +02:00
commit a065f0650e

View file

@ -1,42 +1,239 @@
from datetime import datetime
import re
from time import gmtime, strftime
import time
from BeautifulSoup import BeautifulSoup
from oxutils.cache import getUrl
from oxutils.html import stripTags
import oxutils.cache
from oxutils.html import decodeHtml, stripTags
import oxutils.net
class Spiegel:
    '''
    One weekly issue of Der Spiegel, addressed by year and week number.

    After a successful construction the instance carries ``year``, ``week``,
    ``coverUrl`` and ``contentsUrl``. If the year/week pair does not pass the
    plausibility check below, the constructor returns early and none of
    these attributes are set.
    '''

    def __init__(self, year, week):
        # fixme: simply check if cover exists
        currentYear = int(strftime('%Y', gmtime()))
        currentWeek = int(strftime('%W', gmtime()))
        if year == currentYear:
            # accept up to one week past the current week number
            weekLimit = currentWeek + 2
        elif year in (1998, 2004):
            # these two years are allowed a week 53
            weekLimit = 54
        else:
            weekLimit = 53
        yearIsValid = year in range(1994, currentYear + 1)
        weekIsValid = week in range(1, weekLimit)
        if not (yearIsValid and weekIsValid):
            return
        # end fixme
        self.year = year
        self.week = week
        issue = (self.year, self.week)
        self.coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d0%02d0001-312.jpg' % (issue + issue)
        self.contentsUrl = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % issue
def getNews(year, month, day):
    '''
    Scrape the Spiegel Online section archives for one calendar day.

    For every section the archive page of the given day is fetched and each
    teaser on it is parsed. A teaser is kept only if its date matches the
    requested day and it has a description, an image and a title that
    contains ': '.

    Returns a list of dicts with the keys 'date' ('YYYY-MM-DD HH:MM'),
    'description', 'imageUrl', 'section', 'title' and 'url'.

    Raises ValueError (from datetime) if year/month/day is not a valid date.
    '''
    sections = [
        'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
        'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
    ]
    # compile every pattern once instead of once per teaser
    teaserPattern = re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL)
    datePattern = re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL)
    descriptionPattern = re.compile('<p>(.*?)<', re.DOTALL)
    imagePattern = re.compile('<img src="(.*?)"')
    titlePattern = re.compile('title=[\'|"](.*?)[\'|"] />', re.DOTALL)
    linkPattern = re.compile('<a href="(.*?)"')

    def first(pattern, text):
        # first captured group of pattern in text, or '' if it does not match
        found = pattern.findall(text)
        if found:
            return found[0]
        return ''

    dt = datetime(year, month, day)
    dayOfYear = int(dt.strftime('%j'))
    date = dt.strftime('%d.%m.%Y')
    isToday = date == time.strftime('%d.%m.%Y', time.localtime())
    news = []
    for section in sections:
        url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, dayOfYear)
        # today's page goes through oxutils.net rather than oxutils.cache
        # (presumably because it can still change -- confirm)
        if isToday:
            html = oxutils.net.getUrl(url)
        else:
            html = oxutils.cache.getUrl(url)
        for item in teaserPattern.findall(html):
            rawDate = first(datePattern, item)
            if rawDate:
                dateString = stripTags(rawDate).strip()
            else:
                # a teaser without a date can never match the requested day
                dateString = ''
            if dateString[:10] != date:
                continue
            description = first(descriptionPattern, item).strip()
            imageUrl = first(imagePattern, item)
            title = formatString(first(titlePattern, item))
            if not description:
                print(dateString + ' - no description')
                continue
            if not imageUrl:
                print(dateString + ' - no image')
                continue
            if title.find(': ') == -1:
                continue
            link = first(linkPattern, item)
            if not link:
                print(dateString + ' - no link')
                continue
            new = {}
            if len(dateString) == 10:
                # date only, no time of day given
                new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
            else:
                new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
            # fix decodeHtml
            # new['description'] = formatString(decodeHtml(description))
            new['description'] = formatString(description)
            new['imageUrl'] = imageUrl
            new['section'] = formatSection(section)
            # title has already been through formatString; a second pass
            # would decode entities twice
            new['title'] = title
            new['url'] = 'http://www.spiegel.de' + link
            news.append(new)
            print(dateString + ' - ok')
    return news
def getContents(self):
    '''
    Fetch the table of contents of this issue.

    Downloads self.contentsUrl, collects every link that points at the
    epaper servlet and stores one {'title': ..., 'page': ...} dict per link
    in self.contents. 'page' is the raw string taken from the link's SE
    parameter.

    Returns self.contents.
    '''
    self.contents = []
    soup = BeautifulSoup(getUrl(self.contentsUrl))
    linkPattern = re.compile(r'http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')
    pagePattern = re.compile('&amp;SE=(.*?)"')
    for item in soup('a', {'href': linkPattern}):
        item = str(item)
        title = stripTags(item).strip()
        page = pagePattern.findall(item)[0]
        self.contents.append({'title': title, 'page': page})
    return self.contents
def splitTitle(title):
    '''
    Split a title at its first ': ' and return [before, after].

    Raises IndexError if the title contains no ': '.
    '''
    patterns = ('(.*?): ', ': (.*?)$')
    return [re.compile(pattern).findall(title)[0] for pattern in patterns]
def formatString(string):
    '''
    Normalise a scraped text fragment.

    Newlines become spaces, runs of spaces are collapsed to a single space,
    surrounding whitespace is stripped and the entities &apos;, &quot; and
    &amp; are decoded.
    '''
    string = re.sub(' {2,}', ' ', string.replace('\n', ' ')).strip()
    # decode &amp; last so that e.g. '&amp;quot;' yields the text '&quot;'
    # instead of being decoded a second time
    for entity, char in (('&apos;', '\''), ('&quot;', '"'), ('&amp;', '&')):
        string = string.replace(entity, char)
    return string
def formatSection(string):
    '''
    Turn a section key into its display name: the first character is
    upper-cased and 'spiegel' in the remainder becomes 'SPIEGEL'
    (e.g. 'unispiegel' -> 'UniSPIEGEL').
    '''
    head, tail = string[:1], string[1:]
    return head.upper() + tail.replace('spiegel', 'SPIEGEL')
def formatSubsection(string):
    '''
    Turn a subsection key into its display name.

    Keys with a special spelling are looked up in a table; any other key is
    returned with its first character upper-cased. The key 'sonst' maps to
    the empty string.
    '''
    subsection = {
        'abi': 'Abi - und dann?',
        'formel1': 'Formel 1',
        'jobundberuf': 'Job & Beruf',
        'leben': 'Leben U21',
        'mensch': 'Mensch & Technik',
        'sonst': '',
        # U+00E4 is a-umlaut ("Staedte" spelled with the umlaut)
        'staedte': u'St\xe4dte',
        'ussports': 'US-Sports',
        'wunderbar': 'wunderBAR'
    }
    if string in subsection:
        return subsection[string]
    return string[:1].upper() + string[1:]
def getMagazine(year, week):
    '''
    Collect cover, table of contents and page image urls of one issue.

    Returns None if the cover image of the issue does not exist. Otherwise
    returns a dict with the keys
      'pages':    page number of the last contents entry plus 2
                  (2 if the contents page lists no entries),
      'contents': list of {'title': ..., 'page': int} dicts,
      'coverUrl': url of the cover image,
      'pageUrl':  dict mapping each page number from 1 to pages + 9 to the
                  url of its image, or to '' if that image does not exist.
    '''
    coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
    if not oxutils.net.exists(coverUrl):
        return None
    url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
    linkPattern = re.compile(r'http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')
    pagePattern = re.compile('&amp;SE=(.*?)"')
    contents = []
    # stays 0 when the contents page has no matching links, so the page
    # count below is still defined
    lastPage = 0
    soup = BeautifulSoup(oxutils.cache.getUrl(url))
    for item in soup('a', {'href': linkPattern}):
        item = str(item)
        lastPage = int(pagePattern.findall(item)[0])
        title = stripTags(item).strip()
        contents.append({'title': title, 'page': lastPage})
    pages = lastPage + 2
    pageUrl = {}
    # probe some pages beyond the computed count as well
    for page in range(1, pages + 10):
        url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
        if oxutils.net.exists(url):
            pageUrl[page] = url
        else:
            pageUrl[page] = ''
    return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}
def archiveMagazines():
    '''
    this is just an example of an archiving application

    Walks backwards from next week of the current year to week 1 of 1994 and
    stores, for every issue getMagazine() finds, a .json dump, a plain-text
    table of contents, the cover image and all available page images below
    archivePath/<year>/<week>/.

    The .json and .txt files are rewritten on every run; images are only
    downloaded if they are not on disk yet.
    '''
    import os
    import simplejson
    import time

    def writeFile(filename, data, mode):
        # close the handle even if the write fails
        f = open(filename, mode)
        try:
            f.write(data)
        finally:
            f.close()

    archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Der Spiegel'
    localtime = time.localtime()
    year = int(time.strftime('%Y', localtime))
    week = int(time.strftime('%W', localtime))
    for y in range(year, 1993, -1):
        if y == year:
            wMax = week + 1
        else:
            wMax = 53
        for w in range(wMax, 0, -1):
            print('%2d/%d' % (w, y))
            magazine = getMagazine(y, w)
            if not magazine:
                continue
            dirname = '%s/%d/%02d' % (archivePath, y, w)
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            # metadata is always regenerated, even if the file exists
            filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w)
            writeFile(filename, simplejson.dumps(magazine, ensure_ascii = False), 'w')
            filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w)
            lines = ['%3d %s' % (item['page'], item['title']) for item in magazine['contents']]
            writeFile(filename, '\n'.join(lines), 'w')
            # images are binary data, so they are written in 'wb' mode
            filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
            if not os.path.exists(filename):
                writeFile(filename, oxutils.cache.getUrl(magazine['coverUrl']), 'wb')
            for page in magazine['pageUrl']:
                url = magazine['pageUrl'][page]
                if not url:
                    continue
                filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
                if not os.path.exists(filename):
                    writeFile(filename, oxutils.cache.getUrl(url), 'wb')
def archiveNews():
    '''
    this is just an example of an archiving application

    Walks backwards day by day from today to 1 January 2000 and stores, for
    every news item getNews() returns, a .json dump, a text file (title
    parts and description, one per line) and the teaser image below
    archivePath/<YYYY>/<MMDD>/<HHMM>/.

    The .json and .txt files are rewritten on every run; images are only
    downloaded if they are not on disk yet.
    '''
    import calendar
    import os
    import simplejson
    import time

    def writeFile(filename, data, mode):
        # close the handle even if the write fails
        f = open(filename, mode)
        try:
            f.write(data)
        finally:
            f.close()

    archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'
    localtime = time.localtime()
    year = int(time.strftime('%Y', localtime))
    month = int(time.strftime('%m', localtime))
    day = int(time.strftime('%d', localtime))
    for y in range(year, 1999, -1):
        # the current year stops at the current month, earlier years are
        # walked in full
        if y == year:
            mMax = month
        else:
            mMax = 12
        for m in range(mMax, 0, -1):
            if y == year and m == month:
                dMax = day
            else:
                # real month length, so 29 February is only requested in
                # leap years
                dMax = calendar.monthrange(y, m)[1]
            for d in range(dMax, 0, -1):
                for new in getNews(y, m, d):
                    date = new['date']
                    dirname = archivePath + '/' + date[0:4] + '/' + date[5:7] + date[8:10] + '/' + date[11:13] + date[14:16]
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)
                    # file name of the article url without its last five
                    # characters (the '.html' suffix, presumably)
                    basename = dirname + '/' + new['url'].split('/')[-1][:-5]
                    # metadata is always regenerated, even if the file exists
                    writeFile(basename + '.json', simplejson.dumps(new, ensure_ascii = False), 'w')
                    lines = splitTitle(new['title'])
                    lines.append(new['description'])
                    writeFile(basename + '.txt', '\n'.join(lines), 'w')
                    # images are binary data, so they are written in 'wb' mode
                    filename = dirname + '/' + new['imageUrl'].split('/')[-1]
                    if not os.path.exists(filename):
                        writeFile(filename, oxutils.cache.getUrl(new['imageUrl']), 'wb')
if __name__ == '__main__':
    # show the table of contents of issue 8/2008
    issue = Spiegel(2008, 8)
    issue.getContents()
    print(issue.contents)
    # spiegel = Spiegel(2008, 8)
    # print spiegel.getContents()
    # news = News(2001, 9, 10)
    # output(news.getNews())
    # disabled snippet: lists the section/subsection names that occur in
    # the news urls of a range of days
    '''
    x = []
    for d in range(10, 30):
        print '2/%d' % d
        news = getNews(2008, 2, d)
        for new in news:
            strings = new['url'].split('/')
            string = formatSection(strings[3])
            if len(strings) == 6:
                string += '/' + formatSubsection(strings[4])
            if not string in x:
                x.append(string)
    print x
    '''
    # run both example archivers
    archiveMagazines()
    archiveNews()