# python-ox/ox/web/spiegel.py
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function
from datetime import datetime
import re
import time
import ox.cache
from ox.html import decode_html, strip_tags
import ox.net
def get_news(year, month, day):
    """
    Scrape Spiegel Online news teasers for a given date.

    Fetches the archive page of each known section for that day and parses
    the teaser markup into dicts with 'date', 'description', 'imageUrl',
    'section', 'title', 'title1', 'title2' and 'url' keys. Teasers without
    a description, an image or a 'Kicker: Headline' style title are skipped.

    Today's pages are fetched live (ox.net); past days go through the cache
    (ox.cache).

    Returns a list of dicts, one per accepted teaser.
    """
    sections = [
        'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
        'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
    ]
    dt = datetime(year, month, day)
    day = int(dt.strftime('%j'))  # day of year, used in the archive URL
    date = dt.strftime('%d.%m.%Y')
    # compile the per-item patterns once instead of inside the double loop
    re_teaser = re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL)
    re_datetime = re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL)
    re_description = re.compile('<p>(.*?)<', re.DOTALL)
    re_image = re.compile('<img src="(.*?)"')
    re_title = re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL)
    re_headline = re.compile('<h4>(.*?)</h4>', re.DOTALL)
    re_link = re.compile('<a href="(.*?)"')
    news = []
    for section in sections:
        url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day)
        if date == time.strftime('%d.%m.%Y', time.localtime()):
            # today's page still changes, so bypass the cache
            html = ox.net.read_url(url)
        else:
            html = ox.cache.read_url(url)
        for item in re_teaser.findall(html):
            dateString = strip_tags(re_datetime.findall(item)[0]).strip()
            # findall(...)[0] raises IndexError when the pattern does not
            # match; treat any missing part as empty (was a bare except)
            try:
                description = format_string(re_description.findall(item)[0])
            except IndexError:
                description = ''
            try:
                imageUrl = re_image.findall(item)[0]
            except IndexError:
                imageUrl = ''
            try:
                title = format_string(re_title.findall(item)[0]).replace(' : ', ': ').replace('::', ':')
            except IndexError:
                title = ''
            if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:
                new = {}
                # dateString is either 'DD.MM.YYYY' or 'DD.MM.YYYY, HH:MM'
                if len(dateString) == 10:
                    new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
                else:
                    new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
                # fix decode_html
                # new['description'] = format_string(decode_html(description))
                new['description'] = format_string(description)
                new['imageUrl'] = imageUrl
                new['section'] = format_section(section)
                new['title'] = format_string(title)
                # title1 is the 'Kicker' prefix, cut to the length of the
                # teaser headline; the '\xdf' doubling compensates for the
                # headline counting 'ß' as two characters
                new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(format_string(re_headline.findall(item)[0]))].replace('\xdf\xdf', '\xdf')
                if new['title1'][-1:] == ':':
                    new['title1'] = new['title1'][0:-1]
                new['title2'] = new['title'][len(new['title1']) + 2:]
                new['url'] = re_link.findall(item)[0]
                if new['url'][:1] == '/':
                    new['url'] = 'http://www.spiegel.de' + new['url']
                news.append(new)
    return news
def split_title(title):
    """Split a 'Kicker: Headline' title at the first ': ' into [kicker, headline].

    Raises IndexError when the title contains no ': ' separator.
    """
    kicker, headline = re.compile('(.*?): (.*?)$').findall(title)[0]
    return [kicker, headline]
def format_string(string):
    """Collapse whitespace and undo a few HTML escapes in a teaser string."""
    result = string.replace('<span class="spOptiBreak"> </span>', '')
    result = result.replace('\n', ' ')
    result = result.replace('  ', ' ')
    result = result.strip()
    # unescape only the entities the teaser markup is known to contain
    for entity, character in (('&amp;', '&'), ('&apos;', '\''), ('&quot;', '"')):
        result = result.replace(entity, character)
    return result
def format_section(string):
    """Capitalize a section slug, upper-casing any embedded 'spiegel'."""
    head = string[:1].upper()
    tail = string[1:].replace('spiegel', 'SPIEGEL')
    return head + tail
def format_subsection(string):
    """Map a subsection slug to its display name.

    Known slugs come from a fixed table (SPIEGEL, SPIEGEL special);
    anything else is simply capitalized.
    """
    display = {
        'abi': 'Abi - und dann?',
        'formel1': 'Formel 1',
        'jobundberuf': 'Job & Beruf',
        'leben': 'Leben U21',
        'mensch': 'Mensch & Technik',
        'sonst': '',
        'staedte': u'St\xc3dte',
        'ussports': 'US-Sports',
        'wunderbar': 'wunderBAR'
    }.get(string)
    if display is not None:
        # transliterate the stored non-ASCII byte to 'ae'
        return display.replace(u'\xc3', 'ae')
    return string[:1].upper() + string[1:]
def get_issue(year, week):
    # Fetch metadata for one print issue of Der Spiegel: the cover image
    # URL, the table of contents and per-page scan URLs.
    # Returns None when no cover exists for the given year/week (issue
    # not available); otherwise a dict with 'pages', 'contents',
    # 'coverUrl' and 'pageUrl' keys.
    coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
    if not ox.net.exists(coverUrl):
        return None
    url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
    contents = []
    data = ox.cache.read_url(url)
    # table-of-contents links; the pattern has a single capture group, so
    # findall() yields plain strings
    items = re.compile(r'<a.?href="http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=".?>(.*?)</a>').findall(data)
    for item in items:
        # NOTE(review): item is a string here (single-group findall), so
        # item[1] is its second character, not a tuple element — looks like
        # a leftover from an earlier multi-group pattern; verify against
        # the live markup.
        item = item[1]
        page = int(re.compile('&amp;SE=(.*?)"').findall(item)[0])
        title = strip_tags(item).strip()
        contents.append({'title': title, 'page': page})
    pageUrl = {}
    # NOTE(review): relies on the loop variable 'page' surviving the loop;
    # raises NameError if no table-of-contents entries were found.
    pages = page + 2
    # probe a few pages past the last known content page for scan images;
    # pages without a scan get an empty string
    for page in range(1, pages + 10):
        url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
        if ox.cache.exists(url):
            pageUrl[page] = url
        else:
            pageUrl[page] = ''
    return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}
def archive_issues():
    '''
    this is just an example of an archiving application

    Walks all issues from the current week back to 1994, storing per issue
    a JSON dump, a plain-text table of contents, the cover image and every
    available page scan under archivePath, and prints running page-count
    statistics (min / average / max).
    '''
    p = {}  # running statistics: num, sum, min, max of issue page counts
    import os
    from ox.utils import json
    import time
    archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Der Spiegel'
    localtime = time.localtime()
    year = int(time.strftime('%Y', localtime))
    week = int(time.strftime('%W', localtime))
    for y in range(year, 1993, -1):
        if y == year:
            wMax = week + 1
        else:
            wMax = 53
        for w in range(wMax, 0, -1):
            print('get_issue(%d, %d)' % (y, w))
            issue = get_issue(y, w)
            if issue:
                dirname = '%s/%d/%02d' % (archivePath, y, w)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w)
                if not os.path.exists(filename):
                    data = json.dumps(issue, ensure_ascii=False)
                    # with-statement closes the file even if write() fails
                    with open(filename, 'w') as f:
                        f.write(data)
                filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w)
                if not os.path.exists(filename):
                    data = '\n'.join(
                        '%3d %s' % (item['page'], item['title'])
                        for item in issue['contents']
                    )
                    with open(filename, 'w') as f:
                        f.write(data)
                filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
                if not os.path.exists(filename):
                    data = ox.cache.read_url(issue['coverUrl'])
                    # image data is binary: 'wb', not text-mode 'w'
                    with open(filename, 'wb') as f:
                        f.write(data)
                for page in issue['pageUrl']:
                    url = issue['pageUrl'][page]
                    if url:
                        filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
                        if not os.path.exists(filename):
                            data = ox.cache.read_url(url)
                            with open(filename, 'wb') as f:
                                f.write(data)
                # update the running page-count statistics
                if not p:
                    p = {'num': 1, 'sum': issue['pages'], 'min': issue['pages'], 'max': issue['pages']}
                else:
                    p['num'] += 1
                    p['sum'] += issue['pages']
                    if issue['pages'] < p['min']:
                        p['min'] = issue['pages']
                    if issue['pages'] > p['max']:
                        p['max'] = issue['pages']
                print(p['min'], p['sum'] / p['num'], p['max'])
def archive_news():
    '''
    this is just an example of an archiving application

    Walks every day from yesterday back to 2000, storing per teaser a JSON
    dump, a text file (kicker / headline / description) and the teaser
    image under archivePath, then prints per-section teaser counts and any
    titles whose split disagrees with title1/title2.
    '''
    import os
    from ox.utils import json
    import time
    count = {}   # section[/subsection] -> {'count': n, 'string': sample line}
    colon = []   # titles where split_title() disagrees with title1/title2
    archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'
    days = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    localtime = time.localtime()
    year = int(time.strftime('%Y', localtime))
    month = int(time.strftime('%m', localtime))
    day = int(time.strftime('%d', localtime)) - 1  # start from yesterday
    for y in range(year, 1999, -1):
        if y == year:
            mMax = month
        else:
            mMax = 12
        for m in range(mMax, 0, -1):
            if y == year and m == month:
                dMax = day
            elif m == 2 and y % 4 == 0 and (y % 100 != 0 or y % 400 == 0):
                # proper Gregorian leap-year rule; the previous condition
                # 'y % 4 == 0 and y % 400 != 0' wrongly treated 2000 as a
                # non-leap year
                dMax = days[m] + 1
            else:
                dMax = days[m]
            for d in range(dMax, 0, -1):
                print('get_news(%d, %d, %d)' % (y, m, d))
                news = get_news(y, m, d)
                for new in news:
                    # directory layout: YYYY/MMDD/HHMM
                    dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)
                    if new['url'][-5:] == '.html':
                        filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
                    else:
                        filename = dirname + '/' + new['url'] + '.json'
                    # '... or True' deliberately forces a rewrite every run
                    if not os.path.exists(filename) or True:
                        data = json.dumps(new, ensure_ascii=False)
                        # with-statement closes the file even if write() fails
                        with open(filename, 'w') as f:
                            f.write(data)
                    filename = filename[:-5] + '.txt'
                    if not os.path.exists(filename) or True:
                        data = split_title(new['title'])
                        data.append(new['description'])
                        with open(filename, 'w') as f:
                            f.write('\n'.join(data))
                    filename = dirname + '/' + new['imageUrl'].split('/')[-1]
                    if not os.path.exists(filename):
                        data = ox.cache.read_url(new['imageUrl'])
                        # image data is binary: 'wb', not text-mode 'w'
                        with open(filename, 'wb') as f:
                            f.write(data)
                    # tally teasers per section (and subsection, if any)
                    strings = new['url'].split('/')
                    string = strings[3]
                    if len(strings) == 6:
                        string += '/' + strings[4]
                    if string not in count:
                        count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
                    else:
                        count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
                    strings = split_title(new['title'])
                    if strings[0] != new['title1'] or strings[1] != new['title2']:
                        colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2']))
    for key in sorted(count):
        print('%6d %-24s %s' % (count[key]['count'], key, count[key]['string']))
    for value in colon:
        print(value)
if __name__ == '__main__':
    # Ad-hoc test drivers kept from earlier revisions; running the module
    # as a script currently archives all news (see archive_news above).
    # spiegel = Spiegel(2008, 8)
    # print(spiegel.getContents())
    # news = News(2001, 9, 10)
    # output(news.get_news())
    '''
    x = []
    for d in range(10, 30):
        print('2/%d' % d)
        news = get_news(2008, 2, d)
        for new in news:
            strings = new['url'].split('/')
            string = format_section(strings[3])
            if len(strings) == 6:
                string += '/' + format_subsection(strings[4])
            if not string in x:
                x.append(string)
    print(x)
    '''
    # archive_issues()
    archive_news()