spiegel.py: cleanup

parent a360bba9b5
commit 1b04735c68

1 changed file with 119 additions and 51 deletions

ox/spiegel.py | 170
@@ -1,16 +1,13 @@
 from datetime import datetime
 import re
+import time
 
 from BeautifulSoup import BeautifulSoup
 
-from oxutils.cache import getUrl
-from oxutils.html import stripTags
+import oxutils.cache
+from oxutils.html import decodeHtml, stripTags
 import oxutils.net
 
-def output(news):
-    for new in news:
-        print '\n'.join([new['section'] + ', ' + new['dateString'], new['title0'] + ': ' + new['title1'], new['description'], ''])
-
 def getNews(year, month, day):
     sections = [
         'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
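Note: with 'import oxutils.cache' replacing 'from oxutils.cache import getUrl', every call site now spells out which fetch mode it uses. A rough illustration (the cache-vs-network behaviour is presumed from the module names, not documented in this commit):

    html = oxutils.net.getUrl(url)    # presumably always fetches over the network
    html = oxutils.cache.getUrl(url)  # presumably serves from a local cache when possible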
@@ -22,8 +19,10 @@ def getNews(year, month, day):
     news = []
     for section in sections:
         url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day)
-        print url
-        html = getUrl(url)
+        if date == time.strftime('%d.%m.%Y', time.localtime()):
+            html = oxutils.net.getUrl(url)
+        else:
+            html = oxutils.cache.getUrl(url)
         for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
             dateString = stripTags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
             try:
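Note: the new branch fetches today's archive page live and older pages through the cache, since a past day's archive page should no longer change. The variable date is not defined inside this hunk; judging from the '%d.%m.%Y' comparison here and the dateString[:10] == date check further down, it is presumably derived from the function arguments, along these lines (a sketch, not part of this commit):

    date = '%02d.%02d.%d' % (day, month, year)  # hypothetical; yields e.g. '01.10.2007'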
@@ -34,31 +33,40 @@ def getNews(year, month, day):
                 imageUrl = re.compile('<img src="(.*?)"').findall(item)[0]
             except:
                 imageUrl = ''
-            if dateString[:10] == date and description and imageUrl:
-                # print item
+            try:
+                title = formatString(re.compile('title=[\'|"](.*?)[\'|"] />', re.DOTALL).findall(item)[0])
+            except:
+                title = ''
+            if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:
                 new = {}
                 if len(dateString) == 10:
                     new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
                 else:
                     new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
-                new['dateString'] = dateString
+                # fix decodeHtml
+                # new['description'] = formatString(decodeHtml(description))
                 new['description'] = formatString(description)
                 new['imageUrl'] = imageUrl
                 new['section'] = formatSection(section)
-                new['title'] = formatString(re.compile('title=[\'|"](.*?)[\'|"] />', re.DOTALL).findall(item)[0])
-                new['title0'] = re.compile('(.*?): ').findall(new['title'])[0]
-                new['title1'] = re.compile(': (.*?)$').findall(new['title'])[0]
+                new['title'] = formatString(title)
                 new['url'] = 'http://www.spiegel.de' + re.compile('<a href="(.*?)"').findall(item)[0]
                 news.append(new)
                 print dateString + ' - ok'
-            elif not description:
+            elif dateString[:10] == date and not description:
                 print dateString + ' - no description'
-            elif not imageUrl:
+            elif dateString[:10] == date and not imageUrl:
                 print dateString + ' - no image'
     return news
 
+def splitTitle(title):
+    title0 = re.compile('(.*?): ').findall(title)[0]
+    title1 = re.compile(': (.*?)$').findall(title)[0]
+    return [title0, title1]
+
 def formatString(string):
-    return string.replace('\n', ' ').replace('  ', ' ').strip()
+    string = string.replace('\n', ' ').replace('  ', ' ').strip()
+    string = string.replace('&amp;', '&').replace('&apos;', '\'').replace('&quot;', '"')
+    return string
 
 def formatSection(string):
     return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')
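Note: splitTitle factors the old title0/title1 regexes out of getNews, so the two halves can be recovered from new['title'] on demand. A minimal usage sketch with a made-up title:

    title0, title1 = splitTitle('Fussball: Deutschland gewinnt')  # hypothetical input
    # title0 == 'Fussball', title1 == 'Deutschland gewinnt'

Titles without ': ' never reach splitTitle from this module, because getNews now requires title.find(': ') != -1 before it builds an entry.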
@@ -79,26 +87,66 @@ def formatSubsection(string):
         return subsection[string]
     return string[:1].upper() + string[1:]
 
-class Spiegel:
-    def __init__(self, year, week):
-        self.year = year
-        self.week = week
-        self.coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d0%02d0001-312.jpg' % (self.year, self.week, self.year, self.week)
-        if not oxutils.net.exists(self.coverUrl):
-            self.coverUrl = ''
-            return
-        self.contentsUrl = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (self.year, self.week)
-
-    def getContents(self):
-        self.contents = []
-        soup = BeautifulSoup(getUrl(self.contentsUrl))
-        for item in soup('a', {'href': re.compile('http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')}):
-            item = str(item)
-            title = stripTags(item).strip()
-            page = int(re.compile('&SE=(.*?)"').findall(item)[0])
-            self.contents.append({'title': title, 'page': page})
-        return self.contents
+def getMagazine(year, week):
+    coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d0%02d0001-312.jpg' % (year, week, year, week)
+    if not oxutils.net.exists(coverUrl):
+        return None
+    url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
+    contents = []
+    soup = BeautifulSoup(oxutils.cache.getUrl(url))
+    for item in soup('a', {'href': re.compile('http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')}):
+        item = str(item)
+        page = int(re.compile('&SE=(.*?)"').findall(item)[0])
+        title = stripTags(item).strip()
+        contents.append({'title': title, 'page': page})
+    return {'contents': contents, 'coverUrl': coverUrl}
+
+def archiveMagazines():
+    '''
+    this is just an example of an archiving application
+    '''
+    import os
+    import simplejson
+    import time
+    archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Der Spiegel'
+    localtime = time.localtime()
+    year = int(time.strftime('%Y', localtime))
+    week = int(time.strftime('%W', localtime))
+    for y in range(year, 1993, -1):
+        if y == year:
+            wMax = week + 1
+        else:
+            wMax = 53
+        for w in range(wMax, 0, -1):
+            print '%2d/%d' % (w, y)
+            magazine = getMagazine(y, w)
+            if magazine:
+                dirname = '%s/%d/%02d' % (archivePath, y, w)
+                if not os.path.exists(dirname):
+                    os.makedirs(dirname)
+                filename = '%s/Der Spiegel %d-%02d.json' % (dirname, y, w)
+                if not os.path.exists(filename) or True:
+                    data = simplejson.dumps(magazine, ensure_ascii = False)
+                    f = open(filename, 'w')
+                    f.write(data)
+                    f.close()
+                filename = '%s/Der Spiegel %d-%02d.txt' % (dirname, y, w)
+                if not os.path.exists(filename) or True:
+                    data = []
+                    for item in magazine['contents']:
+                        data.append('%3d %s' % (item['page'], item['title']))
+                    data = '\n'.join(data)
+                    f = open(filename, 'w')
+                    f.write(data)
+                    f.close()
+                filename = '%s/Der Spiegel %d-%02d.jpg' % (dirname, y, w)
+                if not os.path.exists(filename):
+                    data = oxutils.cache.getUrl(magazine['coverUrl'])
+                    f = open(filename, 'w')
+                    f.write(data)
+                    f.close()
 
 
 def archiveNews():
     '''
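Note: getMagazine treats the existence of the cover image as a proxy for whether an issue exists and returns None otherwise, which is what lets archiveMagazines probe year/week combinations blindly. A minimal usage sketch (the issue number is made up):

    magazine = getMagazine(2007, 42)  # week 42 of 2007, hypothetical
    if magazine:
        print magazine['coverUrl']
        for item in magazine['contents']:
            print '%3d %s' % (item['page'], item['title'])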
@@ -106,30 +154,49 @@ def archiveNews():
     '''
     import os
     import simplejson
-    archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de'
-    for y in range(2007, 2008):
-        for m in range(1, 13):
-            for d in range(1, 32):
+    import time
+    archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'
+    days = [0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
+    localtime = time.localtime()
+    year = int(time.strftime('%Y', localtime))
+    month = int(time.strftime('%m', localtime))
+    day = int(time.strftime('%d', localtime))
+    for y in range(year, 1999, -1):
+        if y == year:
+            mMax = month
+        else:
+            mMax = m
+        for m in range(mMax, 0, -1):
+            if y == year and m == month:
+                dMax = day
+            else:
+                dMax = days[m]
+            for d in range(dMax, 0, -1):
                 news = getNews(y, m ,d)
                 for new in news:
                     dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
                     if not os.path.exists(dirname):
                         os.makedirs(dirname)
                     filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
-                    data = simplejson.dumps(new, ensure_ascii = False)
-                    f = open(filename, 'w')
-                    f.write(data)
-                    f.close()
+                    if not os.path.exists(filename) or True:
+                        data = simplejson.dumps(new, ensure_ascii = False)
+                        f = open(filename, 'w')
+                        f.write(data)
+                        f.close()
                     filename = filename[:-4] + 'txt'
-                    data = '\n'.join([new['title0'], new['title1'], new['description']])
-                    f = open(filename, 'w')
-                    f.write(data)
-                    f.close()
+                    if not os.path.exists(filename) or True:
+                        data = splitTitle(new['title'])
+                        data.append(new['description'])
+                        data = '\n'.join(data)
+                        f = open(filename, 'w')
+                        f.write(data)
+                        f.close()
                     filename = dirname + '/' + new['imageUrl'].split('/')[-1]
-                    data = getUrl(new['imageUrl'])
-                    f = open(filename, 'w')
-                    f.write(data)
-                    f.close()
+                    if not os.path.exists(filename):
+                        data = oxutils.cache.getUrl(new['imageUrl'])
+                        f = open(filename, 'w')
+                        f.write(data)
+                        f.close()
 
 
 if __name__ == '__main__':
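Note: archiveNews now walks backwards from today instead of iterating over a hard-coded 2007 range, and mMax/dMax keep it from requesting future dates. Two things stand out: in the else branch, mMax = m picks up the leftover loop variable from the previous year rather than a constant (mMax = 12 looks like the intended value), and the 'or True' in the JSON and text guards means those files are rewritten on every run, while images are skipped once present. For a story dated '2007-10-01 14:30', files land under a path like:

    /Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online/2007/1001/1430/<id>.json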
@@ -151,4 +218,5 @@ if __name__ == '__main__':
         x.append(string)
     print x
     '''
+    archiveMagazines()
     archiveNews()
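Note: with archiveMagazines() added to __main__, running the module archives both the weekly magazine and the daily news. To exercise getNews on its own, something along these lines should work (the date is arbitrary):

    news = getNews(2007, 10, 1)
    for new in news:
        print new['date'], new['title']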