# python-oxweb/ox/spiegel.py
from datetime import datetime
import re

from BeautifulSoup import BeautifulSoup
from oxutils.cache import getUrl
from oxutils.html import stripTags
import oxutils.net

def output(news):
    for new in news:
        print '\n'.join([
            new['section'] + ', ' + new['dateString'],
            new['title0'] + ': ' + new['title1'],
            new['description'],
            ''
        ])
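
# Hedged note on the printed format: dateString is taken verbatim from the
# page (e.g. '30.04.2008, 13:37'), so each item prints as a block like
#
#     Netzwelt, 30.04.2008, 13:37
#     Title0: Title1
#     Description text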

def getNews(year, month, day):
    sections = [
        'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
        'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
    ]
    dt = datetime(year, month, day)
    day = int(dt.strftime('%j'))  # day of year, as used in the archive URLs
    date = dt.strftime('%d.%m.%Y')
    news = []
    for section in sections:
        url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day)
        print url
        html = getUrl(url)
        for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
            dateString = stripTags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
            try:
                description = re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0].strip()
            except IndexError:
                description = ''
            try:
                imageUrl = re.compile('<img src="(.*?)"').findall(item)[0]
            except IndexError:
                imageUrl = ''
            if dateString[:10] == date and description and imageUrl:
                # print item
                new = {}
                # dateString starts with 'DD.MM.YYYY'; a time of day, if
                # present, sits at offsets 12:14 (hour) and 15:17 (minute)
                if len(dateString) == 10:
                    new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
                else:
                    new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
                new['dateString'] = dateString
                new['description'] = formatString(description)
                new['imageUrl'] = imageUrl
                new['section'] = formatSection(section)
                new['title'] = formatString(re.compile('title=[\'"](.*?)[\'"] />', re.DOTALL).findall(item)[0])
                # titles are assumed to follow the 'Kicker: Headline' pattern
                new['title0'] = re.compile('(.*?): ').findall(new['title'])[0]
                new['title1'] = re.compile(': (.*?)$').findall(new['title'])[0]
                new['url'] = 'http://www.spiegel.de' + re.compile('<a href="(.*?)"').findall(item)[0]
                news.append(new)
                print dateString + ' - ok'
            elif not description:
                print dateString + ' - no description'
            elif not imageUrl:
                print dateString + ' - no image'
    return news
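
# A minimal usage sketch, assuming network access and that spiegel.de still
# serves these 2008-era archive URLs:
#
#     news = getNews(2008, 4, 30)
#     for new in news:
#         print new['date'], new['section'], new['url']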

def formatString(string):
    # collapse newlines and doubled spaces into single spaces
    return string.replace('\n', ' ').replace('  ', ' ').strip()

def formatSection(string):
    return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')

def formatSubsection(string):
    subsection = {
        'abi': 'Abi - und dann?',
        'formel1': 'Formel 1',
        'jobundberuf': 'Job & Beruf',
        'leben': 'Leben U21',
        'mensch': 'Mensch & Technik',
        'sonst': '',
        'staedte': u'St\xe4dte',
        'ussports': 'US-Sports',
        'wunderbar': 'wunderBAR'
    }
    if string in subsection:
        return subsection[string]
    return string[:1].upper() + string[1:]
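
# Examples for the three helpers above (inputs are URL path components):
#
#     formatSection('netzwelt')     -> 'Netzwelt'
#     formatSection('schulspiegel') -> 'SchulSPIEGEL'
#     formatSubsection('formel1')   -> 'Formel 1'
#     formatString('a\nb  c')       -> 'a b c'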

class Spiegel:
    def __init__(self, year, week):
        self.year = year
        self.week = week
        self.coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d0%02d0001-312.jpg' % (self.year, self.week, self.year, self.week)
        if not oxutils.net.exists(self.coverUrl):
            # no cover found: leave coverUrl empty and return early, so
            # contentsUrl is only set for issues that actually exist
            self.coverUrl = ''
            return
        self.contentsUrl = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (self.year, self.week)

    def getContents(self):
        self.contents = []
        soup = BeautifulSoup(getUrl(self.contentsUrl))
        for item in soup('a', {'href': re.compile('http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')}):
            item = str(item)
            title = stripTags(item).strip()
            page = int(re.compile('&amp;SE=(.*?)"').findall(item)[0])
            self.contents.append({'title': title, 'page': page})
        return self.contents
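
# A minimal usage sketch (2008, week 8 is just an example; coverUrl stays
# empty when oxutils.net.exists() cannot find the cover image):
#
#     spiegel = Spiegel(2008, 8)
#     if spiegel.coverUrl:
#         for entry in spiegel.getContents():
#             print '%3d %s' % (entry['page'], entry['title'])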

def archiveNews():
    '''
    this is just an example of an archiving application
    '''
    import os
    import simplejson
    archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de'
    for y in range(2007, 2008):
        for m in range(1, 13):
            for d in range(1, 32):
                try:
                    news = getNews(y, m, d)
                except ValueError:
                    # skip impossible dates like February 30
                    continue
                for new in news:
                    dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)
                    filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
                    data = simplejson.dumps(new, ensure_ascii = False)
                    f = open(filename, 'w')
                    # dumps() can return unicode when ensure_ascii is False
                    f.write(data.encode('utf-8'))
                    f.close()
                    filename = filename[:-4] + 'txt'
                    data = '\n'.join([new['title0'], new['title1'], new['description']])
                    f = open(filename, 'w')
                    f.write(data)
                    f.close()
                    filename = dirname + '/' + new['imageUrl'].split('/')[-1]
                    data = getUrl(new['imageUrl'])
                    f = open(filename, 'wb')  # image data is binary
                    f.write(data)
                    f.close()
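
# Hedged sketch of the resulting archive layout: one directory per article
# timestamp (YYYY/MMDD/HHMM), each holding a .json, a .txt and the teaser
# image, e.g.
#
#     spiegel.de/2007/0102/1215/<article-id>.json
#     spiegel.de/2007/0102/1215/<article-id>.txt
#     spiegel.de/2007/0102/1215/<image-name>.jpg
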
if __name__ == '__main__':
    # spiegel = Spiegel(2008, 8)
    # print spiegel.getContents()
    # news = News(2001, 9, 10)
    # output(news.getNews())
    '''
    x = []
    for d in range(10, 30):
        print '2/%d' % d
        news = getNews(2008, 2, d)
        for new in news:
            strings = new['url'].split('/')
            string = formatSection(strings[3])
            if len(strings) == 6:
                string += '/' + formatSubsection(strings[4])
            if not string in x:
                x.append(string)
    print x
    '''
    archiveNews()