python-oxweb/oxweb/spiegel.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import time

from BeautifulSoup import BeautifulSoup

import oxlib.cache
from oxlib.html import decodeHtml, stripTags
import oxlib.net


def getNews(year, month, day):
    '''
    Collect the teasers published on one day from the spiegel.de section
    archive pages.

    For each section in a fixed list, the archive page for the given year
    and day-of-year is fetched (via oxlib.net.getUrl when the requested
    date is today, otherwise via oxlib.cache.getUrl) and every teaser block
    is parsed with regular expressions.

    A teaser is only kept if its date matches the requested day, it has a
    description and an image, and its title contains ': '.

    Returns a list of dicts with the keys 'date' ('YYYY-MM-DD HH:MM'),
    'description', 'imageUrl', 'section', 'title', 'title1', 'title2'
    and 'url'.

    Raises IndexError if a teaser has no spDateTime div, or if an accepted
    teaser has no <h4> headline or no link.
    '''
    sections = [
        'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
        'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
    ]
    # Compile every pattern once instead of once per teaser.
    teaserPattern = re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL)
    datePattern = re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL)
    descriptionPattern = re.compile('<p>(.*?)<', re.DOTALL)
    imagePattern = re.compile('<img src="(.*?)"')
    titlePattern = re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL)
    headlinePattern = re.compile('<h4>(.*?)</h4>', re.DOTALL)
    linkPattern = re.compile('<a href="(.*?)"')

    dt = datetime(year, month, day)
    # The archive URLs address a day by its number within the year.
    dayOfYear = int(dt.strftime('%j'))
    date = dt.strftime('%d.%m.%Y')
    news = []
    for section in sections:
        url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, dayOfYear)
        # Today's page is fetched with oxlib.net, any other day with oxlib.cache.
        if date == time.strftime('%d.%m.%Y', time.localtime()):
            html = oxlib.net.getUrl(url)
        else:
            html = oxlib.cache.getUrl(url)
        for item in teaserPattern.findall(html):
            dateString = stripTags(datePattern.findall(item)[0]).strip()
            # Optional fields: a missing match yields an empty string, which
            # makes the teaser fail the filter below. Only IndexError is
            # caught so that interrupts and real errors still propagate.
            try:
                description = formatString(descriptionPattern.findall(item)[0])
            except IndexError:
                description = ''
            try:
                imageUrl = imagePattern.findall(item)[0]
            except IndexError:
                imageUrl = ''
            try:
                title = formatString(titlePattern.findall(item)[0]).replace(' : ', ': ').replace('::', ':')
            except IndexError:
                title = ''
            if not (dateString[:10] == date and description and imageUrl and title.find(': ') != -1):
                continue
            new = {}
            # dateString starts with 'DD.MM.YYYY'; when it is longer, the
            # hour and minute are read from positions 12-13 and 15-16.
            if len(dateString) == 10:
                new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
            else:
                new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
            # TODO: fix decodeHtml, then use formatString(decodeHtml(description))
            new['description'] = formatString(description)
            new['imageUrl'] = imageUrl
            new['section'] = formatSection(section)
            new['title'] = formatString(title)
            # title1 is the leading part of the title, as long as the <h4>
            # headline. Each '\xdf' is doubled before slicing and restored
            # afterwards, so it counts as two characters in the slice
            # (presumably to match how the headline spells it -- confirm).
            headlineLength = len(formatString(headlinePattern.findall(item)[0]))
            new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:headlineLength].replace('\xdf\xdf', '\xdf')
            if new['title1'][-1:] == ':':
                new['title1'] = new['title1'][0:-1]
            # Skip the two characters following title1 (normally ': ').
            new['title2'] = new['title'][len(new['title1']) + 2:]
            new['url'] = linkPattern.findall(item)[0]
            # Turn site-relative links into absolute ones.
            if new['url'][:1] == '/':
                new['url'] = 'http://www.spiegel.de' + new['url']
            news.append(new)
    return news

def splitTitle(title):
    '''
    Split a title at its first ': ' and return [head, tail].

    Raises IndexError if the title contains no ': '.
    '''
    heads = re.findall('(.*?): ', title)
    tails = re.findall(': (.*?)$', title)
    return [heads[0], tails[0]]

def formatString(string):
    '''
    Clean up a text fragment scraped from a teaser.

    Removes the spOptiBreak span, turns newlines into spaces, collapses
    every run of spaces into a single space, strips the ends, and decodes
    the entities &apos;, &quot; and &amp;.
    '''
    string = string.replace('<span class="spOptiBreak"> </span>', '')
    string = string.replace('\n', ' ')
    # A single replace pass would leave runs of three or more spaces only
    # partly collapsed, so repeat until no double space remains.
    while '  ' in string:
        string = string.replace('  ', ' ')
    string = string.strip()
    # Decode '&amp;' last: decoding it first would turn '&amp;quot;' into
    # '&quot;' and then wrongly into '"'.
    string = string.replace('&apos;', '\'').replace('&quot;', '"').replace('&amp;', '&')
    return string

def formatSection(string):
    '''
    Turn a section slug into its display name: the first character is
    upper-cased, and 'spiegel' in the rest is written as 'SPIEGEL'.
    '''
    initial = string[:1].upper()
    remainder = string[1:].replace('spiegel', 'SPIEGEL')
    return initial + remainder

def formatSubsection(string):
    '''
    Turn a subsection slug into its display name.

    Known slugs are looked up in a fixed table; any other slug is returned
    with its first character upper-cased.
    '''
    # SPIEGEL, SPIEGEL special
    # NOTE(review): the line above is the original note; presumably these
    # two subsections are not covered by the table yet -- confirm.
    subsection = {
        'abi': 'Abi - und dann?',
        'formel1': 'Formel 1',
        'jobundberuf': 'Job & Beruf',
        'leben': 'Leben U21',
        'mensch': 'Mensch & Technik',
        'sonst': '',
        'staedte': u'St\xc3dte',
        'ussports': 'US-Sports',
        'wunderbar': 'wunderBAR'
    }
    # 'in' replaces dict.has_key(), which no longer exists in Python 3.
    if string in subsection:
        # The placeholder character stored in the 'staedte' label is
        # transliterated to 'ae' here.
        return subsection[string].replace(u'\xc3', 'ae')
    return string[:1].upper() + string[1:]
        
def getIssue(year, week):
    '''
    Look up one issue of the print magazine in the spiegel.de e-paper.

    Returns None if the cover image URL does not exist. Otherwise returns a
    dict with:
      'pages'    -- page number of the last contents entry plus 2
      'contents' -- list of {'title': ..., 'page': ...} from the contents page
      'coverUrl' -- URL of the cover image
      'pageUrl'  -- dict mapping page numbers 1 .. pages + 9 to the page
                    image URL, or to '' where no image exists

    Raises IndexError if a contents link carries no SE parameter, and
    ValueError if that parameter is not an integer.
    '''
    coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
    if not oxlib.net.exists(coverUrl):
        return None
    url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
    # Raw string so that the escaped question mark is not read as a string escape.
    linkPattern = re.compile(r'http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')
    pagePattern = re.compile('&amp;SE=(.*?)"')
    contents = []
    # Stays 0 when the contents page has no matching links, so the page
    # count below is always defined instead of raising a NameError.
    lastPage = 0
    soup = BeautifulSoup(oxlib.cache.getUrl(url))
    for item in soup('a', {'href': linkPattern}):
        item = str(item)
        lastPage = int(pagePattern.findall(item)[0])
        title = stripTags(item).strip()
        contents.append({'title': title, 'page': lastPage})
    pageUrl = {}
    pages = lastPage + 2
    # Probe ten page numbers beyond the estimate, since 'pages' is only
    # derived from the last contents entry.
    for page in range(1, pages + 10):
        url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
        if oxlib.cache.exists(url):
            pageUrl[page] = url
        else:
            pageUrl[page] = ''
    return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}


def archiveIssues():
    '''
    this is just an example of an archiving application

    Walks backwards from the current year to 1994 and from the latest week
    to week 1, calls getIssue() for every (year, week) and stores, below a
    hard-coded archive path, the issue as JSON, its contents as text, the
    cover image and every existing page image. Files that already exist
    are not written again. After each stored issue the minimum, average
    and maximum page count seen so far is printed.
    '''
    import os
    import simplejson
    import time

    def writeFile(filename, data, mode):
        # Close the handle even when the write fails.
        f = open(filename, mode)
        try:
            f.write(data)
        finally:
            f.close()

    p = {}
    archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Der Spiegel'
    localtime = time.localtime()
    year = int(time.strftime('%Y', localtime))
    week = int(time.strftime('%W', localtime))
    for y in range(year, 1993, -1):
        if y == year:
            wMax = week + 1
        else:
            wMax = 53
        for w in range(wMax, 0, -1):
            print('getIssue(%d, %d)' % (y, w))
            issue = getIssue(y, w)
            if not issue:
                continue
            dirname = '%s/%d/%02d' % (archivePath, y, w)
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w)
            if not os.path.exists(filename):
                writeFile(filename, simplejson.dumps(issue, ensure_ascii = False), 'w')
            filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w)
            if not os.path.exists(filename):
                lines = ['%3d %s' % (item['page'], item['title']) for item in issue['contents']]
                writeFile(filename, '\n'.join(lines), 'w')
            filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
            if not os.path.exists(filename):
                # Images are binary data, so they are written in binary mode.
                writeFile(filename, oxlib.cache.getUrl(issue['coverUrl']), 'wb')
            for page in issue['pageUrl']:
                url = issue['pageUrl'][page]
                # An empty URL marks a page for which no image exists.
                if url:
                    filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
                    if not os.path.exists(filename):
                        writeFile(filename, oxlib.cache.getUrl(url), 'wb')
            # Running page-count statistics over all issues found so far.
            if not p:
                p = {'num': 1, 'sum': issue['pages'], 'min': issue['pages'], 'max': issue['pages']}
            else:
                p['num'] += 1
                p['sum'] += issue['pages']
                p['min'] = min(p['min'], issue['pages'])
                p['max'] = max(p['max'], issue['pages'])
            # '//' keeps the average an integer on every Python version.
            print('%s %s %s' % (p['min'], p['sum'] // p['num'], p['max']))
            

def archiveNews():
    '''
    this is just an example of an archiving application

    Walks backwards day by day from yesterday to 2000-01-01, calls
    getNews() for every day and stores, below a hard-coded archive path,
    each teaser as JSON and as text, plus its image. After every month it
    prints how many teasers were seen per section/subsection, followed by
    the teasers whose title1/title2 differ from a plain split at the
    first ': '.
    '''
    import calendar
    import os
    import simplejson
    import time

    def writeFile(filename, data, mode):
        # Close the handle even when the write fails.
        f = open(filename, mode)
        try:
            f.write(data)
        finally:
            f.close()

    count = {}
    colon = []

    archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'
    localtime = time.localtime()
    year = int(time.strftime('%Y', localtime))
    month = int(time.strftime('%m', localtime))
    # Start with yesterday; today is not archived.
    day = int(time.strftime('%d', localtime)) - 1
    for y in range(year, 1999, -1):
        if y == year:
            mMax = month
        else:
            mMax = 12
        for m in range(mMax, 0, -1):
            if y == year and m == month:
                dMax = day
            else:
                # calendar applies the full leap-year rule; the former
                # hand-written test treated 2000 as a non-leap year and so
                # skipped 2000-02-29.
                dMax = calendar.monthrange(y, m)[1]
            for d in range(dMax, 0, -1):
                print('getNews(%d, %d, %d)' % (y, m, d))
                news = getNews(y, m, d)
                for new in news:
                    # new['date'] is 'YYYY-MM-DD HH:MM'; the directory is YYYY/MMDD/HHMM.
                    dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)
                    if new['url'][-5:] == '.html':
                        filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
                    else:
                        # NOTE(review): this appends the complete URL, slashes
                        # included, to the directory -- confirm this is intended.
                        filename = dirname + '/' + new['url'] + '.json'
                    # The JSON and text files are always rewritten (the original
                    # existence checks were disabled with 'or True').
                    writeFile(filename, simplejson.dumps(new, ensure_ascii = False), 'w')
                    filename = filename[:-5] + '.txt'
                    data = splitTitle(new['title'])
                    data.append(new['description'])
                    writeFile(filename, '\n'.join(data), 'w')
                    filename = dirname + '/' + new['imageUrl'].split('/')[-1]
                    if not os.path.exists(filename):
                        # Images are binary data, so they are written in binary mode.
                        writeFile(filename, oxlib.cache.getUrl(new['imageUrl']), 'wb')

                    # Count teasers per section, or section/subsection when the
                    # URL splits into exactly six parts.
                    strings = new['url'].split('/')
                    string = strings[3]
                    if len(strings) == 6:
                        string += '/' + strings[4]
                    if string not in count:
                        count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
                    else:
                        # Replace the leading date (16 characters plus a space)
                        # with the current one; the rest of the line is kept.
                        count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
                    # Remember titles whose stored split differs from a plain
                    # split at the first ': '.
                    strings = splitTitle(new['title'])
                    if strings[0] != new['title1'] or strings[1] != new['title2']:
                        colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2']))
            for key in sortDictByKey(count):
                print('%6d %-24s %s' % (count[key]['count'], key, count[key]['string']))
            for value in colon:
                print(value)

def sortDictByKey(d):
    '''
    Return the keys of d as a new list in ascending order.
    '''
    # sorted() also accepts the key views of Python 3, which have no sort().
    return sorted(d.keys())

# Script entry point: running this file directly starts the news archiver.
if __name__ == '__main__':
    # Disabled earlier experiments. Spiegel, News and output are not defined
    # in the visible part of this file.
    # spiegel = Spiegel(2008, 8)
    # print spiegel.getContents()
    # news = News(2001, 9, 10)
    # output(news.getNews())
    # The bare string below is an expression statement with no effect. It
    # holds disabled code that collects the section/subsection labels found
    # in the news of 2008-02-10 to 2008-02-29.
    '''
    x = []
    for d in range(10, 30):
        print '2/%d' % d
        news = getNews(2008, 2, d)
        for new in news:
            strings = new['url'].split('/')
            string = formatSection(strings[3])
            if len(strings) == 6:
                string += '/' + formatSubsection(strings[4])
            if not string in x:
                x.append(string)
        print x
    '''
    # The issue archiver is disabled; only the news archiver runs.
    # archiveIssues()
    archiveNews()
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:47:02 +00:00			`# -- coding: utf-8 --`
			`# vi:si:et:sw=4:sts=4:ts=4`
spiegel.py news 2008-04-30 18:51:27 +00:00			`from datetime import datetime`
adding spiegel.py 2008-04-30 12:24:33 +00:00			`import re`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`import time`
adding spiegel.py 2008-04-30 12:24:33 +00:00
			`from BeautifulSoup import BeautifulSoup`

rename oxutils -> oxlib 2008-07-03 09:24:49 +00:00			`import oxlib.cache`
			`from oxlib.html import decodeHtml, stripTags`
			`import oxlib.net`
spiegel.py news 2008-04-30 18:51:27 +00:00
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:47:02 +00:00
spiegel.py news 2008-04-30 18:51:27 +00:00			`def getNews(year, month, day):`
			`sections = [`
			`'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',`
			`'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'`
			`]`
			`dt = datetime(year, month, day)`
			`day = int(dt.strftime('%j'))`
			`date = dt.strftime('%d.%m.%Y')`
			`news = []`
			`for section in sections:`
			`url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day)`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`if date == time.strftime('%d.%m.%Y', time.localtime()):`
rename oxutils -> oxlib 2008-07-03 09:24:49 +00:00			`html = oxlib.net.getUrl(url)`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`else:`
rename oxutils -> oxlib 2008-07-03 09:24:49 +00:00			`html = oxlib.cache.getUrl(url)`
spiegel.py news 2008-04-30 18:51:27 +00:00			`for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):`
			`dateString = stripTags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()`
			`try:`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`description = formatString(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])`
spiegel.py news 2008-04-30 18:51:27 +00:00			`except:`
			`description = ''`
			`try:`
			`imageUrl = re.compile('<img src="(.*?)"').findall(item)[0]`
			`except:`
			`imageUrl = ''`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`try:`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`title = formatString(re.compile('alt=[\'\|"](.*?)[\'\|"] title=', re.DOTALL).findall(item)[0]).replace(' : ', ': ').replace('::', ':')`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`except:`
			`title = ''`
			`if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:`
spiegel.py news 2008-04-30 18:51:27 +00:00			`new = {}`
			`if len(dateString) == 10:`
			`new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])`
			`else:`
			`new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`# fix decodeHtml`
			`# new['description'] = formatString(decodeHtml(description))`
spiegel.py news 2008-04-30 18:51:27 +00:00			`new['description'] = formatString(description)`
			`new['imageUrl'] = imageUrl`
			`new['section'] = formatSection(section)`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`new['title'] = formatString(title)`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(formatString(re.compile('<h4>(.*?)</h4>', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf')`
			`if new['title1'][-1:] == ':':`
			`new['title1'] = new['title1'][0:-1]`
			`new['title2'] = new['title'][len(new['title1']) + 2:]`
			`new['url'] = re.compile('<a href="(.*?)"').findall(item)[0]`
			`if new['url'][:1] == '/':`
			`new['url'] = 'http://www.spiegel.de' + new['url']`
spiegel.py news 2008-04-30 18:51:27 +00:00			`news.append(new)`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`# print '%s, %s' % (new['section'], dateString)`
			`'''`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`elif dateString[:10] == date and not description:`
spiegel.py news 2008-04-30 18:51:27 +00:00			`print dateString + ' - no description'`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`elif dateString[:10] == date and not imageUrl:`
spiegel.py news 2008-04-30 18:51:27 +00:00			`print dateString + ' - no image'`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`'''`
spiegel.py news 2008-04-30 18:51:27 +00:00			`return news`

spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`def splitTitle(title):`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`title1 = re.compile('(.*?): ').findall(title)[0]`
			`title2 = re.compile(': (.*?)$').findall(title)[0]`
			`return [title1, title2]`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00
spiegel.py news 2008-04-30 18:51:27 +00:00			`def formatString(string):`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`string = string.replace('<span class="spOptiBreak"> </span>', '')`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`string = string.replace('\n', ' ').replace(' ', ' ').strip()`
			`string = string.replace('&', '&').replace(''', '\'').replace('"', '"')`
			`return string`
spiegel.py news 2008-04-30 18:51:27 +00:00
			`def formatSection(string):`
			`return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')`

			`def formatSubsection(string):`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`# SPIEGEL, SPIEGEL special`
spiegel.py news 2008-04-30 18:51:27 +00:00			`subsection = {`
			`'abi': 'Abi - und dann?',`
			`'formel1': 'Formel 1',`
			`'jobundberuf': 'Job & Beruf',`
			`'leben': 'Leben U21',`
			`'mensch': 'Mensch & Technik',`
			`'sonst': '',`
			`'staedte': u'St\xc3dte',`
			`'ussports': 'US-Sports',`
			`'wunderbar': 'wunderBAR'`
			`}`
			`if subsection.has_key(string):`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`return subsection[string].replace(u'\xc3', 'ae')`
spiegel.py news 2008-04-30 18:51:27 +00:00			`return string[:1].upper() + string[1:]`

itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`def getIssue(year, week):`
spiegel.py: get actual pages 2008-05-01 09:14:10 +00:00			`coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)`
rename oxutils -> oxlib 2008-07-03 09:24:49 +00:00			`if not oxlib.net.exists(coverUrl):`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`return None`
			`url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)`
			`contents = []`
rename oxutils -> oxlib 2008-07-03 09:24:49 +00:00			`soup = BeautifulSoup(oxlib.cache.getUrl(url))`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`for item in soup('a', {'href': re.compile('http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')}):`
			`item = str(item)`
			`page = int(re.compile('&SE=(.*?)"').findall(item)[0])`
			`title = stripTags(item).strip()`
			`contents.append({'title': title, 'page': page})`
spiegel.py: get actual pages 2008-05-01 09:14:10 +00:00			`pageUrl = {}`
			`pages = page + 2`
			`for page in range(1, pages + 10):`
			`url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)`
rename oxutils -> oxlib 2008-07-03 09:24:49 +00:00			`if oxlib.cache.exists(url):`
spiegel.py: get actual pages 2008-05-01 09:14:10 +00:00			`pageUrl[page] = url`
			`else:`
			`pageUrl[page] = ''`
			`return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}`
adding spiegel.py 2008-04-30 12:24:33 +00:00
spiegel.py news 2008-04-30 18:51:27 +00:00
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`def archiveIssues():`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`'''`
			`this is just an example of an archiving application`
			`'''`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`p = {}`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`import os`
			`import simplejson`
			`import time`
			`archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Der Spiegel'`
			`localtime = time.localtime()`
			`year = int(time.strftime('%Y', localtime))`
			`week = int(time.strftime('%W', localtime))`
			`for y in range(year, 1993, -1):`
			`if y == year:`
			`wMax = week + 1`
			`else:`
			`wMax = 53`
			`for w in range(wMax, 0, -1):`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`print 'getIssue(%d, %d)' % (y, w)`
			`issue = getIssue(y, w)`
			`if issue:`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`dirname = '%s/%d/%02d' % (archivePath, y, w)`
			`if not os.path.exists(dirname):`
			`os.makedirs(dirname)`
spiegel.py: get actual pages 2008-05-01 09:14:10 +00:00			`filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w)`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`if not os.path.exists(filename):`
			`data = simplejson.dumps(issue, ensure_ascii = False)`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`f = open(filename, 'w')`
			`f.write(data)`
			`f.close()`
spiegel.py: get actual pages 2008-05-01 09:14:10 +00:00			`filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w)`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`if not os.path.exists(filename):`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`data = []`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`for item in issue['contents']:`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`data.append('%3d %s' % (item['page'], item['title']))`
			`data = '\n'.join(data)`
			`f = open(filename, 'w')`
			`f.write(data)`
			`f.close()`
spiegel.py: get actual pages 2008-05-01 09:14:10 +00:00			`filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`if not os.path.exists(filename):`
rename oxutils -> oxlib 2008-07-03 09:24:49 +00:00			`data = oxlib.cache.getUrl(issue['coverUrl'])`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`f = open(filename, 'w')`
			`f.write(data)`
			`f.close()`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`for page in issue['pageUrl']:`
			`url = issue['pageUrl'][page]`
spiegel.py: get actual pages 2008-05-01 09:14:10 +00:00			`if url:`
			`filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)`
			`if not os.path.exists(filename):`
rename oxutils -> oxlib 2008-07-03 09:24:49 +00:00			`data = oxlib.cache.getUrl(url)`
spiegel.py: get actual pages 2008-05-01 09:14:10 +00:00			`f = open(filename, 'w')`
			`f.write(data)`
			`f.close()`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`if not p:`
			`p = {'num': 1, 'sum': issue['pages'], 'min': issue['pages'], 'max': issue['pages']}`
			`else:`
			`p['num'] += 1`
			`p['sum'] += issue['pages']`
			`if issue['pages'] < p['min']:`
			`p['min'] = issue['pages']`
			`if issue['pages'] > p['max']:`
			`p['max'] = issue['pages']`
			`print p['min'], p['sum'] / p['num'], p['max']`

spiegel.py news 2008-04-30 18:51:27 +00:00
			`def archiveNews():`
			`'''`
			`this is just an example of an archiving application`
			`'''`
			`import os`
			`import simplejson`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`import time`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00
			`count = {}`
			`colon = []`

spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`days = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`localtime = time.localtime()`
			`year = int(time.strftime('%Y', localtime))`
			`month = int(time.strftime('%m', localtime))`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`day = int(time.strftime('%d', localtime)) - 1`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`for y in range(year, 1999, -1):`
			`if y == year:`
			`mMax = month`
			`else:`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`mMax = 12`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`for m in range(mMax, 0, -1):`
			`if y == year and m == month:`
			`dMax = day`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`elif m == 2 and y % 4 == 0 and y % 400 != 0:`
			`dMax = days[m] + 1`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`else:`
			`dMax = days[m]`
			`for d in range(dMax, 0, -1):`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`print 'getNews(%d, %d, %d)' % (y, m, d)`
spiegel.py news 2008-04-30 18:51:27 +00:00			`news = getNews(y, m ,d)`
			`for new in news:`
			`dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]`
			`if not os.path.exists(dirname):`
			`os.makedirs(dirname)`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`if new['url'][-5:] == '.html':`
			`filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'`
			`else:`
			`filename = dirname + '/' + new['url'] + '.json'`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`if not os.path.exists(filename) or True:`
			`data = simplejson.dumps(new, ensure_ascii = False)`
			`f = open(filename, 'w')`
			`f.write(data)`
			`f.close()`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`filename = filename[:-5] + '.txt'`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`if not os.path.exists(filename) or True:`
			`data = splitTitle(new['title'])`
			`data.append(new['description'])`
			`data = '\n'.join(data)`
			`f = open(filename, 'w')`
			`f.write(data)`
			`f.close()`
spiegel.py news 2008-04-30 18:51:27 +00:00			`filename = dirname + '/' + new['imageUrl'].split('/')[-1]`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`if not os.path.exists(filename):`
rename oxutils -> oxlib 2008-07-03 09:24:49 +00:00			`data = oxlib.cache.getUrl(new['imageUrl'])`
spiegel.py: cleanup 2008-04-30 23:27:50 +00:00			`f = open(filename, 'w')`
			`f.write(data)`
			`f.close()`
spiegel.py news 2008-04-30 18:51:27 +00:00
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`strings = new['url'].split('/')`
			`string = strings[3]`
			`if len(strings) == 6:`
			`string += '/' + strings[4]`
			`if not count.has_key(string):`
			`count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}`
			`else:`
			`count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}`
			`strings = splitTitle(new['title'])`
			`if strings[0] != new['title1'] or strings[1] != new['title2']:`
			`colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2']))`
			`for key in sortDictByKey(count):`
			`print '%6d %-24s %s' % (count[key]['count'], key, count[key]['string'])`
			`for value in colon:`
			`print value`

			`def sortDictByKey(d):`
			`keys = d.keys()`
			`keys.sort()`
			`return keys`
adding spiegel.py 2008-04-30 12:24:33 +00:00
			`if __name__ == '__main__':`
spiegel.py news 2008-04-30 18:51:27 +00:00			`# spiegel = Spiegel(2008, 8)`
			`# print spiegel.getContents()`
			`# news = News(2001, 9, 10)`
			`# output(news.getNews())`
			`'''`
			`x = []`
			`for d in range(10, 30):`
			`print '2/%d' % d`
			`news = getNews(2008, 2, d)`
			`for new in news:`
			`strings = new['url'].split('/')`
			`string = formatSection(strings[3])`
			`if len(strings) == 6:`
			`string += '/' + formatSubsection(strings[4])`
			`if not string in x:`
			`x.append(string)`
			`print x`
			`'''`
itunes.py: getting rid of findString() 2008-05-07 08:15:25 +00:00			`# archiveIssues()`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:47:02 +00:00			`archiveNews()`