From d04877e1a2ceb9e50c7cc284fe15d9e86f6ddc11 Mon Sep 17 00:00:00 2001 From: Rolux Date: Wed, 7 May 2008 10:15:25 +0200 Subject: [PATCH] itunes.py: getting rid of findString() --- ox/itunes.py | 39 +++++++++--------- ox/spiegel.py | 107 +++++++++++++++++++++++++++++++++++++------------- 2 files changed, 99 insertions(+), 47 deletions(-) diff --git a/ox/itunes.py b/ox/itunes.py index 7e94403..da9c6b3 100644 --- a/ox/itunes.py +++ b/ox/itunes.py @@ -2,6 +2,8 @@ import re import urllib from oxutils.cache import getUrl +from oxutils.html import decodeHtml, stripTags +from oxutils.text import findRe from oxutils.text import findString # to sniff itunes traffic, use something like @@ -42,17 +44,16 @@ def parseXmlDict(xml): strings = xml.split('') for string in strings: if string.find('') != -1: - key = findString(string, '', '') - type = findString(string, '<', '>') + key = findRe(string, '(.*?)') + type = findRe(string, '<(.*?)>') if type == 'true/': value = True else: - value = findString(string, '<%s>' % type, '' % type) + value = findRe(string, '<%s>(.*?)' % (type, type)) if type == 'integer': value = int(value) elif type == 'string': - value = value.replace('&', '&') - value = value.replace(''', '\'') + value = decodeHtml(value) values[key] = value return values @@ -65,28 +66,28 @@ class ItunesAlbum: def getId(self): url = composeUrl('advancedSearch', {'title': self.title, 'artist': self.artist}) xml = getUrl(url, None, ITUNES_HEADERS) - id = findString(xml, 'viewAlbum?id=', '&') + id = findRe(xml, 'viewAlbum\?id=(.*?)&') return id def getData(self): data = {'id': self.id} url = composeUrl('viewAlbum', {'id': self.id}) xml = getUrl(url, None, ITUNES_HEADERS) - xml = findString(xml, '') - data['albumName'] = findString(xml, '', '<') - data['artistName'] = findString(xml, '', '<') - data['coverUrl'] = findString(xml, 'reflection="1" url="', '"') - data['genre'] = findString(xml, 'Genre: ', '<') - data['releaseDate'] = findString(xml, 'Released', '<') - data['review'] = findString(findString(xml, 'REVIEW'), '', '') + data['albumName'] = findRe(xml, '(.*?)') + data['artistName'] = findRe(xml, '(.*?)') + data['coverUrl'] = findRe(xml, 'reflection="1" url="(.*?)"') + data['genre'] = findRe(xml, 'Genre:(.*?)<') + data['releaseDate'] = findRe(xml, 'Released(.*?)<') + data['review'] = stripTags(findRe(xml, 'REVIEW.*?(.*?)')) data['tracks'] = [] - string = findString(findString(xml, 'items', ''), '') - strings = string.split('') + strings = findRe(xml, 'items.*?(.*?)$').split('') for string in strings: - data['tracks'].append(parseXmlDict(string)) - data['type'] = findString(xml, 'listType', '<') + data['tracks'].append(parseXmlDict(string)) + data['type'] = findRe(xml, 'listType(.*?)<') return data if __name__ == '__main__': - test = ItunesAlbum('So Red the Rose', 'Arcadia') - print test.getData() \ No newline at end of file + import simplejson + data = ItunesAlbum('So Red the Rose', 'Arcadia').getData() + print simplejson.dumps(data, sort_keys = True, indent = 4) + # print test.getData() \ No newline at end of file diff --git a/ox/spiegel.py b/ox/spiegel.py index 49481ef..492171e 100644 --- a/ox/spiegel.py +++ b/ox/spiegel.py @@ -26,7 +26,7 @@ def getNews(year, month, day): for item in re.compile('
(.*?)
', re.DOTALL).findall(item)[0]).strip() try: - description = re.compile('

(.*?)<', re.DOTALL).findall(item)[0].strip() + description = formatString(re.compile('

(.*?)<', re.DOTALL).findall(item)[0]) except: description = '' try: @@ -34,7 +34,7 @@ def getNews(year, month, day): except: imageUrl = '' try: - title = formatString(re.compile('title=[\'|"](.*?)[\'|"] />', re.DOTALL).findall(item)[0]) + title = formatString(re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL).findall(item)[0]).replace(' : ', ': ').replace('::', ':') except: title = '' if dateString[:10] == date and description and imageUrl and title.find(': ') != -1: @@ -49,21 +49,30 @@ def getNews(year, month, day): new['imageUrl'] = imageUrl new['section'] = formatSection(section) new['title'] = formatString(title) - new['url'] = 'http://www.spiegel.de' + re.compile('(.*?)', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf') + if new['title1'][-1:] == ':': + new['title1'] = new['title1'][0:-1] + new['title2'] = new['title'][len(new['title1']) + 2:] + new['url'] = re.compile(' ', '') string = string.replace('\n', ' ').replace(' ', ' ').strip() string = string.replace('&', '&').replace(''', '\'').replace('"', '"') return string @@ -72,6 +81,7 @@ def formatSection(string): return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL') def formatSubsection(string): + # SPIEGEL, SPIEGEL special subsection = { 'abi': 'Abi - und dann?', 'formel1': 'Formel 1', @@ -84,10 +94,10 @@ def formatSubsection(string): 'wunderbar': 'wunderBAR' } if subsection.has_key(string): - return subsection[string] + return subsection[string].replace(u'\xc3', 'ae') return string[:1].upper() + string[1:] -def getMagazine(year, week): +def getIssue(year, week): coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week) if not oxutils.net.exists(coverUrl): return None @@ -103,17 +113,18 @@ def getMagazine(year, week): pages = page + 2 for page in range(1, pages + 10): url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page) - if oxutils.net.exists(url): + if oxutils.cache.exists(url): pageUrl[page] = url else: pageUrl[page] = '' return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl} -def archiveMagazines(): +def archiveIssues(): ''' this is just an example of an archiving application ''' + p = {} import os import simplejson import time @@ -127,22 +138,22 @@ def archiveMagazines(): else: wMax = 53 for w in range(wMax, 0, -1): - print '%2d/%d' % (w, y) - magazine = getMagazine(y, w) - if magazine: + print 'getIssue(%d, %d)' % (y, w) + issue = getIssue(y, w) + if issue: dirname = '%s/%d/%02d' % (archivePath, y, w) if not os.path.exists(dirname): os.makedirs(dirname) filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w) - if not os.path.exists(filename) or True: - data = simplejson.dumps(magazine, ensure_ascii = False) + if not os.path.exists(filename): + data = simplejson.dumps(issue, ensure_ascii = False) f = open(filename, 'w') f.write(data) f.close() filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w) - if not os.path.exists(filename) or True: + if not os.path.exists(filename): data = [] - for item in magazine['contents']: + for item in issue['contents']: data.append('%3d %s' % (item['page'], item['title'])) data = '\n'.join(data) f = open(filename, 'w') @@ -150,12 +161,12 @@ def archiveMagazines(): f.close() filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w) if not os.path.exists(filename): - data = oxutils.cache.getUrl(magazine['coverUrl']) + data = oxutils.cache.getUrl(issue['coverUrl']) f = open(filename, 'w') f.write(data) f.close() - for page in magazine['pageUrl']: - url = magazine['pageUrl'][page] + for page in issue['pageUrl']: + url = issue['pageUrl'][page] if url: filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page) if not os.path.exists(filename): @@ -163,7 +174,17 @@ def archiveMagazines(): f = open(filename, 'w') f.write(data) f.close() - + if not p: + p = {'num': 1, 'sum': issue['pages'], 'min': issue['pages'], 'max': issue['pages']} + else: + p['num'] += 1 + p['sum'] += issue['pages'] + if issue['pages'] < p['min']: + p['min'] = issue['pages'] + if issue['pages'] > p['max']: + p['max'] = issue['pages'] + print p['min'], p['sum'] / p['num'], p['max'] + def archiveNews(): ''' @@ -172,35 +193,45 @@ def archiveNews(): import os import simplejson import time + + count = {} + colon = [] + archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online' - days = [0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] + days = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] localtime = time.localtime() year = int(time.strftime('%Y', localtime)) month = int(time.strftime('%m', localtime)) - day = int(time.strftime('%d', localtime)) + day = int(time.strftime('%d', localtime)) - 1 for y in range(year, 1999, -1): if y == year: mMax = month else: - mMax = m + mMax = 12 for m in range(mMax, 0, -1): if y == year and m == month: dMax = day + elif m == 2 and y % 4 == 0 and y % 400 != 0: + dMax = days[m] + 1 else: dMax = days[m] for d in range(dMax, 0, -1): + print 'getNews(%d, %d, %d)' % (y, m, d) news = getNews(y, m ,d) for new in news: dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16] if not os.path.exists(dirname): os.makedirs(dirname) - filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json' + if new['url'][-5:] == '.html': + filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json' + else: + filename = dirname + '/' + new['url'] + '.json' if not os.path.exists(filename) or True: data = simplejson.dumps(new, ensure_ascii = False) f = open(filename, 'w') f.write(data) f.close() - filename = filename[:-4] + 'txt' + filename = filename[:-5] + '.txt' if not os.path.exists(filename) or True: data = splitTitle(new['title']) data.append(new['description']) @@ -215,6 +246,26 @@ def archiveNews(): f.write(data) f.close() + strings = new['url'].split('/') + string = strings[3] + if len(strings) == 6: + string += '/' + strings[4] + if not count.has_key(string): + count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))} + else: + count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])} + strings = splitTitle(new['title']) + if strings[0] != new['title1'] or strings[1] != new['title2']: + colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2'])) + for key in sortDictByKey(count): + print '%6d %-24s %s' % (count[key]['count'], key, count[key]['string']) + for value in colon: + print value + +def sortDictByKey(d): + keys = d.keys() + keys.sort() + return keys if __name__ == '__main__': # spiegel = Spiegel(2008, 8) @@ -235,5 +286,5 @@ if __name__ == '__main__': x.append(string) print x ''' - archiveMagazines() + # archiveIssues() archiveNews() \ No newline at end of file