merge changes from mars, itunes, spiegel

j 2008-05-07 11:50:09 +02:00
commit 82a1dca6c3
2 changed files with 99 additions and 47 deletions

File 1 of 2:

@@ -2,6 +2,8 @@ import re
 import urllib
 from oxutils.cache import getUrl
+from oxutils.html import decodeHtml, stripTags
+from oxutils.text import findRe
 from oxutils.text import findString

 # to sniff itunes traffic, use something like
@@ -42,17 +44,16 @@ def parseXmlDict(xml):
     strings = xml.split('<key>')
     for string in strings:
         if string.find('</key>') != -1:
-            key = findString(string, '', '</key>')
-            type = findString(string, '</key><', '>')
+            key = findRe(string, '(.*?)</key>')
+            type = findRe(string, '</key><(.*?)>')
             if type == 'true/':
                 value = True
             else:
-                value = findString(string, '<%s>' % type, '</%s>' % type)
+                value = findRe(string, '<%s>(.*?)</%s>' % (type, type))
                 if type == 'integer':
                     value = int(value)
                 elif type == 'string':
-                    value = value.replace('&#38;', '&')
-                    value = value.replace('&#39;', '\'')
+                    value = decodeHtml(value)
             values[key] = value
     return values
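The call sites above assume a specific contract for oxutils.text.findRe, whose definition is not part of this diff. A minimal sketch of the behavior they rely on (first capture group of a DOTALL match, empty string on no match); the actual oxutils implementation may differ:

    import re

    def findRe(string, pattern):
        # hypothetical stand-in for oxutils.text.findRe:
        # return the first capture group of the first match, or ''
        results = re.compile(pattern, re.DOTALL).findall(string)
        if results:
            return results[0]
        return ''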
@@ -65,28 +66,28 @@ class ItunesAlbum:
     def getId(self):
         url = composeUrl('advancedSearch', {'title': self.title, 'artist': self.artist})
         xml = getUrl(url, None, ITUNES_HEADERS)
-        id = findString(xml, 'viewAlbum?id=', '&')
+        id = findRe(xml, 'viewAlbum\?id=(.*?)&')
         return id

     def getData(self):
         data = {'id': self.id}
         url = composeUrl('viewAlbum', {'id': self.id})
         xml = getUrl(url, None, ITUNES_HEADERS)
-        xml = findString(xml, '<View>')
-        data['albumName'] = findString(xml, '<B>', '<')
-        data['artistName'] = findString(xml, '<b>', '<')
-        data['coverUrl'] = findString(xml, 'reflection="1" url="', '"')
-        data['genre'] = findString(xml, 'Genre: ', '<')
-        data['releaseDate'] = findString(xml, 'Released', '<')
-        data['review'] = findString(findString(xml, 'REVIEW</b>'), '<SetFontStyle normalStyle="textColor">', '</SetFontStyle>')
+        data['albumName'] = findRe(xml, '<B>(.*?)</B>')
+        data['artistName'] = findRe(xml, '<b>(.*?)</b>')
+        data['coverUrl'] = findRe(xml, 'reflection="1" url="(.*?)"')
+        data['genre'] = findRe(xml, 'Genre:(.*?)<')
+        data['releaseDate'] = findRe(xml, 'Released(.*?)<')
+        data['review'] = stripTags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
         data['tracks'] = []
-        string = findString(findString(xml, '<key>items</key>', '</array>'), '<dict>')
-        strings = string.split('<dict>')
+        strings = findRe(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
         for string in strings:
             data['tracks'].append(parseXmlDict(string))
-        data['type'] = findString(xml, '<key>listType</key><string>', '<')
+        data['type'] = findRe(xml, '<key>listType</key><string>(.*?)<')
         return data

 if __name__ == '__main__':
-    test = ItunesAlbum('So Red the Rose', 'Arcadia')
-    print test.getData()
+    import simplejson
+    data = ItunesAlbum('So Red the Rose', 'Arcadia').getData()
+    print simplejson.dumps(data, sort_keys = True, indent = 4)
+    # print test.getData()

File 2 of 2:

@@ -26,7 +26,7 @@ def getNews(year, month, day):
     for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
         dateString = stripTags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
         try:
-            description = re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0].strip()
+            description = formatString(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
         except:
             description = ''
         try:
@@ -34,7 +34,7 @@ def getNews(year, month, day):
         except:
             imageUrl = ''
         try:
-            title = formatString(re.compile('title=[\'|"](.*?)[\'|"] />', re.DOTALL).findall(item)[0])
+            title = formatString(re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL).findall(item)[0]).replace(' : ', ': ').replace('::', ':')
         except:
             title = ''
         if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:
@@ -49,21 +49,30 @@ def getNews(year, month, day):
             new['imageUrl'] = imageUrl
             new['section'] = formatSection(section)
             new['title'] = formatString(title)
-            new['url'] = 'http://www.spiegel.de' + re.compile('<a href="(.*?)"').findall(item)[0]
+            new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(formatString(re.compile('<h4>(.*?)</h4>', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf')
+            if new['title1'][-1:] == ':':
+                new['title1'] = new['title1'][0:-1]
+            new['title2'] = new['title'][len(new['title1']) + 2:]
+            new['url'] = re.compile('<a href="(.*?)"').findall(item)[0]
+            if new['url'][:1] == '/':
+                new['url'] = 'http://www.spiegel.de' + new['url']
             news.append(new)
-            print dateString + ' - ok'
+            # print '%s, %s' % (new['section'], dateString)
+        '''
         elif dateString[:10] == date and not description:
             print dateString + ' - no description'
         elif dateString[:10] == date and not imageUrl:
             print dateString + ' - no image'
+        '''
     return news

 def splitTitle(title):
-    title0 = re.compile('(.*?): ').findall(title)[0]
-    title1 = re.compile(': (.*?)$').findall(title)[0]
-    return [title0, title1]
+    title1 = re.compile('(.*?): ').findall(title)[0]
+    title2 = re.compile(': (.*?)$').findall(title)[0]
+    return [title1, title2]

 def formatString(string):
+    string = string.replace('<span class="spOptiBreak"> </span>', '')
     string = string.replace('\n', ' ').replace('  ', ' ').strip()
     string = string.replace('&amp;', '&').replace('&apos;', '\'').replace('&quot;', '"')
     return string
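splitTitle divides a combined headline at the first ': ', matching the new title1/title2 fields built in getNews. A quick sketch with a made-up headline:

    print splitTitle('Formel 1: Hamilton gewinnt in Monaco')
    # ['Formel 1', 'Hamilton gewinnt in Monaco']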
@@ -72,6 +81,7 @@ def formatSection(string):
     return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')

 def formatSubsection(string):
+    # SPIEGEL, SPIEGEL special
     subsection = {
         'abi': 'Abi - und dann?',
         'formel1': 'Formel 1',
@@ -84,10 +94,10 @@ def formatSubsection(string):
         'wunderbar': 'wunderBAR'
     }
     if subsection.has_key(string):
-        return subsection[string]
+        return subsection[string].replace(u'\xc3', 'ae')
     return string[:1].upper() + string[1:]

-def getMagazine(year, week):
+def getIssue(year, week):
     coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
     if not oxutils.net.exists(coverUrl):
         return None
@@ -103,17 +113,18 @@ def getMagazine(year, week):
     pages = page + 2
     for page in range(1, pages + 10):
         url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
-        if oxutils.net.exists(url):
+        if oxutils.cache.exists(url):
            pageUrl[page] = url
         else:
             pageUrl[page] = ''
     return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}

-def archiveMagazines():
+def archiveIssues():
     '''
     this is just an example of an archiving application
     '''
+    p = {}
     import os
     import simplejson
     import time
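Per the return statement above, getIssue yields the page count, table of contents, and cover/page image URLs for one print issue, or None if no cover image exists for that week. A hedged usage sketch (week number made up):

    issue = getIssue(2008, 18)
    if issue:
        print issue['pages']        # total number of pages
        print issue['coverUrl']     # 312px cover scan
        print issue['pageUrl'][1]   # 205px scan of page 1, or '' if missing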
@@ -127,22 +138,22 @@ def archiveMagazines():
         else:
             wMax = 53
         for w in range(wMax, 0, -1):
-            print '%2d/%d' % (w, y)
-            magazine = getMagazine(y, w)
-            if magazine:
+            print 'getIssue(%d, %d)' % (y, w)
+            issue = getIssue(y, w)
+            if issue:
                 dirname = '%s/%d/%02d' % (archivePath, y, w)
                 if not os.path.exists(dirname):
                     os.makedirs(dirname)
                 filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w)
-                if not os.path.exists(filename) or True:
-                    data = simplejson.dumps(magazine, ensure_ascii = False)
+                if not os.path.exists(filename):
+                    data = simplejson.dumps(issue, ensure_ascii = False)
                     f = open(filename, 'w')
                     f.write(data)
                     f.close()
                 filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w)
-                if not os.path.exists(filename) or True:
+                if not os.path.exists(filename):
                     data = []
-                    for item in magazine['contents']:
+                    for item in issue['contents']:
                         data.append('%3d %s' % (item['page'], item['title']))
                     data = '\n'.join(data)
                     f = open(filename, 'w')
@@ -150,12 +161,12 @@ def archiveMagazines():
                     f.close()
                 filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
                 if not os.path.exists(filename):
-                    data = oxutils.cache.getUrl(magazine['coverUrl'])
+                    data = oxutils.cache.getUrl(issue['coverUrl'])
                     f = open(filename, 'w')
                     f.write(data)
                     f.close()
-                for page in magazine['pageUrl']:
-                    url = magazine['pageUrl'][page]
+                for page in issue['pageUrl']:
+                    url = issue['pageUrl'][page]
                     if url:
                         filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
                         if not os.path.exists(filename):
@@ -163,7 +174,17 @@ def archiveMagazines():
                             f = open(filename, 'w')
                             f.write(data)
                             f.close()
+                if not p:
+                    p = {'num': 1, 'sum': issue['pages'], 'min': issue['pages'], 'max': issue['pages']}
+                else:
+                    p['num'] += 1
+                    p['sum'] += issue['pages']
+                    if issue['pages'] < p['min']:
+                        p['min'] = issue['pages']
+                    if issue['pages'] > p['max']:
+                        p['max'] = issue['pages']
+                print p['min'], p['sum'] / p['num'], p['max']

 def archiveNews():
     '''
@@ -172,35 +193,45 @@ def archiveNews():
     import os
     import simplejson
     import time
+    count = {}
+    colon = []
     archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'
-    days = [0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
+    days = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
     localtime = time.localtime()
     year = int(time.strftime('%Y', localtime))
     month = int(time.strftime('%m', localtime))
-    day = int(time.strftime('%d', localtime))
+    day = int(time.strftime('%d', localtime)) - 1
     for y in range(year, 1999, -1):
         if y == year:
             mMax = month
         else:
-            mMax = m
+            mMax = 12
         for m in range(mMax, 0, -1):
             if y == year and m == month:
                 dMax = day
+            elif m == 2 and y % 4 == 0 and y % 400 != 0:
+                dMax = days[m] + 1
             else:
                 dMax = days[m]
             for d in range(dMax, 0, -1):
+                print 'getNews(%d, %d, %d)' % (y, m, d)
                 news = getNews(y, m ,d)
                 for new in news:
                     dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
                     if not os.path.exists(dirname):
                         os.makedirs(dirname)
-                    filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
+                    if new['url'][-5:] == '.html':
+                        filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
+                    else:
+                        filename = dirname + '/' + new['url'] + '.json'
                     if not os.path.exists(filename) or True:
                         data = simplejson.dumps(new, ensure_ascii = False)
                         f = open(filename, 'w')
                         f.write(data)
                         f.close()
-                    filename = filename[:-4] + 'txt'
+                    filename = filename[:-5] + '.txt'
                     if not os.path.exists(filename) or True:
                         data = splitTitle(new['title'])
                         data.append(new['description'])
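One note on the new February branch above: the condition y % 4 == 0 and y % 400 != 0 deviates from the full Gregorian rule, which also exempts century years not divisible by 400; within the 2000+ range iterated here it misclassifies 2000 (a leap year) as having 28 days. The standard check, for comparison:

    def isLeapYear(year):
        # every 4th year is a leap year, except century years
        # not divisible by 400 (2000 is a leap year, 1900 is not)
        return year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)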
@@ -215,6 +246,26 @@ def archiveNews():
                         f.write(data)
                         f.close()
+                    strings = new['url'].split('/')
+                    string = strings[3]
+                    if len(strings) == 6:
+                        string += '/' + strings[4]
+                    if not count.has_key(string):
+                        count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
+                    else:
+                        count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
+                    strings = splitTitle(new['title'])
+                    if strings[0] != new['title1'] or strings[1] != new['title2']:
+                        colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2']))
+    for key in sortDictByKey(count):
+        print '%6d %-24s %s' % (count[key]['count'], key, count[key]['string'])
+    for value in colon:
+        print value
+
+def sortDictByKey(d):
+    keys = d.keys()
+    keys.sort()
+    return keys

 if __name__ == '__main__':
     # spiegel = Spiegel(2008, 8)
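The sortDictByKey helper added in the hunk above just returns a dict's keys in sorted order so the per-section summary prints deterministically; since Python 2.4 the built-in sorted() does the same, so an equivalent loop would be:

    for key in sorted(count):
        print '%6d %-24s %s' % (count[key]['count'], key, count[key]['string'])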
@@ -235,5 +286,5 @@ if __name__ == '__main__':
         x.append(string)
     print x
     '''
-    archiveMagazines()
+    # archiveIssues()
     archiveNews()