itunes.py: getting rid of findString()
This commit is contained in:
parent
1b93ae048d
commit
d04877e1a2
2 changed files with 99 additions and 47 deletions
39
ox/itunes.py
39
ox/itunes.py
|
@ -2,6 +2,8 @@ import re
|
||||||
import urllib
|
import urllib
|
||||||
|
|
||||||
from oxutils.cache import getUrl
|
from oxutils.cache import getUrl
|
||||||
|
from oxutils.html import decodeHtml, stripTags
|
||||||
|
from oxutils.text import findRe
|
||||||
from oxutils.text import findString
|
from oxutils.text import findString
|
||||||
|
|
||||||
# to sniff itunes traffic, use something like
|
# to sniff itunes traffic, use something like
|
||||||
|
@ -42,17 +44,16 @@ def parseXmlDict(xml):
|
||||||
strings = xml.split('<key>')
|
strings = xml.split('<key>')
|
||||||
for string in strings:
|
for string in strings:
|
||||||
if string.find('</key>') != -1:
|
if string.find('</key>') != -1:
|
||||||
key = findString(string, '', '</key>')
|
key = findRe(string, '(.*?)</key>')
|
||||||
type = findString(string, '</key><', '>')
|
type = findRe(string, '</key><(.*?)>')
|
||||||
if type == 'true/':
|
if type == 'true/':
|
||||||
value = True
|
value = True
|
||||||
else:
|
else:
|
||||||
value = findString(string, '<%s>' % type, '</%s>' % type)
|
value = findRe(string, '<%s>(.*?)</%s>' % (type, type))
|
||||||
if type == 'integer':
|
if type == 'integer':
|
||||||
value = int(value)
|
value = int(value)
|
||||||
elif type == 'string':
|
elif type == 'string':
|
||||||
value = value.replace('&amp;', '&')
|
value = decodeHtml(value)
|
||||||
value = value.replace('&apos;', '\'')
|
|
||||||
values[key] = value
|
values[key] = value
|
||||||
return values
|
return values
|
||||||
|
|
||||||
|
@ -65,28 +66,28 @@ class ItunesAlbum:
|
||||||
def getId(self):
|
def getId(self):
|
||||||
url = composeUrl('advancedSearch', {'title': self.title, 'artist': self.artist})
|
url = composeUrl('advancedSearch', {'title': self.title, 'artist': self.artist})
|
||||||
xml = getUrl(url, None, ITUNES_HEADERS)
|
xml = getUrl(url, None, ITUNES_HEADERS)
|
||||||
id = findString(xml, 'viewAlbum?id=', '&')
|
id = findRe(xml, 'viewAlbum\?id=(.*?)&')
|
||||||
return id
|
return id
|
||||||
|
|
||||||
def getData(self):
|
def getData(self):
|
||||||
data = {'id': self.id}
|
data = {'id': self.id}
|
||||||
url = composeUrl('viewAlbum', {'id': self.id})
|
url = composeUrl('viewAlbum', {'id': self.id})
|
||||||
xml = getUrl(url, None, ITUNES_HEADERS)
|
xml = getUrl(url, None, ITUNES_HEADERS)
|
||||||
xml = findString(xml, '<View>')
|
data['albumName'] = findRe(xml, '<B>(.*?)</B>')
|
||||||
data['albumName'] = findString(xml, '<B>', '<')
|
data['artistName'] = findRe(xml, '<b>(.*?)</b>')
|
||||||
data['artistName'] = findString(xml, '<b>', '<')
|
data['coverUrl'] = findRe(xml, 'reflection="1" url="(.*?)"')
|
||||||
data['coverUrl'] = findString(xml, 'reflection="1" url="', '"')
|
data['genre'] = findRe(xml, 'Genre:(.*?)<')
|
||||||
data['genre'] = findString(xml, 'Genre: ', '<')
|
data['releaseDate'] = findRe(xml, 'Released(.*?)<')
|
||||||
data['releaseDate'] = findString(xml, 'Released', '<')
|
data['review'] = stripTags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
|
||||||
data['review'] = findString(findString(xml, 'REVIEW</b>'), '<SetFontStyle normalStyle="textColor">', '</SetFontStyle>')
|
|
||||||
data['tracks'] = []
|
data['tracks'] = []
|
||||||
string = findString(findString(xml, '<key>items</key>', '</array>'), '<dict>')
|
strings = findRe(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
|
||||||
strings = string.split('<dict>')
|
|
||||||
for string in strings:
|
for string in strings:
|
||||||
data['tracks'].append(parseXmlDict(string))
|
data['tracks'].append(parseXmlDict(string))
|
||||||
data['type'] = findString(xml, '<key>listType</key><string>', '<')
|
data['type'] = findRe(xml, '<key>listType</key><string>(.*?)<')
|
||||||
return data
|
return data
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
test = ItunesAlbum('So Red the Rose', 'Arcadia')
|
import simplejson
|
||||||
print test.getData()
|
data = ItunesAlbum('So Red the Rose', 'Arcadia').getData()
|
||||||
|
print simplejson.dumps(data, sort_keys = True, indent = 4)
|
||||||
|
# print test.getData()
|
105
ox/spiegel.py
105
ox/spiegel.py
|
@ -26,7 +26,7 @@ def getNews(year, month, day):
|
||||||
for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
|
for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
|
||||||
dateString = stripTags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
|
dateString = stripTags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
|
||||||
try:
|
try:
|
||||||
description = re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0].strip()
|
description = formatString(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
|
||||||
except:
|
except:
|
||||||
description = ''
|
description = ''
|
||||||
try:
|
try:
|
||||||
|
@ -34,7 +34,7 @@ def getNews(year, month, day):
|
||||||
except:
|
except:
|
||||||
imageUrl = ''
|
imageUrl = ''
|
||||||
try:
|
try:
|
||||||
title = formatString(re.compile('title=[\'|"](.*?)[\'|"] />', re.DOTALL).findall(item)[0])
|
title = formatString(re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL).findall(item)[0]).replace(' : ', ': ').replace('::', ':')
|
||||||
except:
|
except:
|
||||||
title = ''
|
title = ''
|
||||||
if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:
|
if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:
|
||||||
|
@ -49,21 +49,30 @@ def getNews(year, month, day):
|
||||||
new['imageUrl'] = imageUrl
|
new['imageUrl'] = imageUrl
|
||||||
new['section'] = formatSection(section)
|
new['section'] = formatSection(section)
|
||||||
new['title'] = formatString(title)
|
new['title'] = formatString(title)
|
||||||
new['url'] = 'http://www.spiegel.de' + re.compile('<a href="(.*?)"').findall(item)[0]
|
new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(formatString(re.compile('<h4>(.*?)</h4>', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf')
|
||||||
|
if new['title1'][-1:] == ':':
|
||||||
|
new['title1'] = new['title1'][0:-1]
|
||||||
|
new['title2'] = new['title'][len(new['title1']) + 2:]
|
||||||
|
new['url'] = re.compile('<a href="(.*?)"').findall(item)[0]
|
||||||
|
if new['url'][:1] == '/':
|
||||||
|
new['url'] = 'http://www.spiegel.de' + new['url']
|
||||||
news.append(new)
|
news.append(new)
|
||||||
print dateString + ' - ok'
|
# print '%s, %s' % (new['section'], dateString)
|
||||||
|
'''
|
||||||
elif dateString[:10] == date and not description:
|
elif dateString[:10] == date and not description:
|
||||||
print dateString + ' - no description'
|
print dateString + ' - no description'
|
||||||
elif dateString[:10] == date and not imageUrl:
|
elif dateString[:10] == date and not imageUrl:
|
||||||
print dateString + ' - no image'
|
print dateString + ' - no image'
|
||||||
|
'''
|
||||||
return news
|
return news
|
||||||
|
|
||||||
def splitTitle(title):
|
def splitTitle(title):
|
||||||
title0 = re.compile('(.*?): ').findall(title)[0]
|
title1 = re.compile('(.*?): ').findall(title)[0]
|
||||||
title1 = re.compile(': (.*?)$').findall(title)[0]
|
title2 = re.compile(': (.*?)$').findall(title)[0]
|
||||||
return [title0, title1]
|
return [title1, title2]
|
||||||
|
|
||||||
def formatString(string):
|
def formatString(string):
|
||||||
|
string = string.replace('<span class="spOptiBreak"> </span>', '')
|
||||||
string = string.replace('\n', ' ').replace('  ', ' ').strip()
|
string = string.replace('\n', ' ').replace('  ', ' ').strip()
|
||||||
string = string.replace('&amp;', '&').replace('&apos;', '\'').replace('&quot;', '"')
|
string = string.replace('&amp;', '&').replace('&apos;', '\'').replace('&quot;', '"')
|
||||||
return string
|
return string
|
||||||
|
@ -72,6 +81,7 @@ def formatSection(string):
|
||||||
return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')
|
return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')
|
||||||
|
|
||||||
def formatSubsection(string):
|
def formatSubsection(string):
|
||||||
|
# SPIEGEL, SPIEGEL special
|
||||||
subsection = {
|
subsection = {
|
||||||
'abi': 'Abi - und dann?',
|
'abi': 'Abi - und dann?',
|
||||||
'formel1': 'Formel 1',
|
'formel1': 'Formel 1',
|
||||||
|
@ -84,10 +94,10 @@ def formatSubsection(string):
|
||||||
'wunderbar': 'wunderBAR'
|
'wunderbar': 'wunderBAR'
|
||||||
}
|
}
|
||||||
if subsection.has_key(string):
|
if subsection.has_key(string):
|
||||||
return subsection[string]
|
return subsection[string].replace(u'\xc3', 'ae')
|
||||||
return string[:1].upper() + string[1:]
|
return string[:1].upper() + string[1:]
|
||||||
|
|
||||||
def getMagazine(year, week):
|
def getIssue(year, week):
|
||||||
coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
|
coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
|
||||||
if not oxutils.net.exists(coverUrl):
|
if not oxutils.net.exists(coverUrl):
|
||||||
return None
|
return None
|
||||||
|
@ -103,17 +113,18 @@ def getMagazine(year, week):
|
||||||
pages = page + 2
|
pages = page + 2
|
||||||
for page in range(1, pages + 10):
|
for page in range(1, pages + 10):
|
||||||
url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
|
url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
|
||||||
if oxutils.net.exists(url):
|
if oxutils.cache.exists(url):
|
||||||
pageUrl[page] = url
|
pageUrl[page] = url
|
||||||
else:
|
else:
|
||||||
pageUrl[page] = ''
|
pageUrl[page] = ''
|
||||||
return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}
|
return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}
|
||||||
|
|
||||||
|
|
||||||
def archiveMagazines():
|
def archiveIssues():
|
||||||
'''
|
'''
|
||||||
this is just an example of an archiving application
|
this is just an example of an archiving application
|
||||||
'''
|
'''
|
||||||
|
p = {}
|
||||||
import os
|
import os
|
||||||
import simplejson
|
import simplejson
|
||||||
import time
|
import time
|
||||||
|
@ -127,22 +138,22 @@ def archiveMagazines():
|
||||||
else:
|
else:
|
||||||
wMax = 53
|
wMax = 53
|
||||||
for w in range(wMax, 0, -1):
|
for w in range(wMax, 0, -1):
|
||||||
print '%2d/%d' % (w, y)
|
print 'getIssue(%d, %d)' % (y, w)
|
||||||
magazine = getMagazine(y, w)
|
issue = getIssue(y, w)
|
||||||
if magazine:
|
if issue:
|
||||||
dirname = '%s/%d/%02d' % (archivePath, y, w)
|
dirname = '%s/%d/%02d' % (archivePath, y, w)
|
||||||
if not os.path.exists(dirname):
|
if not os.path.exists(dirname):
|
||||||
os.makedirs(dirname)
|
os.makedirs(dirname)
|
||||||
filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w)
|
filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w)
|
||||||
if not os.path.exists(filename) or True:
|
if not os.path.exists(filename):
|
||||||
data = simplejson.dumps(magazine, ensure_ascii = False)
|
data = simplejson.dumps(issue, ensure_ascii = False)
|
||||||
f = open(filename, 'w')
|
f = open(filename, 'w')
|
||||||
f.write(data)
|
f.write(data)
|
||||||
f.close()
|
f.close()
|
||||||
filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w)
|
filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w)
|
||||||
if not os.path.exists(filename) or True:
|
if not os.path.exists(filename):
|
||||||
data = []
|
data = []
|
||||||
for item in magazine['contents']:
|
for item in issue['contents']:
|
||||||
data.append('%3d %s' % (item['page'], item['title']))
|
data.append('%3d %s' % (item['page'], item['title']))
|
||||||
data = '\n'.join(data)
|
data = '\n'.join(data)
|
||||||
f = open(filename, 'w')
|
f = open(filename, 'w')
|
||||||
|
@ -150,12 +161,12 @@ def archiveMagazines():
|
||||||
f.close()
|
f.close()
|
||||||
filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
|
filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
|
||||||
if not os.path.exists(filename):
|
if not os.path.exists(filename):
|
||||||
data = oxutils.cache.getUrl(magazine['coverUrl'])
|
data = oxutils.cache.getUrl(issue['coverUrl'])
|
||||||
f = open(filename, 'w')
|
f = open(filename, 'w')
|
||||||
f.write(data)
|
f.write(data)
|
||||||
f.close()
|
f.close()
|
||||||
for page in magazine['pageUrl']:
|
for page in issue['pageUrl']:
|
||||||
url = magazine['pageUrl'][page]
|
url = issue['pageUrl'][page]
|
||||||
if url:
|
if url:
|
||||||
filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
|
filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
|
||||||
if not os.path.exists(filename):
|
if not os.path.exists(filename):
|
||||||
|
@ -163,6 +174,16 @@ def archiveMagazines():
|
||||||
f = open(filename, 'w')
|
f = open(filename, 'w')
|
||||||
f.write(data)
|
f.write(data)
|
||||||
f.close()
|
f.close()
|
||||||
|
if not p:
|
||||||
|
p = {'num': 1, 'sum': issue['pages'], 'min': issue['pages'], 'max': issue['pages']}
|
||||||
|
else:
|
||||||
|
p['num'] += 1
|
||||||
|
p['sum'] += issue['pages']
|
||||||
|
if issue['pages'] < p['min']:
|
||||||
|
p['min'] = issue['pages']
|
||||||
|
if issue['pages'] > p['max']:
|
||||||
|
p['max'] = issue['pages']
|
||||||
|
print p['min'], p['sum'] / p['num'], p['max']
|
||||||
|
|
||||||
|
|
||||||
def archiveNews():
|
def archiveNews():
|
||||||
|
@ -172,35 +193,45 @@ def archiveNews():
|
||||||
import os
|
import os
|
||||||
import simplejson
|
import simplejson
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
count = {}
|
||||||
|
colon = []
|
||||||
|
|
||||||
archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'
|
archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'
|
||||||
days = [0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
|
days = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
|
||||||
localtime = time.localtime()
|
localtime = time.localtime()
|
||||||
year = int(time.strftime('%Y', localtime))
|
year = int(time.strftime('%Y', localtime))
|
||||||
month = int(time.strftime('%m', localtime))
|
month = int(time.strftime('%m', localtime))
|
||||||
day = int(time.strftime('%d', localtime))
|
day = int(time.strftime('%d', localtime)) - 1
|
||||||
for y in range(year, 1999, -1):
|
for y in range(year, 1999, -1):
|
||||||
if y == year:
|
if y == year:
|
||||||
mMax = month
|
mMax = month
|
||||||
else:
|
else:
|
||||||
mMax = m
|
mMax = 12
|
||||||
for m in range(mMax, 0, -1):
|
for m in range(mMax, 0, -1):
|
||||||
if y == year and m == month:
|
if y == year and m == month:
|
||||||
dMax = day
|
dMax = day
|
||||||
|
elif m == 2 and y % 4 == 0 and y % 400 != 0:
|
||||||
|
dMax = days[m] + 1
|
||||||
else:
|
else:
|
||||||
dMax = days[m]
|
dMax = days[m]
|
||||||
for d in range(dMax, 0, -1):
|
for d in range(dMax, 0, -1):
|
||||||
|
print 'getNews(%d, %d, %d)' % (y, m, d)
|
||||||
news = getNews(y, m ,d)
|
news = getNews(y, m ,d)
|
||||||
for new in news:
|
for new in news:
|
||||||
dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
|
dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
|
||||||
if not os.path.exists(dirname):
|
if not os.path.exists(dirname):
|
||||||
os.makedirs(dirname)
|
os.makedirs(dirname)
|
||||||
filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
|
if new['url'][-5:] == '.html':
|
||||||
|
filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
|
||||||
|
else:
|
||||||
|
filename = dirname + '/' + new['url'] + '.json'
|
||||||
if not os.path.exists(filename) or True:
|
if not os.path.exists(filename) or True:
|
||||||
data = simplejson.dumps(new, ensure_ascii = False)
|
data = simplejson.dumps(new, ensure_ascii = False)
|
||||||
f = open(filename, 'w')
|
f = open(filename, 'w')
|
||||||
f.write(data)
|
f.write(data)
|
||||||
f.close()
|
f.close()
|
||||||
filename = filename[:-4] + 'txt'
|
filename = filename[:-5] + '.txt'
|
||||||
if not os.path.exists(filename) or True:
|
if not os.path.exists(filename) or True:
|
||||||
data = splitTitle(new['title'])
|
data = splitTitle(new['title'])
|
||||||
data.append(new['description'])
|
data.append(new['description'])
|
||||||
|
@ -215,6 +246,26 @@ def archiveNews():
|
||||||
f.write(data)
|
f.write(data)
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
|
strings = new['url'].split('/')
|
||||||
|
string = strings[3]
|
||||||
|
if len(strings) == 6:
|
||||||
|
string += '/' + strings[4]
|
||||||
|
if not count.has_key(string):
|
||||||
|
count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
|
||||||
|
else:
|
||||||
|
count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
|
||||||
|
strings = splitTitle(new['title'])
|
||||||
|
if strings[0] != new['title1'] or strings[1] != new['title2']:
|
||||||
|
colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2']))
|
||||||
|
for key in sortDictByKey(count):
|
||||||
|
print '%6d %-24s %s' % (count[key]['count'], key, count[key]['string'])
|
||||||
|
for value in colon:
|
||||||
|
print value
|
||||||
|
|
||||||
|
def sortDictByKey(d):
|
||||||
|
keys = d.keys()
|
||||||
|
keys.sort()
|
||||||
|
return keys
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# spiegel = Spiegel(2008, 8)
|
# spiegel = Spiegel(2008, 8)
|
||||||
|
@ -235,5 +286,5 @@ if __name__ == '__main__':
|
||||||
x.append(string)
|
x.append(string)
|
||||||
print x
|
print x
|
||||||
'''
|
'''
|
||||||
archiveMagazines()
|
# archiveIssues()
|
||||||
archiveNews()
|
archiveNews()
|
Loading…
Reference in a new issue