From d04877e1a2ceb9e50c7cc284fe15d9e86f6ddc11 Mon Sep 17 00:00:00 2001
From: Rolux <rolux@Rolux.local>
Date: Wed, 7 May 2008 10:15:25 +0200
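Subject: [PATCH] itunes.py: getting rid of findString()

itunes.py: replace the findString() calls in parseXmlDict() and
ItunesAlbum with findRe(), and use decodeHtml()/stripTags() from
oxutils.html for entity decoding and for cleaning up the review text.

spiegel.py: rename getMagazine()/archiveMagazines() to getIssue()/
archiveIssues(), add title1/title2 to the items returned by getNews(),
rework the month and leap-day iteration in archiveNews(), and print
page-count and per-section statistics while archiving.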
---
 ox/itunes.py  |  39 +++++++++---------
 ox/spiegel.py | 107 +++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 99 insertions(+), 47 deletions(-)
diff --git a/ox/itunes.py b/ox/itunes.py
index 7e94403..da9c6b3 100644
--- a/ox/itunes.py
+++ b/ox/itunes.py
@@ -2,6 +2,8 @@ import re
 import urllib
 
 from oxutils.cache import getUrl
+from oxutils.html import decodeHtml, stripTags
+from oxutils.text import findRe
 from oxutils.text import findString
 
 # to sniff itunes traffic, use something like
@@ -42,17 +44,16 @@ def parseXmlDict(xml):
   strings = xml.split('<key>')
   for string in strings:
     if string.find('</key>') != -1:
-      key = findString(string, '', '</key>')
-      type = findString(string, '</key><', '>')
+      key = findRe(string, '(.*?)</key>')
+      type = findRe(string, '</key><(.*?)>')
       if type == 'true/':
         value = True
       else:
-        value = findString(string, '<%s>' % type, '</%s>' % type)
+        value = findRe(string, '<%s>(.*?)</%s>' % (type, type))
         if type == 'integer':
           value = int(value)
         elif type == 'string':
-          value = value.replace('&#38;', '&')
-          value = value.replace('&#39;', '\'')
+          value = decodeHtml(value)
       values[key] = value
   return values
 
@@ -65,28 +66,28 @@ class ItunesAlbum:
   def getId(self):
     url = composeUrl('advancedSearch', {'title': self.title, 'artist': self.artist})
     xml = getUrl(url, None, ITUNES_HEADERS)
-    id = findString(xml, 'viewAlbum?id=', '&')
+    id = findRe(xml, 'viewAlbum\?id=(.*?)&')
     return id
 
   def getData(self):
     data = {'id': self.id}
     url = composeUrl('viewAlbum', {'id': self.id})
     xml = getUrl(url, None, ITUNES_HEADERS)
-    xml = findString(xml, '<View>')
-    data['albumName'] = findString(xml, '<B>', '<')
-    data['artistName'] = findString(xml, '<b>', '<')
-    data['coverUrl'] = findString(xml, 'reflection="1" url="', '"')
-    data['genre'] = findString(xml, 'Genre: ', '<')
-    data['releaseDate'] = findString(xml, 'Released', '<')
-    data['review'] = findString(findString(xml, 'REVIEW</b>'), '<SetFontStyle normalStyle="textColor">', '</SetFontStyle>')
+    data['albumName'] = findRe(xml, '<B>(.*?)</B>')
+    data['artistName'] = findRe(xml, '<b>(.*?)</b>')
+    data['coverUrl'] = findRe(xml, 'reflection="1" url="(.*?)"')
+    data['genre'] = findRe(xml, 'Genre:(.*?)<')
+    data['releaseDate'] = findRe(xml, 'Released(.*?)<')
+    data['review'] = stripTags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
     data['tracks'] = []
-    string = findString(findString(xml, '<key>items</key>', '</array>'), '<dict>')
-    strings = string.split('<dict>')
+    strings = findRe(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
     for string in strings:
-        data['tracks'].append(parseXmlDict(string))
-    data['type'] = findString(xml, '<key>listType</key><string>', '<')
+      data['tracks'].append(parseXmlDict(string))
+    data['type'] = findRe(xml, '<key>listType</key><string>(.*?)<')
     return data
 
 if __name__ == '__main__':
-  test = ItunesAlbum('So Red the Rose', 'Arcadia')
-  print test.getData()
\ No newline at end of file
+  import simplejson
+  data = ItunesAlbum('So Red the Rose', 'Arcadia').getData()
+  print simplejson.dumps(data, sort_keys = True, indent = 4)
+  # print test.getData()
\ No newline at end of file
diff --git a/ox/spiegel.py b/ox/spiegel.py
index 49481ef..492171e 100644
--- a/ox/spiegel.py
+++ b/ox/spiegel.py
@@ -26,7 +26,7 @@ def getNews(year, month, day):
         for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
             dateString = stripTags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
             try:
-                description = re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0].strip()
+                description = formatString(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
             except:
                 description = ''
             try:
@@ -34,7 +34,7 @@ def getNews(year, month, day):
             except:
                 imageUrl = ''
             try:
-                title = formatString(re.compile('title=[\'|"](.*?)[\'|"] />', re.DOTALL).findall(item)[0])
+                title = formatString(re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL).findall(item)[0]).replace(' : ', ': ').replace('::', ':')
             except:
                 title = ''
             if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:
@@ -49,21 +49,30 @@ def getNews(year, month, day):
                 new['imageUrl'] = imageUrl
                 new['section'] = formatSection(section)
                 new['title'] = formatString(title)
-                new['url'] = 'http://www.spiegel.de' + re.compile('<a href="(.*?)"').findall(item)[0]
+                new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(formatString(re.compile('<h4>(.*?)</h4>', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf')
+                if new['title1'][-1:] == ':':
+                    new['title1'] = new['title1'][0:-1]
+                new['title2'] = new['title'][len(new['title1']) + 2:]
+                new['url'] = re.compile('<a href="(.*?)"').findall(item)[0]
+                if new['url'][:1] == '/':
+                    new['url'] = 'http://www.spiegel.de' + new['url']
                 news.append(new)
-                print dateString + ' - ok'
+                # print '%s, %s' % (new['section'], dateString)
+            '''
             elif dateString[:10] == date and not description:
                 print dateString + ' - no description'
             elif dateString[:10] == date and not imageUrl:
                 print dateString + ' - no image'
+            '''
     return news
 
 def splitTitle(title):
-    title0 = re.compile('(.*?): ').findall(title)[0]
-    title1 = re.compile(': (.*?)$').findall(title)[0]
-    return [title0, title1]
+    title1 = re.compile('(.*?): ').findall(title)[0]
+    title2 = re.compile(': (.*?)$').findall(title)[0]
+    return [title1, title2]
 
 def formatString(string):
+    string = string.replace('<span class="spOptiBreak"> </span>', '')
     string = string.replace('\n', ' ').replace('  ', ' ').strip()
     string = string.replace('&amp;', '&').replace('&apos;', '\'').replace('&quot;', '"')
     return string
@@ -72,6 +81,7 @@ def formatSection(string):
     return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')
 
 def formatSubsection(string):
+    # SPIEGEL, SPIEGEL special
     subsection = {
         'abi': 'Abi - und dann?',
         'formel1': 'Formel 1',
@@ -84,10 +94,10 @@ def formatSubsection(string):
         'wunderbar': 'wunderBAR'
     }
     if subsection.has_key(string):
-        return subsection[string]
+        return subsection[string].replace(u'\xc3', 'ae')
     return string[:1].upper() + string[1:]
         
-def getMagazine(year, week):
+def getIssue(year, week):
     coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
     if not oxutils.net.exists(coverUrl):
         return None
@@ -103,17 +113,18 @@ def getMagazine(year, week):
     pages = page + 2
     for page in range(1, pages + 10):
         url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
-        if oxutils.net.exists(url):
+        if oxutils.cache.exists(url):
             pageUrl[page] = url
         else:
             pageUrl[page] = ''
     return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}
 
 
-def archiveMagazines():
+def archiveIssues():
     '''
     this is just an example of an archiving application
     '''
+    p = {}
     import os
     import simplejson
     import time
@@ -127,22 +138,22 @@ def archiveMagazines():
         else:
             wMax = 53
         for w in range(wMax, 0, -1):
-            print '%2d/%d' % (w, y)
-            magazine = getMagazine(y, w)
-            if magazine:
+            print 'getIssue(%d, %d)' % (y, w)
+            issue = getIssue(y, w)
+            if issue:
                 dirname = '%s/%d/%02d' % (archivePath, y, w)
                 if not os.path.exists(dirname):
                     os.makedirs(dirname)
                 filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w)
-                if not os.path.exists(filename) or True:
-                    data = simplejson.dumps(magazine, ensure_ascii = False)
+                if not os.path.exists(filename):
+                    data = simplejson.dumps(issue, ensure_ascii = False)
                     f = open(filename, 'w')
                     f.write(data)
                     f.close()
                 filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w)
-                if not os.path.exists(filename) or True:
+                if not os.path.exists(filename):
                     data = []
-                    for item in magazine['contents']:
+                    for item in issue['contents']:
                         data.append('%3d %s' % (item['page'], item['title']))
                     data = '\n'.join(data)
                     f = open(filename, 'w')
@@ -150,12 +161,12 @@ def archiveMagazines():
                     f.close()
                 filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
                 if not os.path.exists(filename):
-                    data = oxutils.cache.getUrl(magazine['coverUrl'])
+                    data = oxutils.cache.getUrl(issue['coverUrl'])
                     f = open(filename, 'w')
                     f.write(data)
                     f.close()
-                for page in magazine['pageUrl']:
-                    url = magazine['pageUrl'][page]
+                for page in issue['pageUrl']:
+                    url = issue['pageUrl'][page]
                     if url:
                         filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
                         if not os.path.exists(filename):
@@ -163,7 +174,17 @@ def archiveMagazines():
                             f = open(filename, 'w')
                             f.write(data)
                             f.close()
-                
+                if not p:
+                    p = {'num': 1, 'sum': issue['pages'], 'min': issue['pages'], 'max': issue['pages']}
+                else:
+                    p['num'] += 1
+                    p['sum'] += issue['pages']
+                    if issue['pages'] < p['min']:
+                        p['min'] = issue['pages']
+                    if issue['pages'] > p['max']:
+                        p['max'] = issue['pages']
+                print p['min'], p['sum'] / p['num'], p['max']
+            
 
 def archiveNews():
     '''
@@ -172,35 +193,45 @@ def archiveNews():
     import os
     import simplejson
     import time
+
+    count = {}
+    colon = []
+
     archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'
-    days = [0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
+    days = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
     localtime = time.localtime()
     year = int(time.strftime('%Y', localtime))
     month = int(time.strftime('%m', localtime))
-    day = int(time.strftime('%d', localtime))
+    day = int(time.strftime('%d', localtime)) - 1
     for y in range(year, 1999, -1):
         if y == year:
             mMax = month
         else:
-            mMax = m 
+            mMax = 12
         for m in range(mMax, 0, -1):
             if y == year and m == month:
                 dMax = day
+            elif m == 2 and y % 4 == 0 and (y % 100 != 0 or y % 400 == 0):
+                dMax = days[m] + 1  # Gregorian leap year (incl. 2000): February has 29 days
             else:
                 dMax = days[m]
             for d in range(dMax, 0, -1):
+                print 'getNews(%d, %d, %d)' % (y, m, d)
                 news = getNews(y, m ,d)
                 for new in news:
                     dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
                     if not os.path.exists(dirname):
                         os.makedirs(dirname)
-                    filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
+                    if new['url'][-5:] == '.html':
+                        filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
+                    else:
+                        filename = dirname + '/' + new['url'].rstrip('/').split('/')[-1] + '.json'  # last URL segment only: a full URL contains '/' and would name missing directories
                     if not os.path.exists(filename) or True:
                         data = simplejson.dumps(new, ensure_ascii = False)
                         f = open(filename, 'w')
                         f.write(data)
                         f.close()
-                    filename = filename[:-4] + 'txt'
+                    filename = filename[:-5] + '.txt'
                     if not os.path.exists(filename) or True:
                         data = splitTitle(new['title'])
                         data.append(new['description'])
@@ -215,6 +246,26 @@ def archiveNews():
                         f.write(data)
                         f.close()
 
+                    strings = new['url'].split('/')
+                    string = strings[3]
+                    if len(strings) == 6:
+                        string += '/' + strings[4]
+                    if not count.has_key(string):
+                        count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
+                    else:
+                        count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
+                    strings = splitTitle(new['title'])
+                    if strings[0] != new['title1'] or strings[1] != new['title2']:
+                        colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2']))
+            for key in sortDictByKey(count):
+                print '%6d %-24s %s' % (count[key]['count'], key, count[key]['string'])
+            for value in colon:
+                print value
+
+def sortDictByKey(d):
+    keys = d.keys()
+    keys.sort()
+    return keys
 
 if __name__ == '__main__':
     # spiegel = Spiegel(2008, 8)
@@ -235,5 +286,5 @@ if __name__ == '__main__':
                 x.append(string)
         print x
     '''
-    archiveMagazines()
+    # archiveIssues()
     archiveNews()
\ No newline at end of file