merge changes from mars, itunes, spiegel

2008-05-07 11:50:09 +02:00 · 2008-05-07 11:50:09 +02:00 · 82a1dca6c3
commit 82a1dca6c3
parent be45b75845 d04877e1a2
2 changed files with 99 additions and 47 deletions
--- a/ox/itunes.py
+++ b/ox/itunes.py
@ -2,6 +2,8 @@ import re
 import urllib

 from oxutils.cache import getUrl
+from oxutils.html import decodeHtml, stripTags
+from oxutils.text import findRe
 from oxutils.text import findString

 # to sniff itunes traffic, use something like
@ -42,17 +44,16 @@ def parseXmlDict(xml):
  strings = xml.split('<key>')
  for string in strings:
    if string.find('</key>') != -1:
-      key = findString(string, '', '</key>')
-      type = findString(string, '</key><', '>')
+      key = findRe(string, '(.*?)</key>')
+      type = findRe(string, '</key><(.*?)>')
      if type == 'true/':
        value = True
      else:
-        value = findString(string, '<%s>' % type, '</%s>' % type)
+        value = findRe(string, '<%s>(.*?)</%s>' % (type, type))
        if type == 'integer':
          value = int(value)
        elif type == 'string':
-          value = value.replace('&#38;', '&')
-          value = value.replace('&#39;', '\'')
+          value = decodeHtml(value)
      values[key] = value
  return values

@ -65,28 +66,28 @@ class ItunesAlbum:
  def getId(self):
    url = composeUrl('advancedSearch', {'title': self.title, 'artist': self.artist})
    xml = getUrl(url, None, ITUNES_HEADERS)
-    id = findString(xml, 'viewAlbum?id=', '&')
+    id = findRe(xml, 'viewAlbum\?id=(.*?)&')
    return id

  def getData(self):
    data = {'id': self.id}
    url = composeUrl('viewAlbum', {'id': self.id})
    xml = getUrl(url, None, ITUNES_HEADERS)
-    xml = findString(xml, '<View>')
-    data['albumName'] = findString(xml, '<B>', '<')
-    data['artistName'] = findString(xml, '<b>', '<')
-    data['coverUrl'] = findString(xml, 'reflection="1" url="', '"')
-    data['genre'] = findString(xml, 'Genre: ', '<')
-    data['releaseDate'] = findString(xml, 'Released', '<')
-    data['review'] = findString(findString(xml, 'REVIEW</b>'), '<SetFontStyle normalStyle="textColor">', '</SetFontStyle>')
+    data['albumName'] = findRe(xml, '<B>(.*?)</B>')
+    data['artistName'] = findRe(xml, '<b>(.*?)</b>')
+    data['coverUrl'] = findRe(xml, 'reflection="1" url="(.*?)"')
+    data['genre'] = findRe(xml, 'Genre:(.*?)<')
+    data['releaseDate'] = findRe(xml, 'Released(.*?)<')
+    data['review'] = stripTags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
    data['tracks'] = []
-    string = findString(findString(xml, '<key>items</key>', '</array>'), '<dict>')
-    strings = string.split('<dict>')
+    strings = findRe(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
    for string in strings:
-        data['tracks'].append(parseXmlDict(string))
-    data['type'] = findString(xml, '<key>listType</key><string>', '<')
+      data['tracks'].append(parseXmlDict(string))
+    data['type'] = findRe(xml, '<key>listType</key><string>(.*?)<')
    return data

 if __name__ == '__main__':
-  test = ItunesAlbum('So Red the Rose', 'Arcadia')
-  print test.getData()
+  import simplejson
+  data = ItunesAlbum('So Red the Rose', 'Arcadia').getData()
+  print simplejson.dumps(data, sort_keys = True, indent = 4)
+  # print test.getData()
--- a/ox/spiegel.py
+++ b/ox/spiegel.py
@ -26,7 +26,7 @@ def getNews(year, month, day):
        for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
            dateString = stripTags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
            try:
-                description = re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0].strip()
+                description = formatString(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
            except:
                description = ''
            try:
@ -34,7 +34,7 @@ def getNews(year, month, day):
            except:
                imageUrl = ''
            try:
-                title = formatString(re.compile('title=[\'|"](.*?)[\'|"] />', re.DOTALL).findall(item)[0])
+                title = formatString(re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL).findall(item)[0]).replace(' : ', ': ').replace('::', ':')
            except:
                title = ''
            if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:
@ -49,21 +49,30 @@ def getNews(year, month, day):
                new['imageUrl'] = imageUrl
                new['section'] = formatSection(section)
                new['title'] = formatString(title)
-                new['url'] = 'http://www.spiegel.de' + re.compile('<a href="(.*?)"').findall(item)[0]
+                new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(formatString(re.compile('<h4>(.*?)</h4>', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf')
+                if new['title1'][-1:] == ':':
+                    new['title1'] = new['title1'][0:-1]
+                new['title2'] = new['title'][len(new['title1']) + 2:]
+                new['url'] = re.compile('<a href="(.*?)"').findall(item)[0]
+                if new['url'][:1] == '/':
+                    new['url'] = 'http://www.spiegel.de' + new['url']
                news.append(new)
-                print dateString + ' - ok'
+                # print '%s, %s' % (new['section'], dateString)
+            '''
            elif dateString[:10] == date and not description:
                print dateString + ' - no description'
            elif dateString[:10] == date and not imageUrl:
                print dateString + ' - no image'
+            '''
    return news

 def splitTitle(title):
-    title0 = re.compile('(.*?): ').findall(title)[0]
-    title1 = re.compile(': (.*?)$').findall(title)[0]
-    return [title0, title1]
+    title1 = re.compile('(.*?): ').findall(title)[0]
+    title2 = re.compile(': (.*?)$').findall(title)[0]
+    return [title1, title2]

 def formatString(string):
+    string = string.replace('<span class="spOptiBreak"> </span>', '')
    string = string.replace('\n', ' ').replace('  ', ' ').strip()
    string = string.replace('&amp;', '&').replace('&apos;', '\'').replace('&quot;', '"')
    return string
@ -72,6 +81,7 @@ def formatSection(string):
    return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')

 def formatSubsection(string):
+    # SPIEGEL, SPIEGEL special
    subsection = {
        'abi': 'Abi - und dann?',
        'formel1': 'Formel 1',
@ -84,10 +94,10 @@ def formatSubsection(string):
        'wunderbar': 'wunderBAR'
    }
    if subsection.has_key(string):
-        return subsection[string]
+        return subsection[string].replace(u'\xc3', 'ae')
    return string[:1].upper() + string[1:]
        
-def getMagazine(year, week):
+def getIssue(year, week):
    coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
    if not oxutils.net.exists(coverUrl):
        return None
@ -103,17 +113,18 @@ def getMagazine(year, week):
    pages = page + 2
    for page in range(1, pages + 10):
        url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
-        if oxutils.net.exists(url):
+        if oxutils.cache.exists(url):
            pageUrl[page] = url
        else:
            pageUrl[page] = ''
    return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}


-def archiveMagazines():
+def archiveIssues():
    '''
    this is just an example of an archiving application
    '''
+    p = {}
    import os
    import simplejson
    import time
@ -127,22 +138,22 @@ def archiveMagazines():
        else:
            wMax = 53
        for w in range(wMax, 0, -1):
-            print '%2d/%d' % (w, y)
-            magazine = getMagazine(y, w)
-            if magazine:
+            print 'getIssue(%d, %d)' % (y, w)
+            issue = getIssue(y, w)
+            if issue:
                dirname = '%s/%d/%02d' % (archivePath, y, w)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w)
-                if not os.path.exists(filename) or True:
-                    data = simplejson.dumps(magazine, ensure_ascii = False)
+                if not os.path.exists(filename):
+                    data = simplejson.dumps(issue, ensure_ascii = False)
                    f = open(filename, 'w')
                    f.write(data)
                    f.close()
                filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w)
-                if not os.path.exists(filename) or True:
+                if not os.path.exists(filename):
                    data = []
-                    for item in magazine['contents']:
+                    for item in issue['contents']:
                        data.append('%3d %s' % (item['page'], item['title']))
                    data = '\n'.join(data)
                    f = open(filename, 'w')
@ -150,12 +161,12 @@ def archiveMagazines():
                    f.close()
                filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
                if not os.path.exists(filename):
-                    data = oxutils.cache.getUrl(magazine['coverUrl'])
+                    data = oxutils.cache.getUrl(issue['coverUrl'])
                    f = open(filename, 'w')
                    f.write(data)
                    f.close()
-                for page in magazine['pageUrl']:
-                    url = magazine['pageUrl'][page]
+                for page in issue['pageUrl']:
+                    url = issue['pageUrl'][page]
                    if url:
                        filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
                        if not os.path.exists(filename):
@ -163,6 +174,16 @@ def archiveMagazines():
                            f = open(filename, 'w')
                            f.write(data)
                            f.close()
+                if not p:
+                    p = {'num': 1, 'sum': issue['pages'], 'min': issue['pages'], 'max': issue['pages']}
+                else:
+                    p['num'] += 1
+                    p['sum'] += issue['pages']
+                    if issue['pages'] < p['min']:
+                        p['min'] = issue['pages']
+                    if issue['pages'] > p['max']:
+                        p['max'] = issue['pages']
+                print p['min'], p['sum'] / p['num'], p['max']
            

 def archiveNews():
@ -172,35 +193,45 @@ def archiveNews():
    import os
    import simplejson
    import time
+
+    count = {}
+    colon = []
+
    archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'
-    days = [0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
+    days = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    localtime = time.localtime()
    year = int(time.strftime('%Y', localtime))
    month = int(time.strftime('%m', localtime))
-    day = int(time.strftime('%d', localtime))
+    day = int(time.strftime('%d', localtime)) - 1
    for y in range(year, 1999, -1):
        if y == year:
            mMax = month
        else:
-            mMax = m 
+            mMax = 12
        for m in range(mMax, 0, -1):
            if y == year and m == month:
                dMax = day
+            elif m == 2 and y % 4 == 0 and y % 400 != 0:
+                dMax = days[m] + 1
            else:
                dMax = days[m]
            for d in range(dMax, 0, -1):
+                print 'getNews(%d, %d, %d)' % (y, m, d)
                news = getNews(y, m ,d)
                for new in news:
                    dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)
-                    filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
+                    if new['url'][-5:] == '.html':
+                        filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
+                    else:
+                        filename = dirname + '/' + new['url'] + '.json'
                    if not os.path.exists(filename) or True:
                        data = simplejson.dumps(new, ensure_ascii = False)
                        f = open(filename, 'w')
                        f.write(data)
                        f.close()
-                    filename = filename[:-4] + 'txt'
+                    filename = filename[:-5] + '.txt'
                    if not os.path.exists(filename) or True:
                        data = splitTitle(new['title'])
                        data.append(new['description'])
@ -215,6 +246,26 @@ def archiveNews():
                        f.write(data)
                        f.close()

+                    strings = new['url'].split('/')
+                    string = strings[3]
+                    if len(strings) == 6:
+                        string += '/' + strings[4]
+                    if not count.has_key(string):
+                        count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
+                    else:
+                        count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
+                    strings = splitTitle(new['title'])
+                    if strings[0] != new['title1'] or strings[1] != new['title2']:
+                        colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2']))
+            for key in sortDictByKey(count):
+                print '%6d %-24s %s' % (count[key]['count'], key, count[key]['string'])
+            for value in colon:
+                print value
+
+def sortDictByKey(d):
+    keys = d.keys()
+    keys.sort()
+    return keys

 if __name__ == '__main__':
    # spiegel = Spiegel(2008, 8)
@ -235,5 +286,5 @@ if __name__ == '__main__':
                x.append(string)
        print x
    '''
-    archiveMagazines()
+    # archiveIssues()
    archiveNews()