diff --git a/oilarchive/controllers.py b/oilarchive/controllers.py index da3c60b..bd61745 100644 --- a/oilarchive/controllers.py +++ b/oilarchive/controllers.py @@ -70,6 +70,7 @@ class Admin: archiveUrl = data['archiveUrl'], ttl = int(data['ttl']), ) + new.setHashId() raise redirect('archives') @expose('.templates.admin_sortnames') diff --git a/oilarchive/cronjobs.py b/oilarchive/cronjobs.py index 402130c..3dfea62 100644 --- a/oilarchive/cronjobs.py +++ b/oilarchive/cronjobs.py @@ -19,8 +19,9 @@ def updateSortAuthorNames(): grab new input from archives ''' def spiderArchives(): - for archive in Archives.select(): + for archive in Archive.select(Archive.q.initialized == True): if archive.pubDate - datetime.now() < timedelta(minutes = archive.ttl): + print archive.archiveName archive.update() diff --git a/oilarchive/model.py b/oilarchive/model.py index f170db6..f9993f8 100644 --- a/oilarchive/model.py +++ b/oilarchive/model.py @@ -51,7 +51,7 @@ class ArchiveItem(SQLObject): authorSort = UnicodeCol(default = '') description = UnicodeCol() # text(for rss) html = UnicodeCol() #(for page, contains javascript) - text = UnicodeCol() #Fulltext + text = UnicodeCol(length = 2**25) #Fulltext relDate = DateTimeCol() #timestamp (item released) pubDate = DateTimeCol() #timestamp (item published) modDate = DateTimeCol() #timestamp (item published) @@ -137,6 +137,9 @@ class Archive(SQLObject): hashId = UnicodeCol(alternateID = True, length=128) + def setHashId(self): + self.hashId = md5.new("%s" % self.id).hexdigest() + def _get_pubDateTimestamp(self): if self.initialized: return int(time.mktime(self.pubDate.timetuple())) @@ -168,19 +171,24 @@ class Archive(SQLObject): def update(self): result = simplejson.loads(read_url(self.files_url)) - if result.has_key('css'): + if result and result.has_key('css'): self.css = read_url(self.full_url(result['css'])) else: self.css = '' - if result.has_key('js'): + if result and result.has_key('js'): self.js = read_url(self.full_url(result['js'])) else: self.js = '' result = simplejson.loads(read_url(self.update_url)) items = result.get('items', []) + print len(items) for id in items: - print "updating / adding ", id - data = jsonLoadArchiveItem(read_url(self.data_url(id))) + try: + data = read_url(self.data_url(id)) + data = jsonLoadArchiveItem(data) + except: + print "failed to load ", id, "from ", self.data_url(id) + continue q = ArchiveItem.select(AND( ArchiveItem.q.archiveItemId == id, ArchiveItem.q.archiveID == self.id)) diff --git a/start-oilarchive.py b/start-oilarchive.py index bc0c7de..c2bdae7 100755 --- a/start-oilarchive.py +++ b/start-oilarchive.py @@ -2,12 +2,13 @@ import pkg_resources pkg_resources.require("TurboGears") -from turbogears import update_config, start_server +from turbogears import update_config, start_server, scheduler import cherrypy cherrypy.lowercase_api = True from os.path import * import sys + # first look on the command line for a desired config file, # if it's not on the command line, then # look for setup.py in this directory. If it's not there, this script is @@ -21,5 +22,11 @@ else: update_config(configfile="prod.cfg",modulename="oilarchive.config") from oilarchive.controllers import Root +from oilarchive import cronjobs +scheduler.add_interval_task( + action=cronjobs.runCron, taskname='cronoil', + initialdelay=10, interval=60, + processmethod=scheduler.method.forked) + start_server(Root())