- spider can read archives now

- items are indexed and queryArchive sort of works
  items get a score element
- port some sorting / session things from oxdb
- transparent PNG reflections
j 2007-03-28 21:26:58 +00:00
commit 0d3592374d
8 changed files with 408 additions and 65 deletions

@@ -3,61 +3,112 @@
# vi:si:et:sw=2:sts=2:ts=2
from datetime import datetime
import time
from urllib import quote
import md5
from turbogears.database import PackageHub
from sqlobject import *
from turbogears import identity
from turbojson.jsonify import jsonify_sqlobject
import MySQLdb
from scrapeit.utils import read_url
import simplejson
from oilspider import jsonLoadArchiveItem, jsonImportArchiveItem
hub = PackageHub("oilarchive")
__connection__ = hub
def queryArchive(query, orderBy="score", offset = 0, count = 100):
  #fulltext query against the MySQL index on (title, description, text),
  #relevance is returned as "score" for each row
  query = MySQLdb.escape_string(query)
  match = "MATCH (title, description, text) AGAINST ('%s')" % query
  sql = """SELECT id, %s AS score FROM archive_item
    WHERE %s ORDER BY %s""" % \
    (match, match, orderBy)
  result = []
  matches = ArchiveItem._connection.queryAll(sql)
  #offset/count paging is applied in python for now
  matches = matches[offset:]
  if len(matches) > count:
    matches = matches[:count]
  for m in matches:
    item = ArchiveItem.get(m[0])
    item.score = m[1]
    result.append(item)
  return result
class ArchiveItem(SQLObject):
  hashId = UnicodeCol(alternateID = True, length=128)
  archiveItemId = UnicodeCol() #id of the item in the remote archive
  icon = UnicodeCol() # -> url (128x128)
  title = UnicodeCol()
  titleSort = UnicodeCol(default = '')
  author = UnicodeCol()
  authorSort = UnicodeCol(default = '')
  description = UnicodeCol() #text (for rss)
  html = UnicodeCol() #(for page, contains javascript)
  text = UnicodeCol() #fulltext
  relDate = DateTimeCol() #timestamp (item released)
  pubDate = DateTimeCol() #timestamp (item published)
  modDate = DateTimeCol() #timestamp (item modified)
  archiveUrl = UnicodeCol() # -> url (link to archive page)
  downloadUrl = UnicodeCol() # -> url (link to item)
  size = IntCol() #bytes
  rights = IntCol(default = 5) #-> int: 0 (free) - 5 (unfree)
  itemType = UnicodeCol() #string (Text, Pictures, Music, Movies, Software)
  genre = UnicodeCol(default = '')
  archive = ForeignKey('Archive')
  created = DateTimeCol(default=datetime.now)
  #score is only available if loaded via queryArchive
  score = -1
  #Fulltext search index:
  #ALTER TABLE archive_item ADD FULLTEXT (title, description, text);
  def _set_author(self, value):
    self._SO_set_author(value)
    if not self.authorSort:
      self.authorSort = value
  def _get_year(self):
    return self.relDate.strftime('%Y')
  def _get_json(self):
    result = jsonify_sqlobject(self)
    result['relDate'] = self.relDate.strftime('%s')
    result['pubDate'] = self.pubDate.strftime('%s')
    return result
    '''
    return dict(
      title = self.title,
      description = self.description,
      html = self.html,
      text = self.text,
      author = self.author,
      url = self.url,
      archiveUrl = self.archiveUrl,
      downloadUrl = self.downloadUrl,
      icon = '/view/%s/icon.png' % self.hash,
      relDate = self.relDate.strftime('%s'),
      pubDate = self.pubDate.strftime('%s'),
      size = self.size,
    )
    '''
  def update(self, data):
    for key in data:
      setattr(self, key, data[key])
    self.updateHashID()
  def updateHashID(self):
    salt = '%s/%s/%s' % (self.archive.archiveName, self.author, self.title)
    self.hashId = md5.new(salt).hexdigest()
class Archive(SQLObject):
@@ -66,27 +117,33 @@ class Archive(SQLObject):
  archiveType = UnicodeCol(default=u'')
  ttl = IntCol(default = 15)
  pubDate = DateTimeCol(default=datetime.now)
  modDate = DateTimeCol(default=datetime.now)
  created = DateTimeCol(default=datetime.now)
  def _get_pubDateTimestamp(self):
    return int(time.mktime(self.pubDate.timetuple()))
  def _query_url(self, query):
    url = "%s?" % self.archiveUrl
    url += "&".join(["%s=%s" % (key, quote("%s" % query[key])) for key in query])
    return url
  def _get_update_url(self):
    return self._query_url({'modDate': self.pubDateTimestamp})
  def data_url(self, id):
    return self._query_url({'id': id})
  def update(self):
    result = simplejson.loads(read_url(self.update_url))
    items = result.get('items', [])
    for id in items:
      data = jsonLoadArchiveItem(read_url(self.data_url(id)))
      q = ArchiveItem.select(AND(
        ArchiveItem.q.archiveItemId == id,
        ArchiveItem.q.archiveID == self.id))
      if q.count() == 0:
        jsonImportArchiveItem(self, id, data)
      else:
        q[0].update(data)
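
A minimal usage sketch, not part of the commit: it assumes the model above lives in the oilarchive package's model module and that Archive has archiveName / archiveUrl columns in the part of the class outside this hunk; the archive record and the query string below are invented for illustration.

# hypothetical usage of the model above; values marked as assumed are not in the diff
from oilarchive.model import Archive, queryArchive  # assumed module path

# register a remote archive (example values only)
archive = Archive(
  archiveName = u'example',                      # column assumed from self.archive.archiveName above
  archiveUrl = u'http://example.com/oilspider',  # spider endpoint used by update()/data_url()
  archiveType = u'Text')

# fetches <archiveUrl>?modDate=<pubDateTimestamp>, then imports each returned id
# via <archiveUrl>?id=<id> using jsonLoadArchiveItem/jsonImportArchiveItem
archive.update()

# fulltext query; needs the index from the ArchiveItem comment:
#   ALTER TABLE archive_item ADD FULLTEXT (title, description, text);
for item in queryArchive('solaris', orderBy = 'score', offset = 0, count = 10):
  print item.title, item.score  # score is only set when loaded via queryArchive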