oilarchive/oilarchive/model.py

406 lines
12 KiB
Python

# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
from datetime import datetime
import time
from urllib import quote
import urlparse
import md5
from turbogears.database import PackageHub
from turbogears import identity
from turbojson.jsonify import jsonify_sqlobject
import MySQLdb
from sqlobject import *
from scrapeit.utils import read_url
import simplejson
from oilspider import jsonLoadArchiveItem, jsonImportArchiveItem
import utils
# TurboGears database hub for the "oilarchive" package.
# The module-level __connection__ is the name SQLObject looks up to find the
# connection for the model classes declared in this module.
hub = PackageHub("oilarchive")
__connection__ = hub
def queryArchive(query, orderBy="score", offset = 0, count = 100):
  '''
  Run a MySQL boolean-mode full-text search over archive_item.

  query    search terms, matched against title, description and text
  orderBy  one of 'score', 'size', 'title', 'description'; any other value
           (and 'score' itself) sorts by best score first, then title
  offset   number of leading matches to skip
  count    maximum number of items to return

  Returns a list of ArchiveItem objects. Each item gets a `score` attribute
  scaled so that the best match of the whole result set is 100; when no
  usable maximum exists the raw score is stored instead.
  '''
  query = MySQLdb.escape_string(query)
  orderBy = orderBy.encode('utf-8')
  # Only whitelisted ORDER BY clauses ever reach the SQL string.
  # 'score' is deliberately absent so that it falls through to the
  # descending default instead of sorting the worst matches first.
  order_clauses = {
    'size': 'size DESC',
    'title': 'title',
    'description': 'description',
  }
  order_clause = order_clauses.get(orderBy, 'score DESC, title')
  match_b = '''MATCH (title, description, text) AGAINST ('%s' IN BOOLEAN MODE)''' % query
  sql = """SELECT id, ((100000/LENGTH(text)) * %s) AS score, title, size, description FROM archive_item
WHERE %s ORDER BY %s""" % \
    (match_b, match_b, order_clause)
  matches = ArchiveItem._connection.queryAll(sql)

  # Normalise against the best score of the whole result set, so that the
  # value does not depend on the requested page or on the sort order.
  # Rows whose text is empty yield a NULL score (division by zero in MySQL)
  # and are ignored here.
  max_score = None
  scores = [m[1] for m in matches if m[1]]
  if scores:
    max_score = max(scores) / 100

  # A plain slice returns an empty page when offset is past the end.
  offset = max(offset, 0)
  count = max(count, 0)
  result = []
  for m in matches[offset:offset + count]:
    item = ArchiveItem.get(m[0])
    score = m[1] or 0
    if max_score:
      item.score = score / max_score
    else:
      item.score = score
    result.append(item)
  return result
class ArchiveItem(SQLObject):
  '''
  One item imported from a remote Archive, with its metadata and full text.

  After creating the table, these manual changes to the db are needed:

  ALTER TABLE archive_item ADD FULLTEXT (title, description, text);
  ALTER TABLE archive_item CHANGE size size bigint;
  ALTER TABLE archive_item CHANGE html html longtext;
  ALTER TABLE archive_item CHANGE description description longtext;
  ALTER TABLE archive_item CHANGE text text longtext;
  '''
  hashId = UnicodeCol(alternateID = True, length=128)
  archiveItemId = UnicodeCol()
  icon = UnicodeCol() # -> url (128x128)
  title = UnicodeCol()
  titleSort = UnicodeCol(default = '')
  author = UnicodeCol()
  authorSort = UnicodeCol(default = '')
  description = UnicodeCol() # text(for rss)
  html = UnicodeCol(length = 2**25) #(for page, contains javascript)
  text = UnicodeCol(length = 2**25) #Fulltext
  relDate = DateTimeCol() #timestamp (item released)
  pubDate = DateTimeCol() #timestamp (item published)
  modDate = DateTimeCol() #timestamp (item last modified)
  archiveUrl = UnicodeCol() # -> url (link to archive page)
  downloadUrl = UnicodeCol() # -> url (link to item)
  storeUrl = UnicodeCol() # -> url (link to store)
  size = IntCol() #bytes
  rightsLevel = IntCol(default = 5) #-> int: 0 (free) - 5 (unfree)
  rightsText = UnicodeCol(default = '')
  kind = UnicodeCol() #string (Text, Pictures, Music, Movies, Software)
  fileType = UnicodeCol() #fileType (pdf, txt etc)
  genre = UnicodeCol(default = '')
  archive = ForeignKey('Archive')
  created = DateTimeCol(default=datetime.now)

  #score is only available if loaded via queryArchive
  score = -1

  def _get_filetype(self):
    '''Upper-cased text after the last "." of the download url.'''
    return self.downloadUrl.split('.')[-1].upper()

  def _get_sizeFormated(self):
    '''Size as formatted by utils.formatFileSize.'''
    return utils.formatFileSize(self.size)

  def getPreview(self, sort):
    '''
    Short text describing this item for the given sort key: the formatted
    size for 'size', the integer score for 'relevance', otherwise the
    formatted release date.
    '''
    if sort == 'size':
      return self.sizeFormated
    if sort == 'relevance':
      return "%d" % self.score
    return self.relDateFormated

  def _set_author(self, value):
    '''Store the author; also seed authorSort while it is still empty.'''
    self._SO_set_author(value)
    if not self.authorSort:
      self.authorSort = value

  def _set_title(self, value):
    '''Store the title; also seed titleSort while it is still empty.'''
    self._SO_set_title(value)
    if not self.titleSort:
      self.titleSort = value

  def _get_year(self):
    '''Four-digit year of the release date, as a string.'''
    return self.relDate.strftime('%Y')

  def rightsLevelClass(self, level):
    '''Name of the CSS class marking whether level is this item's rights level.'''
    if level == self.rightsLevel:
      return "rightsLevelActive"
    return "rightsLevelInactive"

  def _get_relDateFormated(self):
    '''Release date as the year only for movies and books, else YYYY-MM-DD.'''
    # NOTE(review): the comment on the `kind` column lists 'Movies' (plural),
    # this test uses 'Movie' -- confirm which value is actually stored.
    if self.kind in ('Movie', 'Book'):
      return self.year
    else:
      return self.relDate.strftime('%Y-%m-%d')

  def domain(self, url):
    '''Host part of an absolute url; url itself if it has fewer than two "/".'''
    d = url.split('/')
    if len(d) > 2:
      return d[2].split('?')[0]
    return url

  #expand urls in case they are relative to the archive
  def _get_archiveUrl(self):
    return self.archive.full_url(self._SO_get_archiveUrl())

  def _get_downloadUrl(self):
    return self.archive.full_url(self._SO_get_downloadUrl())

  def _get_icon(self):
    return self.archive.full_url(self._SO_get_icon())

  def _get_json(self):
    '''
    jsonify_sqlobject() output for this item, with relDate, pubDate and
    modDate replaced by strings holding seconds since the epoch.
    '''
    result = jsonify_sqlobject(self)
    # time.mktime is used because the '%s' strftime directive is not
    # available on every platform; where it is, both yield the same string.
    for key in ('relDate', 'pubDate', 'modDate'):
      result[key] = "%d" % time.mktime(getattr(self, key).timetuple())
    return result

  def update(self, data):
    '''Set every key of data as an attribute, then recompute the hash id.'''
    for key in data:
      setattr(self, key, data[key])
    self.setHashId()

  def setHashId(self):
    '''Store the md5 hex digest of "<archive name>/<archive item id>" as hashId.'''
    salt = u'%s/%s' % (self.archive.archiveName, self.archiveItemId)
    # The column is hashId; assigning to any other spelling only creates a
    # plain instance attribute and never reaches the database.
    self.hashId = md5.new(salt.encode('utf-8')).hexdigest()

  def htmlHighlight(self, term):
    '''The html column passed through utils.highlightText for term.'''
    return utils.highlightText(self.html, term)
class Archive(SQLObject):
  '''
  A remote archive, queried through archiveUrl for metadata and items.
  '''
  archiveId = UnicodeCol(alternateID = True, length = 1000)
  archiveName = UnicodeCol()
  archiveUrl = UnicodeCol()
  ttl = IntCol(default = 900) #seconds
  pubDate = DateTimeCol(default=datetime.now)
  modDate = DateTimeCol(default=datetime.now)
  created = DateTimeCol(default=datetime.now)
  initialized = BoolCol(default = False)
  css = UnicodeCol(default='')
  js = UnicodeCol(default='')
  icon = UnicodeCol() # -> url (128x128)
  hashId = UnicodeCol(alternateID = True, length=128)

  def setHashId(self):
    '''Store the md5 hex digest of this row's id as hashId.'''
    self.hashId = md5.new("%s" % self.id).hexdigest()

  def _get_modDateTimestamp(self):
    '''modDate as integer epoch seconds, or -1 while not yet initialized.'''
    if self.initialized:
      return int(time.mktime(self.modDate.timetuple()))
    return -1

  def _query_url(self, query):
    '''archiveUrl followed by "?" and the url-quoted key=value pairs of query.'''
    url = "%s?" % self.archiveUrl
    url += "&".join(["%s=%s" % (key, quote("%s" % query[key])) for key in query])
    return url

  def _get_update_url(self):
    '''Url that passes the current modDateTimestamp as modDate.'''
    return self._query_url({'modDate': self.modDateTimestamp})

  def _get_metadata_url(self):
    '''Url that requests the archive metadata.'''
    return self._query_url({'metadata': '1'})

  def data_url(self, id):
    '''Url that requests the item with the given id.'''
    return self._query_url({'id': id})

  def full_url(self, url):
    '''
    Expand url relative to this archive: '' for an empty url, unchanged if
    it already contains "://", prefixed with scheme and host of archiveUrl
    if it starts with "/", otherwise appended to archiveUrl.
    '''
    if not url:
      return ''
    if url.find('://') > 0:
      return url
    if url.startswith('/'):
      domain = "://".join(urlparse.urlsplit(self.archiveUrl)[0:2])
      url = "%s%s" % (domain, url)
    else:
      url = "%s/%s" % (self.archiveUrl, url)
    return url

  def _get_iconUrl(self):
    '''Local icon path for this archive, or the stock collection icon.'''
    if self.icon:
      return "/icon/%s.png" % self.hashId
    else:
      return "/static/images/iconCollection.png"

  def update(self):
    '''
    Refresh the archive metadata, then import every item listed at
    update_url. Items that fail to load are reported and skipped.
    Finally marks the archive initialized and sets modDate to now.
    '''
    result = simplejson.loads(read_url(self.metadata_url))
    if result:
      if 'name' in result:
        self.archiveName = result['name']
      if 'id' in result:
        self.archiveId = result['id']
      if 'ttl' in result:
        self.ttl = int(result['ttl'])
      if 'icon' in result:
        self.icon = result['icon']
      # css and js are stored as the fetched content; a failed fetch
      # stores an empty string (best effort).
      for key in ('css', 'js'):
        if key in result:
          try:
            data = read_url(self.full_url(result[key]))
          except Exception:
            data = ''
          setattr(self, key, data)
    else:
      self.icon = ''
      self.js = ''
      self.css = ''
    result = simplejson.loads(read_url(self.update_url))
    items = result.get('items', [])
    print("importing %d items" % len(items))
    for id in items:
      try:
        self.updateItem(id)
      except Exception:
        # one broken item must not abort the whole import
        print("failed to load  %s from  %s" % (id, self.data_url(id)))
        continue
    self.initialized = True
    self.modDate = datetime.now()

  def updateItem(self, id):
    '''
    Fetch the item with the given id from the archive; import it if this
    archive has no ArchiveItem with that id, otherwise update the first one.
    '''
    data = read_url(self.data_url(id))
    data = jsonLoadArchiveItem(data)
    print(data['title'].encode('utf-8'))
    q = ArchiveItem.select(AND(
      ArchiveItem.q.archiveItemId == id,
      ArchiveItem.q.archiveID == self.id))
    if q.count() == 0:
      jsonImportArchiveItem(self, id, data)
    else:
      q[0].update(data)

  def cleanUp(self):
    '''
    get list of all items from archive and remove those from ArchiveItem that
    are no longer in the list
    '''
    url = self._query_url({'modDate': -1})
    result = simplejson.loads(read_url(url))
    # set for constant-time membership tests; assumes the listed ids are
    # hashable scalars, as the url building in data_url already implies
    archiveItems = set(result.get('items', []))
    archivedItems = {}
    for i in ArchiveItem.select(ArchiveItem.q.archiveID == self.id):
      archivedItems[i.archiveItemId] = i.id
    for itemId in archivedItems:
      if itemId not in archiveItems:
        ArchiveItem.delete(archivedItems[itemId])
class SortName(SQLObject):
  '''
  Table with a single `name` column, declared as an alternate ID.
  '''
  name = UnicodeCol(alternateID=True, length=1000)
# identity models.
class Visit(SQLObject):
  '''
  A visit, identified by its visit_key, with creation and expiry times.
  '''
  class sqlmeta:
    table = "visit"

  visit_key = StringCol(alternateID=True, length=40,
                        alternateMethodName="by_visit_key")
  created = DateTimeCol(default=datetime.now)
  expiry = DateTimeCol()

  @classmethod
  def lookup_visit(cls, visit_key):
    '''Return the Visit stored under visit_key, or None if there is none.'''
    try:
      found = cls.by_visit_key(visit_key)
    except SQLObjectNotFound:
      found = None
    return found
class VisitIdentity(SQLObject):
  '''
  Pairs a visit_key with a user_id.
  '''
  visit_key = StringCol(alternateID=True, length=40,
                        alternateMethodName="by_visit_key")
  user_id = IntCol()
class Group(SQLObject):
  '''
  Minimal group model: a named group joined to users and permissions.
  '''
  # "Group", "Order" and "User" are reserved words in SQL, so the table
  # is given an explicit name that is safe to use.
  class sqlmeta:
    table = "tg_group"

  group_name = UnicodeCol(alternateID=True, length=16,
                          alternateMethodName="by_group_name")
  display_name = UnicodeCol(length=255)
  created = DateTimeCol(default=datetime.now)

  # all users that are members of this group
  users = RelatedJoin("User",
                      joinColumn="group_id",
                      otherColumn="user_id",
                      intermediateTable="user_group")

  # all permissions granted to this group
  permissions = RelatedJoin("Permission",
                            joinColumn="group_id",
                            otherColumn="permission_id",
                            intermediateTable="group_permission")
class User(SQLObject):
  '''
  Basic user model: login name, e-mail address, display name, password
  and group membership.
  '''
  # "Group", "Order" and "User" are reserved words in SQL, so the table
  # is given an explicit name that is safe to use.
  class sqlmeta:
    table = "tg_user"

  user_name = UnicodeCol(alternateID=True, length=16,
                         alternateMethodName="by_user_name")
  email_address = UnicodeCol(alternateID=True, length=255,
                             alternateMethodName="by_email_address")
  display_name = UnicodeCol(length=255)
  password = UnicodeCol(length=40)
  created = DateTimeCol(default=datetime.now)

  # all groups this user is a member of
  groups = RelatedJoin("Group",
                       joinColumn="user_id",
                       otherColumn="group_id",
                       intermediateTable="user_group")

  def _get_permissions(self):
    '''Set of all permissions of all groups this user belongs to.'''
    granted = set()
    for group in self.groups:
      granted.update(group.permissions)
    return granted

  def _set_password(self, cleartext_password):
    '''Store cleartext_password after passing it through identity.encrypt_password.'''
    encrypted = identity.encrypt_password(cleartext_password)
    self._SO_set_password(encrypted)

  def set_password_raw(self, password):
    '''Store password exactly as given, without encrypting it.'''
    self._SO_set_password(password)
class Permission(SQLObject):
  '''
  A named permission with a description, joined to the groups holding it.
  '''
  permission_name = UnicodeCol(alternateID=True, length=16,
                               alternateMethodName="by_permission_name")
  description = UnicodeCol(length=255)

  # all groups this permission is granted to
  groups = RelatedJoin("Group",
                       joinColumn="permission_id",
                       otherColumn="group_id",
                       intermediateTable="group_permission")