406 lines
12 KiB
Python
406 lines
12 KiB
Python
# -*- Mode: Python; -*-
|
|
# -*- coding: utf-8 -*-
|
|
# vi:si:et:sw=2:sts=2:ts=2
|
|
|
|
from datetime import datetime
|
|
import time
|
|
from urllib import quote
|
|
import urlparse
|
|
import md5
|
|
|
|
from turbogears.database import PackageHub
|
|
from turbogears import identity
|
|
from turbojson.jsonify import jsonify_sqlobject
|
|
import MySQLdb
|
|
from sqlobject import *
|
|
|
|
from scrapeit.utils import read_url
|
|
import simplejson
|
|
|
|
from oilspider import jsonLoadArchiveItem, jsonImportArchiveItem
|
|
import utils
|
|
|
|
# One PackageHub for the "oilarchive" package, bound both to `hub` and to the
# module-level `__connection__` name so the model classes below share it.
__connection__ = hub = PackageHub("oilarchive")
|
|
|
|
|
|
def queryArchive(query, orderBy="score", offset = 0, count = 100):
  '''
    Run a MySQL boolean-mode full-text search over archive_item
    (title, description, text) and return the matching ArchiveItem objects.

    query   -- search expression; escaped with MySQLdb.escape_string before
               it is interpolated into the statement
    orderBy -- one of 'score', 'size', 'title', 'description'; any other
               value falls back to 'score DESC, title'. 'size' sorts
               descending.
    offset  -- number of leading matches to skip
    count   -- maximum number of items to return

    Every returned item gets a transient `score` attribute scaled so that
    the best match of the whole result set (not only the requested page) is
    100. An offset past the last match yields an empty list.
  '''
  query = MySQLdb.escape_string(query)
  orderBy = orderBy.encode('utf-8')
  # orderBy is pasted into the SQL text, so only these fixed values may pass.
  if orderBy not in ('score', 'size', 'title', 'description'):
    orderBy = 'score DESC, title'
  if orderBy == 'size':
    orderBy = "size DESC"
  match_b = '''MATCH (title, description, text) AGAINST ('%s' IN BOOLEAN MODE)''' % query

  # The relevance is divided by the text length so long documents do not
  # win merely by containing more words.
  sql = """SELECT id, ((100000/LENGTH(text)) * %s) AS score, title, size, description FROM archive_item
           WHERE %s ORDER BY %s""" % \
        (match_b, match_b, orderBy)
  matches = ArchiveItem._connection.queryAll(sql)

  # Normalise against the real maximum of the result set. Taking the first
  # row as the maximum is only right when the rows are ordered by score DESC.
  # The score is NULL (None) for rows whose text is empty (division by zero).
  scores = [m[1] for m in matches if m[1]]
  max_score = None
  if scores:
    max_score = max(scores) / 100

  offset = max(int(offset), 0)
  count = max(int(count), 0)
  result = []
  for m in matches[offset:offset + count]:
    item = ArchiveItem.get(m[0])
    score = m[1] or 0
    if max_score:
      item.score = score / max_score
    else:
      item.score = score
    result.append(item)
  return result
|
|
|
|
class ArchiveItem(SQLObject):
  '''
    One item of an Archive (see the `archive` foreign key).

    After creating the table, these manual changes to the db are needed:

      ALTER TABLE archive_item ADD FULLTEXT (title, description, text);
      ALTER TABLE archive_item CHANGE size size bigint;
      ALTER TABLE archive_item CHANGE html html longtext;
      ALTER TABLE archive_item CHANGE description description longtext;
      ALTER TABLE archive_item CHANGE text text longtext;
  '''
  hashId = UnicodeCol(alternateID = True, length=128)
  archiveItemId = UnicodeCol()
  icon = UnicodeCol() # -> url (128x128)
  title = UnicodeCol()
  titleSort = UnicodeCol(default = '')
  author = UnicodeCol()
  authorSort = UnicodeCol(default = '')
  description = UnicodeCol() # text(for rss)
  html = UnicodeCol(length = 2**25) #(for page, contains javascript)
  text = UnicodeCol(length = 2**25) #Fulltext
  relDate = DateTimeCol() #timestamp (item released)
  pubDate = DateTimeCol() #timestamp (item published)
  modDate = DateTimeCol() #timestamp (item modified)
  archiveUrl = UnicodeCol() # -> url (link to archive page)
  downloadUrl = UnicodeCol() # -> url (link to item)
  storeUrl = UnicodeCol() # -> url (link to store)
  size = IntCol() #bytes
  rightsLevel = IntCol(default = 5) #-> int: 0 (free) - 5 (unfree)
  rightsText = UnicodeCol(default = '')
  kind = UnicodeCol() #string (Text, Pictures, Music, Movies, Software)
  fileType = UnicodeCol() #fileType (pdf, txt etc)
  genre = UnicodeCol(default = '')

  archive = ForeignKey('Archive')
  created = DateTimeCol(default=datetime.now)

  # score is only available if the item was loaded via queryArchive
  score = -1

  def _get_filetype(self):
    '''upper-cased text after the last "." of downloadUrl'''
    return self.downloadUrl.split('.')[-1].upper()

  def _get_sizeFormated(self):
    '''size rendered by utils.formatFileSize'''
    return utils.formatFileSize(self.size)

  def getPreview(self, sort):
    '''
      short text shown next to the item in a list sorted by `sort`:
      the formatted size for 'size', the integer score for 'relevance',
      otherwise the formatted release date
    '''
    if sort == 'size':
      return self.sizeFormated
    if sort == 'relevance':
      return "%d" % self.score
    return self.relDateFormated

  def _set_author(self, value):
    '''store author; authorSort defaults to the same value while it is empty'''
    self._SO_set_author(value)
    if not self.authorSort:
      self.authorSort = value

  def _set_title(self, value):
    '''store title; titleSort defaults to the same value while it is empty'''
    self._SO_set_title(value)
    if not self.titleSort:
      self.titleSort = value

  def _get_year(self):
    '''four digit release year as a string'''
    return self.relDate.strftime('%Y')

  def rightsLevelClass(self, level):
    '''css class name marking whether `level` is this item's rightsLevel'''
    if level == self.rightsLevel:
      return "rightsLevelActive"
    return "rightsLevelInactive"

  def _get_relDateFormated(self):
    '''release date: only the year for kind Movie/Book, else YYYY-MM-DD'''
    if self.kind in ('Movie', 'Book'):
      return self.year
    else:
      return self.relDate.strftime('%Y-%m-%d')

  def domain(self, url):
    '''
      host part of an absolute url (third "/"-separated piece, cut at "?");
      a url with fewer than three pieces is returned unchanged
    '''
    d = url.split('/')
    if len(d) > 2:
      return d[2].split('?')[0]
    return url

  #expand urls in case they are relative to the archive
  def _get_archiveUrl(self):
    return self.archive.full_url(self._SO_get_archiveUrl())

  def _get_downloadUrl(self):
    return self.archive.full_url(self._SO_get_downloadUrl())

  def _get_icon(self):
    return self.archive.full_url(self._SO_get_icon())

  def _get_json(self):
    '''
      jsonify_sqlobject() result with the three dates replaced by
      strftime('%s') strings.
      NOTE(review): '%s' (seconds since the epoch) is a platform extension
      of strftime and is not available everywhere - confirm the target
      platform.
    '''
    result = jsonify_sqlobject(self)
    result['relDate'] = self.relDate.strftime('%s')
    result['pubDate'] = self.pubDate.strftime('%s')
    # was self.relDate, which exported the release date as modification date
    result['modDate'] = self.modDate.strftime('%s')
    return result

  def update(self, data):
    '''set every key of the dict `data` as attribute, then refresh hashId'''
    for key in data:
      setattr(self, key, data[key])
    self.setHashId()

  def setHashId(self):
    '''hashId = md5 hex digest of "<archive name>/<archiveItemId>" (utf-8)'''
    salt = u'%s/%s' % (self.archive.archiveName, self.archiveItemId)
    # The column is declared as hashId; the former spelling "hashID" only
    # created an unrelated attribute and presumably never reached the db.
    self.hashId = md5.new(salt.encode('utf-8')).hexdigest()

  def htmlHighlight(self, term):
    '''html with `term` marked up by utils.highlightText'''
    return utils.highlightText(self.html, term)
|
|
|
|
|
|
class Archive(SQLObject):
|
|
archiveId = UnicodeCol(alternateID = True, length = 1000)
|
|
archiveName = UnicodeCol()
|
|
archiveUrl = UnicodeCol()
|
|
ttl = IntCol(default = "900") #seconds
|
|
pubDate = DateTimeCol(default=datetime.now)
|
|
modDate = DateTimeCol(default=datetime.now)
|
|
created = DateTimeCol(default=datetime.now)
|
|
initialized = BoolCol(default = False)
|
|
css = UnicodeCol(default='')
|
|
js = UnicodeCol(default='')
|
|
icon = UnicodeCol() # -> url (128x128)
|
|
|
|
hashId = UnicodeCol(alternateID = True, length=128)
|
|
|
|
def setHashId(self):
|
|
self.hashId = md5.new("%s" % self.id).hexdigest()
|
|
|
|
def _get_modDateTimestamp(self):
|
|
if self.initialized:
|
|
return int(time.mktime(self.modDate.timetuple()))
|
|
return -1
|
|
|
|
def _query_url(self, query):
|
|
url = "%s?" % self.archiveUrl
|
|
url += "&".join(["%s=%s" % (key, quote("%s" % query[key])) for key in query])
|
|
return url
|
|
|
|
def _get_update_url(self):
|
|
return self._query_url({'modDate': self.modDateTimestamp})
|
|
|
|
def _get_metadata_url(self):
|
|
return self._query_url({'metadata': '1'})
|
|
|
|
def data_url(self, id):
|
|
return self._query_url({'id': id})
|
|
|
|
def full_url(self, url):
|
|
if not url:
|
|
return ''
|
|
if url.find('://') > 0:
|
|
return url
|
|
if url.startswith('/'):
|
|
domain = "://".join(urlparse.urlsplit(self.archiveUrl)[0:2])
|
|
url = "%s%s" % (domain, url)
|
|
else:
|
|
url = "%s/%s" % (self.archiveUrl, url)
|
|
return url
|
|
|
|
def _get_iconUrl(self):
|
|
if self.icon:
|
|
return "/icon/%s.png" % self.hashId
|
|
else:
|
|
return "/static/images/iconCollection.png"
|
|
|
|
def update(self):
|
|
result = simplejson.loads(read_url(self.metadata_url))
|
|
if result:
|
|
if result.has_key('name'):
|
|
self.archiveName = result['name']
|
|
if result.has_key('id'):
|
|
self.archiveId = result['id']
|
|
if result.has_key('ttl'):
|
|
self.ttl = int(result['ttl'])
|
|
if result.has_key('icon'):
|
|
self.icon = result['icon']
|
|
if result.has_key('css'):
|
|
try:
|
|
data = read_url(self.full_url(result['css']))
|
|
self.css = data
|
|
except:
|
|
self.css = ''
|
|
if result.has_key('js'):
|
|
try:
|
|
data = read_url(self.full_url(result['js']))
|
|
self.js = data
|
|
except:
|
|
self.js = ''
|
|
else:
|
|
self.icon = ''
|
|
self.js = ''
|
|
self.css = ''
|
|
result = simplejson.loads(read_url(self.update_url))
|
|
items = result.get('items', [])
|
|
print "importing", len(items), "items"
|
|
for id in items:
|
|
try:
|
|
self.updateItem(id)
|
|
except:
|
|
print "failed to load ", id, "from ", self.data_url(id)
|
|
continue
|
|
self.initialized = True
|
|
self.modDate = datetime.now()
|
|
|
|
def updateItem(self, id):
|
|
data = read_url(self.data_url(id))
|
|
data = jsonLoadArchiveItem(data)
|
|
print data['title'].encode('utf-8')
|
|
q = ArchiveItem.select(AND(
|
|
ArchiveItem.q.archiveItemId == id,
|
|
ArchiveItem.q.archiveID == self.id))
|
|
if q.count() == 0:
|
|
jsonImportArchiveItem(self, id, data)
|
|
else:
|
|
q[0].update(data)
|
|
|
|
'''
|
|
get list of all items from archive and remove those from ArchiveItem that
|
|
are no longer in the list
|
|
'''
|
|
def cleanUp(self):
|
|
url = self._query_url({'modDate': -1})
|
|
result = simplejson.loads(read_url(url))
|
|
archiveItems = result.get('items', [])
|
|
archivedItems = {}
|
|
for i in ArchiveItem.select(ArchiveItem.q.archiveID == self.id):
|
|
archivedItems[i.archiveItemId] = i.id
|
|
removeItems = filter(lambda i: i not in archiveItems, archivedItems.keys())
|
|
for i in removeItems: ArchiveItem.delete(archivedItems[i])
|
|
|
|
class SortName(SQLObject):
  '''
    Table with a single unique `name` column (at most 1000 characters).
  '''
  name = UnicodeCol(alternateID=True, length=1000)
|
|
|
|
# identity models.
|
|
class Visit(SQLObject):
  '''
    One visit, identified by its visit_key; stored in the table "visit".
  '''
  class sqlmeta:
    table = "visit"

  visit_key = StringCol(
    length=40,
    alternateID=True,
    alternateMethodName="by_visit_key",
  )
  created = DateTimeCol(default=datetime.now)
  expiry = DateTimeCol()

  @classmethod
  def lookup_visit(cls, visit_key):
    '''return the Visit stored for visit_key, or None if there is none'''
    try:
      visit = cls.by_visit_key(visit_key)
    except SQLObjectNotFound:
      visit = None
    return visit
|
|
|
|
class VisitIdentity(SQLObject):
  '''
    Connects a visit_key with the id of a user.
  '''
  visit_key = StringCol(
    length=40,
    alternateID=True,
    alternateMethodName="by_visit_key",
  )
  user_id = IntCol()
|
|
|
|
|
|
class Group(SQLObject):
  '''
    A named group of users that carries a set of permissions.
  '''

  # "Group", "Order" and "User" are reserved words in SQL, hence the
  # explicit table name
  class sqlmeta:
    table = "tg_group"

  group_name = UnicodeCol(
    length=16,
    alternateID=True,
    alternateMethodName="by_group_name",
  )
  display_name = UnicodeCol(length=255)
  created = DateTimeCol(default=datetime.now)

  # all users that are members of this group
  users = RelatedJoin(
    "User",
    intermediateTable="user_group",
    joinColumn="group_id",
    otherColumn="user_id",
  )

  # all permissions granted to this group
  permissions = RelatedJoin(
    "Permission",
    joinColumn="group_id",
    intermediateTable="group_permission",
    otherColumn="permission_id",
  )
|
|
|
|
|
|
class User(SQLObject):
  '''
    A user account: login name, email address, display name and password,
    plus the groups the user is a member of.
  '''

  # "Group", "Order" and "User" are reserved words in SQL, hence the
  # explicit table name
  class sqlmeta:
    table = "tg_user"

  user_name = UnicodeCol(
    length=16,
    alternateID=True,
    alternateMethodName="by_user_name",
  )
  email_address = UnicodeCol(
    length=255,
    alternateID=True,
    alternateMethodName="by_email_address",
  )
  display_name = UnicodeCol(length=255)
  password = UnicodeCol(length=40)
  created = DateTimeCol(default=datetime.now)

  # the groups this user is a member of
  groups = RelatedJoin(
    "Group",
    intermediateTable="user_group",
    joinColumn="user_id",
    otherColumn="group_id",
  )

  def _get_permissions(self):
    '''set of the permissions of all groups of this user'''
    granted = set()
    for group in self.groups:
      granted.update(group.permissions)
    return granted

  def _set_password(self, cleartext_password):
    '''store the result of identity.encrypt_password(cleartext_password)'''
    encrypted = identity.encrypt_password(cleartext_password)
    self._SO_set_password(encrypted)

  def set_password_raw(self, password):
    '''store `password` unchanged, without running it through the hash'''
    self._SO_set_password(password)
|
|
|
|
|
|
|
|
class Permission(SQLObject):
  '''
    A named permission and the groups it is granted to.
  '''
  permission_name = UnicodeCol(
    length=16,
    alternateID=True,
    alternateMethodName="by_permission_name",
  )
  description = UnicodeCol(length=255)

  # the groups holding this permission
  groups = RelatedJoin(
    "Group",
    intermediateTable="group_permission",
    joinColumn="permission_id",
    otherColumn="group_id",
  )
|
|
|
|
|