pandora/pandora/document/models.py

795 lines
30 KiB
Python
Raw Normal View History

2013-03-24 12:28:57 +00:00
# -*- coding: utf-8 -*-
from __future__ import division, print_function, absolute_import
2013-03-24 12:28:57 +00:00
import os
import re
2014-02-14 13:35:40 +00:00
from glob import glob
2016-10-04 22:00:03 +00:00
import unicodedata
2013-03-24 12:28:57 +00:00
2017-01-24 15:19:34 +00:00
from six import PY2, string_types
2016-09-20 13:59:49 +00:00
from six.moves.urllib.parse import quote, unquote
2016-10-04 22:00:03 +00:00
from django.db import models, transaction
from django.db.models import Q, Sum, Max
2018-07-29 20:12:56 +00:00
from django.contrib.auth import get_user_model
2013-03-24 12:28:57 +00:00
from django.db.models.signals import pre_delete
2016-10-04 22:00:03 +00:00
from django.conf import settings
2017-03-03 07:56:35 +00:00
from django.utils.encoding import python_2_unicode_compatible
2018-06-19 18:48:18 +00:00
from oxdjango.fields import JSONField
2013-03-24 12:28:57 +00:00
2016-06-25 18:36:20 +00:00
from PIL import Image
2013-03-24 12:28:57 +00:00
import ox
2016-10-04 22:00:03 +00:00
from oxdjango.sortmodel import get_sort_field
from person.models import get_name_sort
2013-05-27 11:21:08 +00:00
from item.models import Item
2017-01-26 15:56:28 +00:00
from annotation.models import Annotation
from archive.extract import resize_image
from archive.chunk import save_chunk
2018-07-29 20:28:46 +00:00
from user.models import Group
2013-05-27 11:21:08 +00:00
from . import managers
from . import utils
from .fulltext import FulltextMixin
2013-03-24 12:28:57 +00:00
2018-07-29 20:12:56 +00:00
# Resolve the configured user model once at import time; it is the target of
# the foreign keys declared below.
User = get_user_model()

# Python 3 has no ``unicode`` builtin; alias it to ``str`` so later calls to
# unicode() work on both major versions.
if not PY2:
    unicode = str
2013-03-24 12:28:57 +00:00
2016-10-04 22:00:03 +00:00
def get_path(f, x):
    """Return the storage path for upload ``x`` by delegating to ``f.path``.

    Used as the ``upload_to`` callable of the document file field.
    """
    target = f.path(x)
    return target
2013-03-24 12:28:57 +00:00
2017-03-03 07:56:35 +00:00
@python_2_unicode_compatible
class Document(models.Model, FulltextMixin):
    """A stored document (pdf, html text or image) with its metadata.

    Free-form metadata lives in ``data``; the methods of this class keep the
    derived Sort, Find and Facet rows in sync with it.
    """

    # Row timestamps maintained automatically by Django.
    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)

    # Owner and the groups the document is associated with.
    user = models.ForeignKey(User, related_name='documents')
    groups = models.ManyToManyField(Group, blank=True, related_name='documents')

    # Basic file facts; ``matches`` is recomputed by update_matches().
    extension = models.CharField(max_length=255)
    size = models.IntegerField(default=0)
    matches = models.IntegerField(default=0)

    # -1 is used as the "not determined yet" sentinel (see get_info/get_ratio).
    ratio = models.FloatField(default=-1)
    pages = models.IntegerField(default=-1)
    width = models.IntegerField(default=-1)
    height = models.IntegerField(default=-1)

    # Hash of the uploaded file, set once the upload has completed.
    oshash = models.CharField(max_length=16, unique=True, null=True)

    file = models.FileField(default=None, blank=True, null=True, upload_to=get_path)

    objects = managers.DocumentManager()

    # True while chunks are still being uploaded (see save_chunk()).
    uploading = models.BooleanField(default=False)

    # Relations to the objects that reference this document.
    items = models.ManyToManyField(Item, through='ItemProperties', related_name='documents')
    annotations = models.ManyToManyField(Annotation, related_name='documents')
    linked_documents = models.ManyToManyField('Document', related_name='linking_documents')

    # Compared against the per-level 'canSeeDocument' capability in access().
    rightslevel = models.IntegerField(db_index=True, default=0)
    # Free-form metadata; html documents keep their body under data['text'].
    data = JSONField(default=dict, editable=False)
2016-10-04 22:00:03 +00:00
def update_access(self, user):
    """Record that ``user`` accessed this document.

    Unauthenticated users share a single Access row with ``user=None``.
    A freshly created row has already been saved by get_or_create();
    an existing row is saved again so its counter is bumped.
    """
    viewer = user if user.is_authenticated() else None
    access, created = Access.objects.get_or_create(document=self, user=viewer)
    if created:
        return
    access.save()
def update_facet(self, key):
    """Recompute the Facet rows of this document for ``key``.

    The values are read via get_value(); for the synthetic key 'name' the
    values of all document keys with sortType 'person' are combined.
    Values are optionally reduced through the key's 'filterMap' regular
    expression, de-duplicated, stripped of markup and NFKD-normalized
    before being handed to update_facet_values().
    """
    current_values = self.get_value(key, [])
    if key == 'name':
        current_values = []
        for k in settings.CONFIG['documentKeys']:
            if k.get('sortType') == 'person':
                # Document.get is a classmethod that looks up by id; the
                # per-key value has to come from get_value().
                names = self.get_value(k['id'], [])
                if isinstance(names, list):
                    current_values += names
                elif names:
                    # a scalar must not be spread into single characters
                    current_values.append(unicode(names))
    if not isinstance(current_values, list):
        if not current_values:
            current_values = []
        else:
            current_values = [unicode(current_values)]

    # get_by_id may find no entry (e.g. for the synthetic 'name' key)
    key_config = utils.get_by_id(settings.CONFIG['documentKeys'], key) or {}
    filter_map = key_config.get('filterMap')
    if filter_map:
        filter_map = re.compile(filter_map)
        _current_values = []
        for value in current_values:
            value = filter_map.findall(value)
            if value:
                _current_values.append(value[0])
        current_values = _current_values

    current_values = list(set(current_values))
    current_values = [ox.decode_html(ox.strip_tags(v)) for v in current_values]
    current_values = [unicodedata.normalize('NFKD', v) for v in current_values]
    self.update_facet_values(key, current_values)
def update_facet_values(self, key, current_values):
    """Synchronise stored Facet rows for ``key`` with ``current_values``.

    Comparison is case-insensitive: rows whose value is no longer present
    are deleted, missing values are created, and on creation rows that
    differ only in case are removed (this document) or rewritten to the
    new spelling (all documents).
    """
    wanted = {value.lower() for value in current_values}
    saved_values = [
        facet.value.lower()
        for facet in Facet.objects.filter(document=self, key=key)
    ]
    removed_values = [v for v in saved_values if v not in wanted]

    if removed_values:
        condition = Q()
        for stale in removed_values:
            condition |= Q(value__iexact=stale)
        Facet.objects.filter(document=self, key=key).filter(condition).delete()

    for value in current_values:
        lowered = value.lower()
        if lowered in saved_values:
            continue
        sortvalue = value
        if key in self.person_keys + ['name']:
            sortvalue = get_name_sort(value)
        sortvalue = utils.sort_string(sortvalue).lower()[:900]
        f, created = Facet.objects.get_or_create(document=self, key=key, value=value, sortvalue=sortvalue)
        if created:
            same_spelling = Facet.objects.filter(document=self, key=key, value__iexact=value)
            same_spelling.exclude(value=value).delete()
            everywhere = Facet.objects.filter(key=key, value__iexact=value)
            everywhere.exclude(value=value).update(value=value)
        saved_values.append(lowered)
def update_facets(self):
    """Refresh the facets of every facet key, always including 'title'."""
    keys = set(self.facet_keys + ['title'])
    for facet_key in keys:
        self.update_facet(facet_key)
def update_find(self):
    """Rebuild the Find rows (searchable values) of this document.

    Runs inside one transaction. Fulltext keys, '*', 'dimensions' and
    facet keys are skipped; 'rightslevel' is additionally stored from the
    model field. Empty values remove the corresponding Find row.
    """

    def store(key, value):
        # '' and None mean "no value": drop the row instead of storing it
        if value in ('', None):
            Find.objects.filter(document=self, key=key).delete()
            return
        f, created = Find.objects.get_or_create(document=self, key=key)
        if isinstance(value, bool):
            value = 'true' if value else 'false'
        if isinstance(value, string_types):
            value = ox.decode_html(ox.strip_tags(value.strip()))
            value = unicodedata.normalize('NFKD', value).lower()
        f.value = value
        f.save()

    with transaction.atomic():
        data = self.json()
        for document_key in settings.CONFIG['documentKeys']:
            key_id = document_key['id']
            if key_id == 'rightslevel':
                store(key_id, self.rightslevel)
            if document_key.get('fulltext'):
                continue
            if key_id in ('*', 'dimensions') or key_id in self.facet_keys:
                continue
            value = data.get(key_id)
            if isinstance(value, list):
                value = u'\n'.join(value)
            store(key_id, value)
# Sort columns that update_sort() fills in directly; its loop over the
# configured document keys skips these ids.
base_keys = ('id', 'size', 'dimensions', 'extension', 'matches')
def update_sort(self):
    """Create or refresh the Sort row of this document.

    The base columns (id, extension, size, matches, dimensions) are set
    directly. Every other document key with ``sort`` enabled is converted
    according to its sort type and stored on the row, which is then saved.
    """
    # Imported locally: the date branch previously used an undefined
    # ``datetime_safe`` name and raised NameError.
    from datetime import datetime

    try:
        s = self.sort
    except Sort.DoesNotExist:
        s = Sort(document=self)

    s.id = self.id
    s.extension = self.extension
    s.size = self.size
    s.matches = self.matches

    # dimensions sort as "<kind prefix><amount>": pdf pages, html word
    # count, or pixel area for everything else
    if self.extension == 'pdf':
        prefix = 2
        value = self.pages
    elif self.extension == 'html':
        prefix = 1
        value = self.dimensions
    else:
        prefix = 0
        # width/height use -1 for "unknown"; multiplying two sentinels
        # would yield 1, so treat any unknown side as area 0
        if self.width > 0 and self.height > 0:
            value = self.width * self.height
        else:
            value = 0
    if value < 0:
        value = 0
    s.dimensions = ox.sort_string('%d' % prefix) + ox.sort_string('%d' % value)

    def sort_names(values):
        # join the sortable form of each name, lower-cased
        sort_value = u''
        if values:
            sort_value = u'; '.join([get_name_sort(name) for name in values])
        if not sort_value:
            sort_value = u''
        return sort_value.lower()

    def set_value(s, name, value):
        # strings are stored lower-cased with entities decoded; empty -> NULL
        if isinstance(value, string_types):
            value = ox.decode_html(value.lower())
            if not value:
                value = None
        setattr(s, name, value)

    def get_words(source):
        # word count of a string (or newline-joined list of strings)
        value = self.get_value(source)
        if isinstance(value, list):
            value = '\n'.join(value)
        return len(value.split(' ')) if value else 0

    for key in settings.CONFIG['documentKeys']:
        if not key.get('sort', False):
            continue
        name = key['id']
        if name in self.base_keys:
            continue
        source = name
        sort_type = key.get('sortType', key['type'])
        if 'value' in key:
            if 'key' in key['value']:
                source = key['value']['key']
            sort_type = key['value'].get('type', sort_type)
        if isinstance(sort_type, list):
            sort_type = sort_type[0]

        if sort_type == 'title':
            value = self.get_value(source, u'Untitled')
            value = utils.sort_title(value)[:955]
            set_value(s, name, value)
        elif sort_type == 'person':
            value = sort_names(self.get_value(source, []))
            value = utils.sort_string(value)[:955]
            set_value(s, name, value)
        elif sort_type == 'string':
            value = self.get_value(source, u'')
            if isinstance(value, list):
                value = u','.join(value)
            if not isinstance(value, str):
                value = str(value)
            value = utils.sort_string(value)[:955]
            set_value(s, name, value)
        elif sort_type == 'words':
            # NOTE(review): relies on the Sort row having a ``duration``
            # attribute - confirm a document key provides it
            value = get_words(source) if s.duration else None
            set_value(s, name, value)
        elif sort_type == 'wordsperminute':
            value = get_words(source)
            value = value / (s.duration / 60) if value and s.duration else None
            set_value(s, name, value)
        elif sort_type in ('length', 'integer', 'time', 'float'):
            # can be length of strings or length of arrays, i.e. keywords
            value = self.get_value(source)
            if isinstance(value, list):
                value = len(value)
            set_value(s, name, value)
        elif sort_type == 'year':
            value = self.get_value(source)
            set_value(s, name, value)
        elif sort_type == 'date':
            value = self.get_value(source)
            if isinstance(value, string_types):
                value = datetime.strptime(value, '%Y-%m-%d')
            set_value(s, name, value)
    s.save()
2013-03-24 12:28:57 +00:00
def save(self, *args, **kwargs):
    """Save the document and refresh its derived data.

    Size and file info are updated first. Sort, find and facet rows are
    rebuilt when the document is ready (not uploading, and either has a
    file or is an html document): before the database save for existing
    rows, after it for new ones. Match counts and linked documents are
    refreshed on every save.
    """
    if not self.uploading and self.file:
        self.size = self.file.size
        self.get_info()
    if self.extension == 'html':
        self.size = len(self.data.get('text', ''))

    is_ready = not self.uploading and (self.file or self.extension == 'html')
    # must be decided before the row gets its primary key
    is_new = not self.id

    if is_ready and not is_new:
        self.update_sort()
        self.update_find()
        self.update_facets()

    super(Document, self).save(*args, **kwargs)

    if is_ready and is_new:
        self.update_sort()
        self.update_find()
        self.update_facets()
    self.update_matches()
    self.update_linked_documents()
2013-03-24 12:28:57 +00:00
2017-03-03 07:56:35 +00:00
def __str__(self):
    """Use the public document id as the text representation."""
    public_id = self.get_id()
    return public_id
2013-05-27 11:21:08 +00:00
def add(self, item):
    """Attach this document to ``item``.

    A newly created link is placed after the item's current highest index
    and the document's match count is refreshed; the item's sort data is
    updated in every case.
    """
    link, created = ItemProperties.objects.get_or_create(item=item, document=self)
    if created:
        highest = ItemProperties.objects.filter(item=item).aggregate(Max('index'))['index__max']
        link.index = highest + 1
        link.save()
        link.document.update_matches()
    item.update_sort()
2013-05-27 11:21:08 +00:00
def remove(self, item):
    """Detach this document from ``item`` by deleting the link row(s)."""
    links = ItemProperties.objects.filter(item=item, document=self)
    links.delete()
2013-03-24 12:28:57 +00:00
@classmethod
def get(cls, id):
    """Look up a document by its public id as produced by get_id()."""
    pk = ox.fromAZ(id)
    return cls.objects.get(pk=pk)
2013-03-24 12:28:57 +00:00
def get_absolute_url(self):
    """Return the site-relative URL of this document.

    The id is percent-quoted, but colons are kept literal.
    """
    quoted = quote(self.get_id())
    url = '/documents/%s' % quoted
    return url.replace('%3A', ':')
2013-03-24 12:28:57 +00:00
def get_id(self):
    """Return the public id: the primary key encoded with ox.toAZ."""
    public_id = ox.toAZ(self.id)
    return public_id
2013-03-24 12:28:57 +00:00
2017-01-24 23:07:04 +00:00
def access(self, user):
    """Return True if ``user`` may see this document.

    Editors always have access. Everyone else needs a 'canSeeDocument'
    capability level (looked up by user level, 'guest' for anonymous
    users) that is at least the document's rightslevel.
    """
    level = 'guest' if user.is_anonymous() else user.profile.get_level()
    if self.editable(user):
        return True
    allowed_level = settings.CONFIG['capabilities']['canSeeDocument'][level]
    return self.rightslevel <= allowed_level
2016-10-04 22:00:03 +00:00
def editable(self, user, item=None):
    """Return True if ``user`` may edit this document.

    Allowed for the owner, staff, users with the 'canEditDocuments'
    capability, and - when ``item`` is given - users who may edit that
    item. Missing or anonymous users are never allowed.
    """
    if not user or user.is_anonymous():
        return False
    if self.user == user:
        return True
    if user.is_staff:
        return True
    if user.profile.capability('canEditDocuments') is True:
        return True
    if item and item.editable(user):
        return True
    return False
2013-05-27 11:21:08 +00:00
def edit(self, data, user, item=None):
    """Apply the changes in ``data`` to this document (without saving it).

    With ``item`` only the per-item description is updated (and saved on
    the link row). Otherwise each key is sanitized or escaped according
    to its configured type and stored in ``self.data``; 'rightslevel' is
    set on the model field. Empty values of type 'text' or of an
    unrecognized shape remove the key.
    """
    if item:
        p, created = ItemProperties.objects.get_or_create(item=item, document=self)
        if 'description' in data:
            p.description = ox.sanitize_html(data['description'])
            p.save()
        return

    def cleanup(entry):
        # only strings inside a list need escaping
        if isinstance(entry, string_types):
            entry = ox.escape_html(entry)
        return entry

    for key in data:
        configs = [c for c in settings.CONFIG['documentKeys'] if c['id'] == key]
        ktype = (configs[0].get('type') if configs else '') or ''
        value = data[key]
        if key == 'text' and self.extension == 'html':
            self.data['text'] = ox.sanitize_html(data['text'], global_attributes=[
                'data-name',
                'data-type',
                'data-value',
                'lang'
            ])
        elif key == 'rightslevel':
            setattr(self, key, int(value))
        elif ktype == 'text':
            if value:
                self.data[key] = ox.sanitize_html(value)
            elif key in self.data:
                del self.data[key]
        elif ktype == '[text]':
            self.data[key] = [ox.sanitize_html(t) for t in value]
        elif ktype == '[string]':
            self.data[key] = [ox.escape_html(t) for t in value]
        elif isinstance(value, string_types):
            self.data[key] = ox.escape_html(value)
        elif isinstance(value, list):
            self.data[key] = [cleanup(i) for i in value]
        elif isinstance(value, (int, float)):
            self.data[key] = value
        elif value:
            self.data[key] = ox.escape_html(value)
        elif key in self.data:
            del self.data[key]
2013-03-24 12:28:57 +00:00
2014-01-07 11:05:10 +00:00
@property
def dimensions(self):
    """Size measure by type: pdf page count, html word count, else [width, height]."""
    if self.extension == 'pdf':
        return self.pages
    if self.extension == 'html':
        words = self.data.get('text', '').split(' ')
        return len(words)
    return self.resolution
@property
def resolution(self):
    """Return the pixel size as a two-element list ``[width, height]``."""
    width, height = self.width, self.height
    return [width, height]
2016-10-04 22:00:03 +00:00
def get_value(self, key, default=None):
    """Return the value of ``key`` for this document.

    Model-backed keys are read from the instance. Keys configured with a
    ``value`` of type 'map' are derived by applying the configured regular
    expression (and optional format string) to another key's value.
    'user' yields the owner's username; anything else comes from
    ``self.data`` with ``default`` as fallback.
    """
    model_keys = (
        'extension',
        'id',
        'matches',
        'ratio',
        'size',
        'rightslevel',
    )
    if key in model_keys:
        return getattr(self, key)

    document_key = utils.get_by_id(settings.CONFIG['documentKeys'], key)
    is_map = (
        document_key
        and 'value' in document_key
        and isinstance(document_key['value'], dict)
        and document_key['value'].get('type') == 'map'
    )
    if is_map:
        mapping = document_key['value']
        source = self.get_value(mapping['key'])
        # an empty source falls through to the plain lookups below
        if source:
            found = re.compile(mapping['map']).findall(source)
            if found and mapping.get('format'):
                found = [mapping['format'].format(found[0])]
            return found[0] if found else default

    if key == 'user':
        return self.user.username
    return self.data.get(key, default)
2013-05-27 11:21:08 +00:00
def json(self, keys=None, user=None, item=None):
    """Return a dict describing this document.

    ``keys`` limits the result; without it a default set plus every
    configured document key is used. ``user`` is needed for 'editable'.
    With ``item`` (instance or public id) the per-item description and
    index are merged in before the result is filtered by ``keys``.
    """
    if not keys:
        keys = [
            'description',
            'dimensions',
            'editable',
            'entities',
            'extension',
            'id',
            'oshash',
            'title',
            'ratio',
            'matches',
            'size',
            'user',
            'referenced',
        ]
        if self.extension in ('html', 'txt'):
            keys.append('text')
        for document_key in settings.CONFIG['documentKeys']:
            key_id = document_key['id']
            if key_id in ('*', ):
                continue
            if key_id not in keys:
                keys.append(key_id)

    response = {}
    for key in keys:
        if key == 'id':
            response[key] = self.get_id()
        elif key == 'editable':
            response[key] = self.editable(user)
        elif key == 'user':
            response[key] = self.user.username
        elif key == 'accessed':
            response[key] = self.accessed.aggregate(Max('access'))['access__max']
        elif key == 'timesaccessed':
            response[key] = self.accessed.aggregate(Sum('accessed'))['accessed__sum']
        elif key == 'entities':
            entity_jsons = []
            links = self.documentproperties.select_related('entity').order_by('index')
            for link in links:
                entity_json = link.entity.json(['id', 'type', 'name'])
                entity_json['data'] = link.data
                entity_jsons.append(entity_json)
            response[key] = entity_jsons
        elif key == 'referenced':
            response[key] = self.referenced()
        elif key in self.data:
            response[key] = self.data[key]
        elif hasattr(self, key):
            # falsy attribute values are reported as ''
            response[key] = getattr(self, key) or ''

    # html documents always carry their text up to the final key filter
    if self.extension == 'html':
        response['text'] = self.data.get('text', '')

    if item:
        if isinstance(item, string_types):
            item = Item.objects.get(public_id=item)
        d = self.descriptions.filter(item=item)
        if d.exists():
            link = d[0]
            if 'description' in keys and link.description:
                response['description'] = link.description
            response['index'] = link.index

    # -1 marks an unknown ratio; report the configured poster ratio instead
    if response.get('ratio') == -1:
        response['ratio'] = settings.CONFIG['posters']['ratio']

    if keys:
        for name in [k for k in response if k not in keys]:
            del response[name]
    return response
def path(self, name=''):
    """Return the relative storage path of ``name`` for this document.

    The encoded id is left-padded with '0' to at least 7 characters and
    split into nested directories below 'documents'.
    """
    h = ox.toAZ(self.id).rjust(7, '0')
    parts = ('documents', h[:2], h[2:4], h[4:6], h[6:], name)
    return os.path.join(*parts)
2013-03-24 12:28:57 +00:00
def save_chunk(self, chunk, offset=None, done=False):
    """Store one uploaded chunk of the document file.

    Returns ``(False, 0)`` unless the document is in uploading state;
    otherwise returns whatever the chunk helper returns. When ``done`` is
    set, the completion callback finalizes the document: it clears the
    uploading flag, refreshes info, ratio and oshash, saves, drops cached
    derivatives and updates the fulltext.
    """
    if not self.uploading:
        return False, 0

    target = self.path('data.%s' % self.extension)

    def done_cb():
        if done:
            self.uploading = False
            self.get_info()
            self.get_ratio()
            self.oshash = ox.oshash(self.file.path)
            self.save()
            self.delete_cache()
            self.update_fulltext()
        return True, self.file.size

    return save_chunk(self, self.file, chunk, offset, target, done_cb)
2013-03-24 12:28:57 +00:00
2014-02-02 06:30:58 +00:00
def thumbnail(self, size=None, page=None):
    """Return the filesystem path of a (possibly scaled/cropped) preview.

    ``size`` is the target size handed to resize_image (None for the
    unscaled source). For pdf files ``page`` is a page number, optionally
    followed by four comma-separated crop values ("page,a,b,c,d"); for
    jpg/png/gif files ``page`` holds only the four crop values. Derived
    images are cached next to the document file and created on demand.
    Without a file the static placeholder image path is returned.
    """
    if not self.file:
        return os.path.join(settings.STATIC_ROOT, 'png/document.png')
    src = self.file.path
    folder = os.path.dirname(src)
    if size:
        size = int(size)
        path = os.path.join(folder, '%d.jpg' % size)
    else:
        path = src
    if self.extension == 'pdf':
        crop = []
        if page:
            if ',' in page:
                crop = list(map(int, page.split(',')))
                page = crop[0]
                crop = crop[1:]
            else:
                page = int(page)
        if page and page > 1 and page <= self.pages:
            src = os.path.join(folder, '1024p%d.jpg' % page)
        else:
            # missing or out-of-range page: fall back to the first page
            src = os.path.join(folder, '1024p1.jpg')
            page = 1
        if not os.path.exists(src):
            self.extract_page(page)
        if size:
            path = os.path.join(folder, '%dp%d.jpg' % (size, page))
        if len(crop) == 4:
            path = os.path.join(folder, '%dp%d,%s.jpg' % (1024, page, ','.join(map(str, crop))))
            if not os.path.exists(path):
                img = Image.open(src).crop(crop)
                img.save(path)
            else:
                img = Image.open(path)
            src = path
            # size may be None (unscaled crop requested); comparing None
            # with an int would raise, so only downscale for a real size
            if size and size < max(img.size):
                path = os.path.join(folder, '%dp%d,%s.jpg' % (size, page, ','.join(map(str, crop))))
                if not os.path.exists(path):
                    resize_image(src, path, size=size)
    elif self.extension in ('jpg', 'png', 'gif'):
        if os.path.exists(src):
            if size and page:
                crop = list(map(int, page.split(',')))
                if len(crop) == 4:
                    path = os.path.join(folder, '%s.jpg' % ','.join(map(str, crop)))
                    if not os.path.exists(path):
                        img = Image.open(src).crop(crop)
                        img.save(path)
                    else:
                        img = Image.open(path)
                    src = path
                    if size < max(img.size):
                        path = os.path.join(folder, '%sp%s.jpg' % (size, ','.join(map(str, crop))))
                        if not os.path.exists(path):
                            resize_image(src, path, size=size)
    if os.path.exists(src) and not os.path.exists(path):
        image_size = max(self.width, self.height)
        if image_size == -1:
            # dimensions not stored: read them from the source image
            image_size = max(*Image.open(src).size)
        if size > image_size:
            # never upscale; serve the source instead
            path = src
        else:
            resize_image(src, path, size=size)
    return path
2013-03-24 12:28:57 +00:00
2014-02-02 06:30:58 +00:00
def extract_page(self, page):
    """Render pdf page ``page`` to '1024p<page>.jpg' next to the pdf file."""
    pdf = self.file.path
    folder = os.path.dirname(pdf)
    target = os.path.join(folder, '1024p%d.jpg' % page)
    utils.extract_pdfpage(pdf, target, page)
2013-03-24 12:28:57 +00:00
def get_info(self):
    """Fill in missing page count (pdf) or pixel size (other files).

    For pdf files the 1024 preview is requested as well. Values that are
    already known (not -1) are left untouched.
    """
    if self.extension == 'pdf':
        self.thumbnail(1024)
        if self.pages != -1:
            return
        self.width = -1
        self.height = -1
        self.pages = utils.pdfpages(self.file.path)
        return
    if self.width == -1:
        self.pages = -1
        self.width, self.height = Image.open(self.file.path).size
2013-03-24 12:28:57 +00:00
def get_ratio(self):
    """Compute, store and return the width/height ratio.

    For pdf files the ratio of the 1024 preview image is used, falling
    back to 1 when that image cannot be read. For other files the stored
    resolution is used; when it is unknown or degenerate the ratio is -1,
    the same "unknown" sentinel as the field default.
    """
    if self.extension == 'pdf':
        image = self.thumbnail(1024)
        try:
            size = Image.open(image).size
        except Exception:
            # best effort: an unreadable preview must not break the
            # upload, but do not swallow KeyboardInterrupt/SystemExit
            size = [1, 1]
    else:
        # require both sides to be positive: a zero height would raise
        # ZeroDivisionError and a -1 height would give a negative ratio
        if self.width > 0 and self.height > 0:
            size = self.resolution
        else:
            size = [-1, 1]
    self.ratio = size[0] / size[1]
    return self.ratio
2013-03-24 12:28:57 +00:00
2017-01-25 20:45:54 +00:00
def urls(self):
    """Return the document URL, plus its unquoted form when that differs."""
    quoted = self.get_absolute_url()
    plain = unquote(quoted)
    if plain == quoted:
        return [quoted]
    return [quoted, plain]
def referenced(self):
    """Return the objects referencing this document, as lists of JSON dicts.

    The result has the keys 'items', 'annotations', 'documents' (documents
    linking here) and 'entities'.
    """
    items = self.items.all().order_by('sort__title')
    annotations = self.annotations.all().order_by('start', 'end')
    documents = self.linking_documents.all().order_by('sort__title')
    entities = self.entities.all()
    return {
        'items': [i.json(keys=['id', 'title']) for i in items],
        'annotations': [a.json(keys=['id', 'title', 'in']) for a in annotations],
        'documents': [d.json(keys=['id', 'title']) for d in documents],
        'entities': [e.json(keys=['id', 'name']) for e in entities],
    }
2017-01-26 15:56:28 +00:00
def update_linked_documents(self):
    """Sync ``linked_documents`` with the documents found in the html text.

    Only html documents are processed; links no longer found in the text
    are removed and newly found ones are added.
    """
    if self.extension != 'html':
        return
    old = set(d.id for d in self.linked_documents.all())
    current = set(utils.get_documents(self.data.get('text', '')))
    removed = list(old - current)
    added = list(current - old)
    if removed:
        for stale in Document.objects.filter(id__in=removed):
            self.linked_documents.remove(stale)
    if added:
        for fresh in Document.objects.filter(id__in=added):
            self.linked_documents.add(fresh)
2017-01-25 20:45:54 +00:00
def update_matches(self):
    """Recount how often this document is referenced and store the total.

    Counts linked items and entities plus, for each of the document's
    URLs, the annotations, items and html documents whose value/data
    contains it. The row is written (via a queryset update, bypassing
    save()) only when the count changed.
    """
    total = self.items.count() + self.entities.count()
    for url in self.urls():
        total += Annotation.objects.filter(value__contains=url).count()
        total += Item.objects.filter(data__contains=url).count()
        total += Document.objects.filter(extension='html', data__contains=url).count()
    if total == self.matches:
        return
    Document.objects.filter(id=self.id).update(matches=total)
    self.matches = total
2013-03-24 12:28:57 +00:00
2016-09-23 11:38:00 +00:00
def delete_cache(self):
    """Remove every file next to the document file except the file itself."""
    if not self.file:
        return
    original = self.file.path
    folder = os.path.dirname(original)
    for candidate in glob('%s/*' % folder):
        if candidate == original:
            continue
        os.unlink(candidate)
def delete_document(sender, **kwargs):
    """Signal handler: clean up files and fulltext of a document being deleted.

    Expects the instance under ``kwargs['instance']``.
    """
    document = kwargs['instance']
    if document.file:
        # cached derivatives first, then the file itself
        document.delete_cache()
        document.file.delete(save=False)
    document.delete_fulltext()
# Run delete_document() before any Document row is deleted.
pre_delete.connect(delete_document, sender=Document)
2013-03-24 12:28:57 +00:00
2013-05-27 11:21:08 +00:00
class ItemProperties(models.Model):
    """Link between an item and a document with a per-item description."""

    class Meta:
        unique_together = ("item", "document")

    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)

    item = models.ForeignKey(Item)
    document = models.ForeignKey(Document, related_name='descriptions')
    description = models.TextField(default="")
    index = models.IntegerField(default=0)

    description_sort = models.CharField(max_length=512, null=True)

    def save(self, *args, **kwargs):
        """Derive ``description_sort`` before saving.

        Without an own description the value is taken from the document's
        sort row.
        """
        if not self.description:
            self.description_sort = self.document.sort.description
        else:
            sortable = ox.sort_string(self.description)
            self.description_sort = sortable[:512].lower()
        super(ItemProperties, self).save(*args, **kwargs)
2016-10-04 22:00:03 +00:00
2017-03-03 07:56:35 +00:00
@python_2_unicode_compatible
class Access(models.Model):
    """Per-user access counter for a document.

    ``user`` is NULL for the row that collects anonymous accesses.
    """

    class Meta:
        unique_together = ("document", "user")

    # time of the most recent access, refreshed on every save
    access = models.DateTimeField(auto_now=True)
    document = models.ForeignKey(Document, related_name='accessed')
    user = models.ForeignKey(User, null=True, related_name='accessed_documents')
    # number of accesses by this user
    accessed = models.IntegerField(default=0)

    def save(self, *args, **kwargs):
        """Count one more access and mirror the totals onto the Sort row."""
        if not self.accessed:
            self.accessed = 0
        self.accessed += 1
        super(Access, self).save(*args, **kwargs)
        timesaccessed = Access.objects.filter(document=self.document).aggregate(Sum('accessed'))['accessed__sum']
        Sort.objects.filter(document=self.document).update(timesaccessed=timesaccessed, accessed=self.access)

    def __str__(self):
        if self.user:
            return u"%s/%s/%s" % (self.user, self.document, self.access)
        # Access has no ``item`` attribute; the anonymous form names the
        # document instead of raising AttributeError
        return u"%s/%s" % (self.document, self.access)
2017-03-03 07:56:35 +00:00
@python_2_unicode_compatible
class Facet(models.Model):
    '''
        used for keys that can have multiple values like people, languages etc.
        does not perform too well if total number of items goes above 10k
        this happens for keywords in 0xdb right now
    '''

    class Meta:
        unique_together = ("document", "key", "value")

    document = models.ForeignKey('Document', related_name='facets')
    key = models.CharField(max_length=200, db_index=True)
    value = models.CharField(max_length=1000, db_index=True)
    sortvalue = models.CharField(max_length=1000, db_index=True)

    def __str__(self):
        return u"%s=%s" % (self.key, self.value)

    def save(self, *args, **kwargs):
        """Make sure a lower-cased ``sortvalue`` is stored with the facet."""
        if not self.sortvalue:
            self.sortvalue = utils.sort_string(self.value).lower()[:900]
        # previously assigned to a misspelled attribute ('sotvalue'), so
        # the lower-cased value was never stored
        self.sortvalue = self.sortvalue.lower()
        super(Facet, self).save(*args, **kwargs)
# Document keys that get Facet rows: filter keys, and autocomplete keys
# without a dedicated autocomplete sort key.
Document.facet_keys = [
    key['id']
    for key in settings.CONFIG['documentKeys']
    if ('autocomplete' in key and 'autocompleteSortKey' not in key) or key.get('filter')
]

# Keys whose facet sort value is derived via get_name_sort().
# NOTE(review): this reads CONFIG['itemKeys'], not 'documentKeys' - confirm
# that is intended.
Document.person_keys = [
    key['id']
    for key in settings.CONFIG['itemKeys']
    if key.get('sortType') == 'person'
]
2017-03-03 07:56:35 +00:00
@python_2_unicode_compatible
class Find(models.Model):
    """One searchable key/value pair of a document (see Document.update_find)."""

    class Meta:
        unique_together = ('document', 'key')

    document = models.ForeignKey('Document', related_name='find', db_index=True)
    key = models.CharField(max_length=200, db_index=True)
    value = models.TextField(blank=True, db_index=settings.DB_GIN_TRGM)

    def __str__(self):
        pair = (self.key, self.value)
        return u'%s=%s' % pair
'''
Sort
table constructed based on info in settings.CONFIG['documentKeys']
'''
attrs = {
    '__module__': 'document.models',
    'document': models.OneToOneField('Document', related_name='sort', primary_key=True),
    'created': models.DateTimeField(null=True, blank=True, db_index=True),
}

# One column per document key that is sortable or has a natively sortable
# type; a key never overrides a column that is already defined above.
_sort_column_types = ('integer', 'time', 'float', 'date', 'enum')
_sort_keys = [
    k for k in settings.CONFIG['documentKeys']
    if k.get('sort', False) or k['type'] in _sort_column_types
]
for key in _sort_keys:
    name = key['id']
    sort_type = key.get('sortType', key['type'])
    if isinstance(sort_type, list):
        sort_type = sort_type[0]
    field = get_sort_field(sort_type)
    if name in attrs:
        continue
    field_class, field_options = field[0], field[1]
    attrs[name] = field_class(**field_options)

Sort = type('Sort', (models.Model,), attrs)
Sort.fields = [f.name for f in Sort._meta.fields]