# -*- coding: utf-8 -*-
from __future__ import division, print_function, absolute_import

import os
import re
from glob import glob
import unicodedata

from six import PY2, string_types
from six.moves.urllib.parse import quote, unquote
from django.db import models, transaction
from django.db.models import Q, Sum, Max
from django.contrib.auth import get_user_model
from django.db.models.signals import pre_delete
from django.conf import settings
from django.utils import datetime_safe
from django.utils.encoding import python_2_unicode_compatible
from oxdjango.fields import JSONField

from PIL import Image
import ox

from oxdjango.sortmodel import get_sort_field
from person.models import get_name_sort
from item.models import Item
from annotation.models import Annotation
from archive.extract import resize_image
from archive.chunk import save_chunk
from user.models import Group

from . import managers
from . import utils
from .fulltext import FulltextMixin

User = get_user_model()

if not PY2:
    unicode = str


def get_path(f, x):
    return f.path(x)


@python_2_unicode_compatible
class Document(models.Model, FulltextMixin):

    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)

    user = models.ForeignKey(User, related_name='documents')
    groups = models.ManyToManyField(Group, blank=True, related_name='documents')

    extension = models.CharField(max_length=255)
    size = models.IntegerField(default=0)
    matches = models.IntegerField(default=0)
    ratio = models.FloatField(default=-1)
    pages = models.IntegerField(default=-1)
    width = models.IntegerField(default=-1)
    height = models.IntegerField(default=-1)

    oshash = models.CharField(max_length=16, unique=True, null=True)

    file = models.FileField(default=None, blank=True, null=True, upload_to=get_path)

    objects = managers.DocumentManager()
    uploading = models.BooleanField(default=False)

    items = models.ManyToManyField(Item, through='ItemProperties', related_name='documents')
    annotations = models.ManyToManyField(Annotation, related_name='documents')
    linked_documents = models.ManyToManyField('Document', related_name='linking_documents')

    rightslevel = models.IntegerField(db_index=True, default=0)
    data = JSONField(default=dict, editable=False)

    def update_access(self, user):
        if not user.is_authenticated():
            user = None
        access, created = Access.objects.get_or_create(document=self, user=user)
        if not created:
            access.save()

    def update_facet(self, key):
        current_values = self.get_value(key, [])
        if key == 'name':
            current_values = []
            for k in settings.CONFIG['documentKeys']:
                if k.get('sortType') == 'person':
                    current_values += self.get_value(k['id'], [])
        if not isinstance(current_values, list):
            if not current_values:
                current_values = []
            else:
                current_values = [unicode(current_values)]

        filter_map = utils.get_by_id(settings.CONFIG['documentKeys'], key).get('filterMap')
        if filter_map:
            filter_map = re.compile(filter_map)
            _current_values = []
            for value in current_values:
                value = filter_map.findall(value)
                if value:
                    _current_values.append(value[0])
            current_values = _current_values

        current_values = list(set(current_values))
        current_values = [ox.decode_html(ox.strip_tags(v)) for v in current_values]
        current_values = [unicodedata.normalize('NFKD', v) for v in current_values]
        self.update_facet_values(key, current_values)

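    # Case-insensitively sync the Facet rows for one key: values that are no
    # longer present are removed, missing ones are created with a sort value
    # (name-sorted for person keys), and rows that only differ in casing are
    # normalized to the current spelling across all documents.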
    def update_facet_values(self, key, current_values):
        current_sortvalues = set([value.lower() for value in current_values])
        saved_values = [i.value.lower() for i in Facet.objects.filter(document=self, key=key)]
        removed_values = list(filter(lambda i: i not in current_sortvalues, saved_values))

        if removed_values:
            q = Q()
            for v in removed_values:
                q |= Q(value__iexact=v)
            Facet.objects.filter(document=self, key=key).filter(q).delete()

        for value in current_values:
            if value.lower() not in saved_values:
                sortvalue = value
                if key in self.person_keys + ['name']:
                    sortvalue = get_name_sort(value)
                sortvalue = utils.sort_string(sortvalue).lower()[:900]
                f, created = Facet.objects.get_or_create(document=self, key=key, value=value, sortvalue=sortvalue)
                if created:
                    Facet.objects.filter(document=self, key=key, value__iexact=value).exclude(value=value).delete()
                    Facet.objects.filter(key=key, value__iexact=value).exclude(value=value).update(value=value)
                saved_values.append(value.lower())

    def update_facets(self):
        for key in set(self.facet_keys + ['title']):
            self.update_facet(key)

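    # Rebuild the per-key Find rows used for text search: every non-facet,
    # non-fulltext key from settings.CONFIG['documentKeys'] is flattened to a
    # lowercased, NFKD-normalized string (lists are joined with newlines).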
    def update_find(self):

        def save(key, value):
            if value not in ('', None):
                f, created = Find.objects.get_or_create(document=self, key=key)
                if isinstance(value, bool):
                    value = value and 'true' or 'false'
                if isinstance(value, string_types):
                    value = ox.decode_html(ox.strip_tags(value.strip()))
                    value = unicodedata.normalize('NFKD', value).lower()
                f.value = value
                f.save()
            else:
                Find.objects.filter(document=self, key=key).delete()

        with transaction.atomic():
            data = self.json()
            for key in settings.CONFIG['documentKeys']:
                i = key['id']
                if i == 'rightslevel':
                    save(i, self.rightslevel)
                if key.get('fulltext'):
                    continue
                elif i not in ('*', 'dimensions') and i not in self.facet_keys:
                    value = data.get(i)
                    if isinstance(value, list):
                        value = u'\n'.join(value)
                    save(i, value)

    base_keys = ('id', 'size', 'dimensions', 'extension', 'matches')

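    # Copy sortable values into the Sort model that is generated at the bottom
    # of this module from settings.CONFIG['documentKeys']. 'dimensions' packs a
    # type prefix (image=0, html=1, pdf=2) and a magnitude (pixels, words or
    # pages) into one sortable string via ox.sort_string.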
    def update_sort(self):
        try:
            s = self.sort
        except Sort.DoesNotExist:
            s = Sort(document=self)

        s.id = self.id
        s.extension = self.extension
        s.size = self.size
        s.matches = self.matches

        if self.extension == 'pdf':
            prefix = 2
            value = self.pages
        else:
            if self.extension == 'html':
                prefix = 1
                value = self.dimensions
            else:
                prefix = 0
                value = self.width * self.height
        if value < 0:
            value = 0
        s.dimensions = ox.sort_string('%d' % prefix) + ox.sort_string('%d' % value)

        def sortNames(values):
            sort_value = u''
            if values:
                sort_value = u'; '.join([get_name_sort(name) for name in values])
            if not sort_value:
                sort_value = u''
            return sort_value.lower()

        def set_value(s, name, value):
            if isinstance(value, string_types):
                value = ox.decode_html(value.lower())
                if not value:
                    value = None
            setattr(s, name, value)

        def get_value(source, key):
            value = self.get_value(source)
            return value

        def get_words(source, key):
            value = get_value(source, key)
            if isinstance(value, list):
                value = '\n'.join(value)
            value = len(value.split(' ')) if value else 0
            return value

        for key in list(filter(lambda k: k.get('sort', False), settings.CONFIG['documentKeys'])):
            name = key['id']
            if name not in self.base_keys:
                source = name
                sort_type = key.get('sortType', key['type'])
                if 'value' in key:
                    if 'key' in key['value']:
                        source = key['value']['key']
                    sort_type = key['value'].get('type', sort_type)
                if isinstance(sort_type, list):
                    sort_type = sort_type[0]
                if sort_type == 'title':
                    value = self.get_value(source, u'Untitled')
                    value = utils.sort_title(value)[:955]
                    set_value(s, name, value)
                elif sort_type == 'person':
                    value = sortNames(self.get_value(source, []))
                    value = utils.sort_string(value)[:955]
                    set_value(s, name, value)
                elif sort_type == 'string':
                    value = self.get_value(source, u'')
                    if isinstance(value, list):
                        value = u','.join(value)
                    if not isinstance(value, str):
                        value = str(value)
                    value = utils.sort_string(value)[:955]
                    set_value(s, name, value)
                elif sort_type == 'words':
                    value = get_words(source, key) if s.duration else None
                    set_value(s, name, value)
                elif sort_type == 'wordsperminute':
                    value = get_words(source, key)
                    value = value / (s.duration / 60) if value and s.duration else None
                    set_value(s, name, value)
                elif sort_type in ('length', 'integer', 'time', 'float'):
                    # can be length of strings or length of arrays, i.e. keywords
                    value = self.get_value(source)
                    if isinstance(value, list):
                        value = len(value)
                    set_value(s, name, value)
                elif sort_type == 'year':
                    value = self.get_value(source)
                    set_value(s, name, value)
                elif sort_type == 'date':
                    value = self.get_value(source)
                    if isinstance(value, string_types):
                        value = datetime_safe.datetime.strptime(value, '%Y-%m-%d')
                    set_value(s, name, value)
        s.save()

    def save(self, *args, **kwargs):
        if not self.uploading:
            if self.file:
                self.size = self.file.size
                self.get_info()
            if self.extension == 'html':
                self.size = len(self.data.get('text', ''))

        is_ready = not self.uploading and (self.file or self.extension == 'html')

        if self.id:
            if is_ready:
                self.update_sort()
                self.update_find()
                self.update_facets()
            new = False
        else:
            new = True
        super(Document, self).save(*args, **kwargs)
        if new:
            if is_ready:
                self.update_sort()
                self.update_find()
                self.update_facets()
            self.update_matches()
            self.update_linked_documents()

    def __str__(self):
        return self.get_id()

    def add(self, item):
        p, created = ItemProperties.objects.get_or_create(item=item, document=self)
        if created:
            p.index = ItemProperties.objects.filter(item=item).aggregate(Max('index'))['index__max'] + 1
            p.save()
        p.document.update_matches()
        item.update_sort()

    def remove(self, item):
        ItemProperties.objects.filter(item=item, document=self).delete()

    @classmethod
    def get(cls, id):
        return cls.objects.get(pk=ox.fromAZ(id))

    def get_absolute_url(self):
        return ('/documents/%s' % quote(self.get_id())).replace('%3A', ':')

    def get_id(self):
        return ox.toAZ(self.id)

    def access(self, user):
        if user.is_anonymous():
            level = 'guest'
        else:
            level = user.profile.get_level()
        editable = self.editable(user)
        if editable:
            return True
        allowed_level = settings.CONFIG['capabilities']['canSeeDocument'][level]
        if self.rightslevel <= allowed_level:
            return True
        return False

    def editable(self, user, item=None):
        if not user or user.is_anonymous():
            return False
        if self.user == user or \
           user.is_staff or \
           user.profile.capability('canEditDocuments') is True or \
           (item and item.editable(user)):
            return True
        return False

    def edit(self, data, user, item=None):
        if item:
            p, created = ItemProperties.objects.get_or_create(item=item, document=self)
            if 'description' in data:
                p.description = ox.sanitize_html(data['description'])
                p.save()
        else:
            for key in data:
                k = list(filter(lambda i: i['id'] == key, settings.CONFIG['documentKeys']))
                ktype = k and k[0].get('type') or ''
                if key == 'text' and self.extension == 'html':
                    self.data['text'] = ox.sanitize_html(data['text'], global_attributes=[
                        'data-name',
                        'data-type',
                        'data-value',
                        'lang'
                    ])
                elif key == 'rightslevel':
                    setattr(self, key, int(data[key]))
                elif ktype == 'text':
                    if data[key]:
                        self.data[key] = ox.sanitize_html(data[key])
                    elif key in self.data:
                        del self.data[key]
                elif ktype == '[text]':
                    self.data[key] = [ox.sanitize_html(t) for t in data[key]]
                elif ktype == '[string]':
                    self.data[key] = [ox.escape_html(t) for t in data[key]]
                elif isinstance(data[key], string_types):
                    self.data[key] = ox.escape_html(data[key])
                elif isinstance(data[key], list):
                    def cleanup(i):
                        if isinstance(i, string_types):
                            i = ox.escape_html(i)
                        return i
                    self.data[key] = [cleanup(i) for i in data[key]]
                elif isinstance(data[key], int) or isinstance(data[key], float):
                    self.data[key] = data[key]
                else:
                    if data[key]:
                        self.data[key] = ox.escape_html(data[key])
                    elif key in self.data:
                        del self.data[key]

    @property
    def dimensions(self):
        if self.extension == 'pdf':
            return self.pages
        elif self.extension == 'html':
            return len(self.data.get('text', '').split(' '))
        else:
            return self.resolution

    @property
    def resolution(self):
        return [self.width, self.height]

    def get_value(self, key, default=None):
        if key in (
            'extension',
            'id',
            'matches',
            'ratio',
            'size',
            'rightslevel',
        ):
            return getattr(self, key)
        document_key = utils.get_by_id(settings.CONFIG['documentKeys'], key)
        if document_key and 'value' in document_key \
                and isinstance(document_key['value'], dict) \
                and document_key['value'].get('type') == 'map' \
                and self.get_value(document_key['value']['key']):
            value = re.compile(document_key['value']['map']).findall(self.get_value(document_key['value']['key']))
            if value and document_key['value'].get('format'):
                value = [document_key['value']['format'].format(value[0])]
            return value[0] if value else default
        elif key == 'user':
            return self.user.username
        else:
            return self.data.get(key, default)

    def json(self, keys=None, user=None, item=None):
        if not keys:
            keys = [
                'description',
                'dimensions',
                'editable',
                'entities',
                'extension',
                'id',
                'oshash',
                'title',
                'ratio',
                'matches',
                'size',
                'user',
                'referenced',
            ]
            if self.extension in ('html', 'txt'):
                keys.append('text')
            for key in settings.CONFIG['documentKeys']:
                if key['id'] in ('*', ):
                    continue
                if key['id'] not in keys:
                    keys.append(key['id'])
        response = {}
        _map = {}
        for key in keys:
            if key == 'id':
                response[key] = self.get_id()
            elif key == 'editable':
                response[key] = self.editable(user)
            elif key == 'user':
                response[key] = self.user.username
            elif key == 'accessed':
                response[key] = self.accessed.aggregate(Max('access'))['access__max']
            elif key == 'timesaccessed':
                response[key] = self.accessed.aggregate(Sum('accessed'))['accessed__sum']
            elif key == 'entities':
                dps = self.documentproperties.select_related('entity').order_by('index')
                response[key] = entity_jsons = []
                for dp in dps:
                    entity_json = dp.entity.json(['id', 'type', 'name'])
                    entity_json['data'] = dp.data
                    entity_jsons.append(entity_json)
            elif key == 'referenced':
                response[key] = self.referenced()
            elif key in self.data:
                response[key] = self.data[key]
            elif hasattr(self, _map.get(key, key)):
                response[key] = getattr(self, _map.get(key, key)) or ''
        if self.extension == 'html':
            response['text'] = self.data.get('text', '')
        if item:
            if isinstance(item, string_types):
                item = Item.objects.get(public_id=item)
            d = self.descriptions.filter(item=item)
            if d.exists():
                if 'description' in keys and d[0].description:
                    response['description'] = d[0].description
                response['index'] = d[0].index
        if response.get('ratio') == -1:
            response['ratio'] = settings.CONFIG['posters']['ratio']
        if keys:
            for key in list(response):
                if key not in keys:
                    del response[key]
        return response

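    # Documents are stored in a sharded tree derived from ox.toAZ(self.id):
    # the id is left-padded with '0' to 7 characters and split into
    # 2/2/2/1-character directories below 'documents/'.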
    def path(self, name=''):
        h = ox.toAZ(self.id)
        h = (7 - len(h)) * '0' + h
        return os.path.join('documents', h[:2], h[2:4], h[4:6], h[6:], name)

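    # Chunked upload: while self.uploading is set, chunks are appended to
    # data.<extension> via archive.chunk.save_chunk(); once the last chunk
    # arrives, done_cb() finalizes the file (oshash, page/size info, ratio),
    # drops cached renditions and refreshes the fulltext index.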
    def save_chunk(self, chunk, offset=None, done=False):
        if self.uploading:
            name = 'data.%s' % self.extension
            name = self.path(name)

            def done_cb():
                if done:
                    self.uploading = False
                    self.get_info()
                    self.get_ratio()
                    self.oshash = ox.oshash(self.file.path)
                    self.save()
                    self.delete_cache()
                    self.update_fulltext()
                return True, self.file.size

            return save_chunk(self, self.file, chunk, offset, name, done_cb)
        return False, 0

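    # Renditions are cached next to the original file: extracted PDF pages as
    # '1024p<page>.jpg', resized copies as '<size>p<page>.jpg', and crops with
    # the crop box appended to the name. For PDFs 'page' may be
    # '<page>[,x0,y0,x1,y1]'; for images it is 'x0,y0,x1,y1'.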
    def thumbnail(self, size=None, page=None):
        if not self.file:
            return os.path.join(settings.STATIC_ROOT, 'png/document.png')
        src = self.file.path
        folder = os.path.dirname(src)
        if size:
            size = int(size)
            path = os.path.join(folder, '%d.jpg' % size)
        else:
            path = src
        if self.extension == 'pdf':
            crop = []
            if page:
                if ',' in page:
                    crop = list(map(int, page.split(',')))
                    page = crop[0]
                    crop = crop[1:]
                else:
                    page = int(page)
            if page and page > 1 and page <= self.pages:
                src = os.path.join(folder, '1024p%d.jpg' % page)
            else:
                src = os.path.join(folder, '1024p1.jpg')
                page = 1
            if not os.path.exists(src):
                self.extract_page(page)
            if size:
                path = os.path.join(folder, '%dp%d.jpg' % (size, page))
            if len(crop) == 4:
                path = os.path.join(folder, '%dp%d,%s.jpg' % (1024, page, ','.join(map(str, crop))))
                if not os.path.exists(path):
                    img = Image.open(src).crop(crop)
                    img.save(path)
                else:
                    img = Image.open(path)
                src = path
                if size < max(img.size):
                    path = os.path.join(folder, '%dp%d,%s.jpg' % (size, page, ','.join(map(str, crop))))
                    if not os.path.exists(path):
                        resize_image(src, path, size=size)
        elif self.extension in ('jpg', 'png', 'gif'):
            if os.path.exists(src):
                if size and page:
                    crop = list(map(int, page.split(',')))
                    if len(crop) == 4:
                        path = os.path.join(folder, '%s.jpg' % ','.join(map(str, crop)))
                        if not os.path.exists(path):
                            img = Image.open(src).crop(crop)
                            img.save(path)
                        else:
                            img = Image.open(path)
                        src = path
                        if size < max(img.size):
                            path = os.path.join(folder, '%sp%s.jpg' % (size, ','.join(map(str, crop))))
                            if not os.path.exists(path):
                                resize_image(src, path, size=size)
                if os.path.exists(src) and not os.path.exists(path):
                    image_size = max(self.width, self.height)
                    if image_size == -1:
                        image_size = max(*Image.open(src).size)
                    if size > image_size:
                        path = src
                    else:
                        resize_image(src, path, size=size)
        return path

    def extract_page(self, page):
        pdf = self.file.path
        image = os.path.join(os.path.dirname(pdf), '1024p%d.jpg' % page)
        utils.extract_pdfpage(pdf, image, page)

    def get_info(self):
        if self.extension == 'pdf':
            self.thumbnail(1024)
            if self.pages == -1:
                self.width = -1
                self.height = -1
                self.pages = utils.pdfpages(self.file.path)
        elif self.width == -1:
            self.pages = -1
            self.width, self.height = Image.open(self.file.path).size

    def get_ratio(self):
        if self.extension == 'pdf':
            image = self.thumbnail(1024)
            try:
                size = Image.open(image).size
            except:
                size = [1, 1]
        else:
            if self.width > 0:
                size = self.resolution
            else:
                size = [-1, 1]
        self.ratio = size[0] / size[1]
        return self.ratio

    def urls(self):
        urls = [self.get_absolute_url()]
        url = unquote(urls[0])
        if url != urls[0]:
            urls.append(url)
        return urls

    def referenced(self):
        result = {}
        result['items'] = [
            i.json(keys=['id', 'title'])
            for i in self.items.all().order_by('sort__title')
        ]
        result['annotations'] = [
            a.json(keys=['id', 'title', 'in'])
            for a in self.annotations.all().order_by('start', 'end')
        ]
        result['documents'] = [
            d.json(keys=['id', 'title'])
            for d in self.linking_documents.all().order_by('sort__title')
        ]
        result['entities'] = [
            e.json(keys=['id', 'name'])
            for e in self.entities.all()
        ]
        return result

    def update_linked_documents(self):
        if self.extension == 'html':
            old = [d.id for d in self.linked_documents.all()]
            current = utils.get_documents(self.data.get('text', ''))
            removed = list(set(old) - set(current))
            added = list(set(current) - set(old))
            if removed:
                for document in Document.objects.filter(id__in=removed):
                    self.linked_documents.remove(document)
            if added:
                for document in Document.objects.filter(id__in=added):
                    self.linked_documents.add(document)

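    # A document's match count is the number of linked items and entities plus
    # the number of annotations, items and html documents whose data contains
    # one of this document's URLs.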
    def update_matches(self):
        urls = self.urls()
        matches = self.items.count() + self.entities.count()
        for url in urls:
            matches += Annotation.objects.filter(value__contains=url).count()
            matches += Item.objects.filter(data__contains=url).count()
            matches += Document.objects.filter(extension='html', data__contains=url).count()
        if matches != self.matches:
            Document.objects.filter(id=self.id).update(matches=matches)
            self.matches = matches

    def delete_cache(self):
        if self.file:
            folder = os.path.dirname(self.file.path)
            for f in glob('%s/*' % folder):
                if f != self.file.path:
                    os.unlink(f)


def delete_document(sender, **kwargs):
    t = kwargs['instance']
    if t.file:
        t.delete_cache()
        t.file.delete(save=False)
    t.delete_fulltext()

pre_delete.connect(delete_document, sender=Document)


class ItemProperties(models.Model):

    class Meta:
        unique_together = ("item", "document")

    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)

    item = models.ForeignKey(Item)
    document = models.ForeignKey(Document, related_name='descriptions')
    description = models.TextField(default="")
    index = models.IntegerField(default=0)

    description_sort = models.CharField(max_length=512, null=True)

    def save(self, *args, **kwargs):
        if self.description:
            self.description_sort = ox.sort_string(self.description)[:512].lower()
        else:
            self.description_sort = self.document.sort.description
        super(ItemProperties, self).save(*args, **kwargs)


@python_2_unicode_compatible
class Access(models.Model):
    class Meta:
        unique_together = ("document", "user")

    access = models.DateTimeField(auto_now=True)
    document = models.ForeignKey(Document, related_name='accessed')
    user = models.ForeignKey(User, null=True, related_name='accessed_documents')
    accessed = models.IntegerField(default=0)

    def save(self, *args, **kwargs):
        if not self.accessed:
            self.accessed = 0
        self.accessed += 1
        super(Access, self).save(*args, **kwargs)
        timesaccessed = Access.objects.filter(document=self.document).aggregate(Sum('accessed'))['accessed__sum']
        Sort.objects.filter(document=self.document).update(timesaccessed=timesaccessed, accessed=self.access)

    def __str__(self):
        if self.user:
            return u"%s/%s/%s" % (self.user, self.document, self.access)
        return u"%s/%s" % (self.document, self.access)


@python_2_unicode_compatible
class Facet(models.Model):
    '''
    used for keys that can have multiple values like people, languages etc.
    does not perform too well if the total number of items goes above 10k
    this happens for keywords in 0xdb right now
    '''

    class Meta:
        unique_together = ("document", "key", "value")

    document = models.ForeignKey('Document', related_name='facets')
    key = models.CharField(max_length=200, db_index=True)
    value = models.CharField(max_length=1000, db_index=True)
    sortvalue = models.CharField(max_length=1000, db_index=True)

    def __str__(self):
        return u"%s=%s" % (self.key, self.value)

    def save(self, *args, **kwargs):
        if not self.sortvalue:
            self.sortvalue = utils.sort_string(self.value).lower()[:900]
        self.sortvalue = self.sortvalue.lower()
        super(Facet, self).save(*args, **kwargs)


Document.facet_keys = []
for key in settings.CONFIG['documentKeys']:
    if 'autocomplete' in key and 'autocompleteSortKey' not in key or \
            key.get('filter'):
        Document.facet_keys.append(key['id'])

Document.person_keys = []
for key in settings.CONFIG['itemKeys']:
    if key.get('sortType') == 'person':
        Document.person_keys.append(key['id'])


@python_2_unicode_compatible
class Find(models.Model):

    class Meta:
        unique_together = ('document', 'key')

    document = models.ForeignKey('Document', related_name='find', db_index=True)
    key = models.CharField(max_length=200, db_index=True)
    value = models.TextField(blank=True, db_index=settings.DB_GIN_TRGM)

    def __str__(self):
        return u'%s=%s' % (self.key, self.value)


'''
Sort
table constructed based on info in settings.CONFIG['documentKeys']
'''
attrs = {
    '__module__': 'document.models',
    'document': models.OneToOneField('Document', related_name='sort', primary_key=True),
    'created': models.DateTimeField(null=True, blank=True, db_index=True),
}

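# One column is added per sortable document key; get_sort_field() is expected
# to return a (field_class, kwargs) pair for the key's sort type. As a purely
# illustrative, hypothetical example: a config entry like
# {'id': 'title', 'type': 'string', 'sort': True} would add a 'title' column
# whose field type is whatever get_sort_field('string') returns.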
for key in list(filter(lambda k: k.get('sort', False) or k['type'] in ('integer', 'time', 'float', 'date', 'enum'), settings.CONFIG['documentKeys'])):
    name = key['id']
    sort_type = key.get('sortType', key['type'])
    if isinstance(sort_type, list):
        sort_type = sort_type[0]
    field = get_sort_field(sort_type)
    if name not in attrs:
        attrs[name] = field[0](**field[1])

Sort = type('Sort', (models.Model,), attrs)
Sort.fields = [f.name for f in Sort._meta.fields]