From b23ca9df74fdec0a56d43b2b53e0d107e6a606a0 Mon Sep 17 00:00:00 2001
From: j
Date: Thu, 26 Jan 2017 15:56:28 +0000
Subject: [PATCH] store document references in db

---
 pandora/annotation/models.py                  | 16 +++++
 .../commands/rebuild_documentfind.py          |  8 +++
 .../migrations/0010_auto_20170126_1528.py     | 31 +++++++++
 pandora/document/models.py                    | 68 ++++++++++---------
 pandora/document/utils.py                     | 11 ++-
 5 files changed, 102 insertions(+), 32 deletions(-)
 create mode 100644 pandora/document/migrations/0010_auto_20170126_1528.py

diff --git a/pandora/annotation/models.py b/pandora/annotation/models.py
index dfd9a188..2df53b63 100644
--- a/pandora/annotation/models.py
+++ b/pandora/annotation/models.py
@@ -183,6 +183,7 @@ class Annotation(models.Model):
             # update matches in bulk if called from load_subtitles
             if not delay_matches:
                 self.update_matches()
+                self.update_documents()
 
     def update_matches(self):
         from place.models import Place
@@ -247,6 +248,21 @@ class Annotation(models.Model):
                 for e in a_matches.all():
                     e.update_matches(Annotation.objects.filter(pk=self.id))
 
+    def update_documents(self):
+        """Sync self.documents with the documents linked from self.value."""
+        from document.models import Document
+        from document.utils import get_documents
+        old = [d.id for d in self.documents.all()]
+        current = get_documents(self.value) if self.value else []
+        removed = list(set(old) - set(current))
+        added = list(set(current) - set(old))
+        if removed:
+            for document in Document.objects.filter(id__in=removed):
+                self.documents.remove(document)
+        if added:
+            for document in Document.objects.filter(id__in=added):
+                self.documents.add(document)
+
     def delete(self, *args, **kwargs):
         with transaction.atomic():
             super(Annotation, self).delete(*args, **kwargs)
diff --git a/pandora/document/management/commands/rebuild_documentfind.py b/pandora/document/management/commands/rebuild_documentfind.py
index 05af9d84..d297a566 100644
--- a/pandora/document/management/commands/rebuild_documentfind.py
+++ b/pandora/document/management/commands/rebuild_documentfind.py
@@ -27,3 +27,11 @@ class Command(BaseCommand):
                 i.save()
             except:
                 pass
+        import annotation.models
+        ids = [i['id'] for i in annotation.models.Annotation.objects.all().values('id')]
+        for id in ids:
+            try:
+                a = annotation.models.Annotation.objects.get(id=id)
+                a.update_documents()
+            except:
+                pass
diff --git a/pandora/document/migrations/0010_auto_20170126_1528.py b/pandora/document/migrations/0010_auto_20170126_1528.py
new file mode 100644
index 00000000..fd68be50
--- /dev/null
+++ b/pandora/document/migrations/0010_auto_20170126_1528.py
@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.9.4 on 2017-01-26 15:28
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('annotation', '0003_auto_20160219_1537'),
+        ('document', '0009_add_group'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='document',
+            name='annotations',
+            field=models.ManyToManyField(related_name='documents', to='annotation.Annotation'),
+        ),
+        migrations.AddField(
+            model_name='document',
+            name='linked_documents',
+            field=models.ManyToManyField(related_name='linking_documents', to='document.Document'),
+        ),
+        migrations.AlterField(
+            model_name='document',
+            name='ratio',
+            field=models.FloatField(default=-1),
+        ),
+    ]
diff --git a/pandora/document/models.py b/pandora/document/models.py
index c7ded012..ef2e491e 100644
--- a/pandora/document/models.py
+++ b/pandora/document/models.py
@@ -23,6 +23,7 @@ from oxdjango import fields
 from oxdjango.sortmodel import get_sort_field
 from person.models import get_name_sort
 from item.models import Item
+from annotation.models import Annotation
 
 from archive.extract import resize_image
 from archive.chunk import save_chunk
@@ -59,6 +60,8 @@ class Document(models.Model):
     uploading = models.BooleanField(default=False)
 
     items = models.ManyToManyField(Item, through='ItemProperties', related_name='documents')
+    annotations = models.ManyToManyField(Annotation, related_name='documents')
+    linked_documents = models.ManyToManyField('Document', related_name='linking_documents')
 
     rightslevel = models.IntegerField(db_index=True, default=0)
     data = fields.DictField(default={})
@@ -195,10 +198,7 @@ class Document(models.Model):
             setattr(s, name, value)
 
         def get_value(source, key):
-            if 'value' in key and 'layer' in key['value']:
-                value = [a.value for a in self.annotations.filter(layer=key['value']['layer']).exclude(value='')]
-            else:
-                value = self.get_value(source)
+            value = self.get_value(source)
             return value
 
         def get_words(source, key):
@@ -242,10 +242,7 @@ class Document(models.Model):
                     set_value(s, name, value)
                 elif sort_type in ('length', 'integer', 'time', 'float'):
                     # can be length of strings or length of arrays, i.e. keywords
-                    if 'layer' in key.get('value', []):
-                        value = self.annotations.filter(layer=key['value']['layer']).count()
-                    else:
-                        value = self.get_value(source)
+                    value = self.get_value(source)
                     if isinstance(value, list):
                         value = len(value)
                     set_value(s, name, value)
@@ -284,6 +281,7 @@ class Document(models.Model):
         self.update_find()
         self.update_facets()
         self.update_matches()
+        self.update_linked_documents()
 
     def __unicode__(self):
         return self.get_id()
@@ -580,35 +578,45 @@ class Document(models.Model):
         return urls
 
     def referenced(self):
-        import annotation.models
-        import item.models
         result = {}
-        result['items'] = [i.get_json(keys=['id', 'title']) for i in self.items.all().order_by('sort__title')]
-        urls = self.urls()
-        # annotations
-        q = Q()
-        for url in urls:
-            q |= Q(value__contains=url)
-        qs = annotation.models.Annotation.objects.filter(q)
-        result['annotations'] = [a.json(keys=['id', 'title', 'in']) for a in qs]
-        # documents
-        q = Q()
-        for url in urls:
-            q |= Q(data__contains=url)
-        qs = Document.objects.filter(q)
-        result['documents'] = [d.json(keys=['id', 'title']) for d in qs]
-
-        result['entities'] = [e.json(keys=['id', 'name']) for e in self.entities.all()]
+        result['items'] = [
+            i.get_json(keys=['id', 'title'])
+            for i in self.items.all().order_by('sort__title')
+        ]
+        result['annotations'] = [
+            a.json(keys=['id', 'title', 'in'])
+            for a in self.annotations.all().order_by('start', 'end')
+        ]
+        result['documents'] = [
+            d.json(keys=['id', 'title'])
+            for d in self.linking_documents.all().order_by('sort__title')
+        ]
+        result['entities'] = [
+            e.json(keys=['id', 'name'])
+            for e in self.entities.all()
+        ]
         return result
 
+    def update_linked_documents(self):
+        """Sync linked_documents with document links in the html text."""
+        if self.extension == 'html':
+            old = [d.id for d in self.linked_documents.all()]
+            current = utils.get_documents(self.data.get('text', ''))
+            removed = list(set(old) - set(current))
+            added = list(set(current) - set(old))
+            if removed:
+                for document in Document.objects.filter(id__in=removed):
+                    self.linked_documents.remove(document)
+            if added:
+                for document in Document.objects.filter(id__in=added):
+                    self.linked_documents.add(document)
+
     def update_matches(self):
-        import annotation.models
-        import item.models
         urls = self.urls()
         matches = self.items.count() + self.entities.count()
         for url in urls:
-            matches += annotation.models.Annotation.objects.filter(value__contains=url).count()
-            matches += item.models.Item.objects.filter(data__contains=url).count()
+            matches += Annotation.objects.filter(value__contains=url).count()
+            matches += Item.objects.filter(data__contains=url).count()
             matches += Document.objects.filter(extension='html', data__contains=url).count()
         if matches != self.matches:
             Document.objects.filter(id=self.id).update(matches=matches)
diff --git a/pandora/document/utils.py b/pandora/document/utils.py
index a33afcda..70eb3da4 100644
--- a/pandora/document/utils.py
+++ b/pandora/document/utils.py
@@ -1,9 +1,10 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
-
-
+import re
 import subprocess
 
+import ox
+
 from item.utils import sort_title, sort_string, get_by_id
 
 def pdfpages(pdf):
@@ -28,3 +29,9 @@ def extract_pdfpage(pdf, image, page):
     p = subprocess.Popen(cmd, close_fds=True)
     p.wait()
     return image
+
+def get_documents(text):
+    """Return ids of documents referenced in text via /document(s)/<ID> links."""
+    ids = re.compile('/documents/([A-Z]+)').findall(text)
+    ids += re.compile('/document/([A-Z]+)').findall(text)
+    return [ox.fromAZ(id) for id in set(ids)]