store document references in db

This commit is contained in:
j 2017-01-26 15:56:28 +00:00
parent df60dca85f
commit b23ca9df74
5 changed files with 99 additions and 32 deletions

View file

@ -183,6 +183,7 @@ class Annotation(models.Model):
# update matches in bulk if called from load_subtitles # update matches in bulk if called from load_subtitles
if not delay_matches: if not delay_matches:
self.update_matches() self.update_matches()
self.update_documents()
def update_matches(self): def update_matches(self):
from place.models import Place from place.models import Place
@ -247,6 +248,20 @@ class Annotation(models.Model):
for e in a_matches.all(): for e in a_matches.all():
e.update_matches(Annotation.objects.filter(pk=self.id)) e.update_matches(Annotation.objects.filter(pk=self.id))
def update_documents(self):
    """Synchronise ``self.documents`` with the document links in ``self.value``.

    The ids returned by ``document.utils.get_documents`` for the current
    value are compared with the ids of the documents already related to
    this annotation; stale relations are removed and missing ones added.
    An empty value clears all relations. Ids that do not correspond to an
    existing ``Document`` row are ignored, because only rows found by the
    ``id__in`` queries are attached or detached.
    """
    # Imported locally rather than at module level: document.models itself
    # imports this module, so a top-level import would be circular.
    from document.models import Document
    from document.utils import get_documents

    # The original comprehension read ``d.id for id in ...`` and raised
    # NameError on every call; the loop variable must be the one we read.
    old = {d.id for d in self.documents.all()}
    current = set(get_documents(self.value)) if self.value else set()

    removed = old - current
    added = current - old
    if removed:
        self.documents.remove(*Document.objects.filter(id__in=list(removed)))
    if added:
        self.documents.add(*Document.objects.filter(id__in=list(added)))
def delete(self, *args, **kwargs): def delete(self, *args, **kwargs):
with transaction.atomic(): with transaction.atomic():
super(Annotation, self).delete(*args, **kwargs) super(Annotation, self).delete(*args, **kwargs)

View file

@ -27,3 +27,11 @@ class Command(BaseCommand):
i.save() i.save()
except: except:
pass pass
import annotation.models
ids = [i['id'] for i in annotation.models.Annotation.objects.all().values('id')]
for id in ids:
try:
a = annotation.models.Annotation.objects.get(id=id)
a.update_documents()
except:
pass

View file

@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.9.4 on 2017-01-26 15:28
from __future__ import unicode_literals
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add document reference relations and give ``ratio`` a default.

    Adds two many-to-many fields to the ``document`` model
    (``annotations`` and ``linked_documents``) and alters ``ratio`` to a
    ``FloatField`` defaulting to ``-1``.
    """

    # Must run after the latest annotation and document migrations listed here.
    dependencies = [
        ('annotation', '0003_auto_20160219_1537'),
        ('document', '0009_add_group'),
    ]

    operations = [
        # document.annotations <-> annotation.documents
        migrations.AddField(
            model_name='document',
            name='annotations',
            field=models.ManyToManyField(
                related_name='documents',
                to='annotation.Annotation',
            ),
        ),
        # document.linked_documents <-> document.linking_documents
        migrations.AddField(
            model_name='document',
            name='linked_documents',
            field=models.ManyToManyField(
                related_name='linking_documents',
                to='document.Document',
            ),
        ),
        migrations.AlterField(
            model_name='document',
            name='ratio',
            field=models.FloatField(default=-1),
        ),
    ]

View file

@ -23,6 +23,7 @@ from oxdjango import fields
from oxdjango.sortmodel import get_sort_field from oxdjango.sortmodel import get_sort_field
from person.models import get_name_sort from person.models import get_name_sort
from item.models import Item from item.models import Item
from annotation.models import Annotation
from archive.extract import resize_image from archive.extract import resize_image
from archive.chunk import save_chunk from archive.chunk import save_chunk
@ -59,6 +60,8 @@ class Document(models.Model):
uploading = models.BooleanField(default=False) uploading = models.BooleanField(default=False)
items = models.ManyToManyField(Item, through='ItemProperties', related_name='documents') items = models.ManyToManyField(Item, through='ItemProperties', related_name='documents')
annotations = models.ManyToManyField(Annotation, related_name='documents')
linked_documents = models.ManyToManyField('Document', related_name='linking_documents')
rightslevel = models.IntegerField(db_index=True, default=0) rightslevel = models.IntegerField(db_index=True, default=0)
data = fields.DictField(default={}) data = fields.DictField(default={})
@ -195,10 +198,7 @@ class Document(models.Model):
setattr(s, name, value) setattr(s, name, value)
def get_value(source, key): def get_value(source, key):
if 'value' in key and 'layer' in key['value']: value = self.get_value(source)
value = [a.value for a in self.annotations.filter(layer=key['value']['layer']).exclude(value='')]
else:
value = self.get_value(source)
return value return value
def get_words(source, key): def get_words(source, key):
@ -242,10 +242,7 @@ class Document(models.Model):
set_value(s, name, value) set_value(s, name, value)
elif sort_type in ('length', 'integer', 'time', 'float'): elif sort_type in ('length', 'integer', 'time', 'float'):
# can be length of strings or length of arrays, i.e. keywords # can be length of strings or length of arrays, i.e. keywords
if 'layer' in key.get('value', []): value = self.get_value(source)
value = self.annotations.filter(layer=key['value']['layer']).count()
else:
value = self.get_value(source)
if isinstance(value, list): if isinstance(value, list):
value = len(value) value = len(value)
set_value(s, name, value) set_value(s, name, value)
@ -284,6 +281,7 @@ class Document(models.Model):
self.update_find() self.update_find()
self.update_facets() self.update_facets()
self.update_matches() self.update_matches()
self.update_linked_documents()
def __unicode__(self): def __unicode__(self):
return self.get_id() return self.get_id()
@ -580,35 +578,44 @@ class Document(models.Model):
return urls return urls
def referenced(self): def referenced(self):
import annotation.models
import item.models
result = {} result = {}
result['items'] = [i.get_json(keys=['id', 'title']) for i in self.items.all().order_by('sort__title')] result['items'] = [
urls = self.urls() i.get_json(keys=['id', 'title'])
# annotations for i in self.items.all().order_by('sort__title')
q = Q() ]
for url in urls: result['annotations'] = [
q |= Q(value__contains=url) a.json(keys=['id', 'title', 'in'])
qs = annotation.models.Annotation.objects.filter(q) for a in self.annotations.all().order_by('start', 'end')
result['annotations'] = [a.json(keys=['id', 'title', 'in']) for a in qs] ]
# documents result['documents'] = [
q = Q() d.json(keys=['id', 'title'])
for url in urls: for d in self.linking_documents.all().order_by('sort__title')
q |= Q(data__contains=url) ]
qs = Document.objects.filter(q) result['entities'] = [
result['documents'] = [d.json(keys=['id', 'title']) for d in qs] e.json(keys=['id', 'name'])
for e in self.entities.all()
result['entities'] = [e.json(keys=['id', 'name']) for e in self.entities.all()] ]
return result return result
def update_linked_documents(self):
    """Synchronise ``self.linked_documents`` with links in this document's text.

    Only documents whose ``extension`` is ``'html'`` are processed; for any
    other extension the method returns without touching the relation.
    The ids returned by ``utils.get_documents`` for ``self.data['text']``
    are compared with the ids already in ``linked_documents``; stale
    relations are removed and missing ones added. Ids without a matching
    ``Document`` row are ignored, because only rows found by the ``id__in``
    queries are attached or detached.
    """
    if self.extension != 'html':
        return

    # The original comprehension read ``d.id for id in ...`` and raised
    # NameError on every call; the loop variable must be the one we read.
    old = {d.id for d in self.linked_documents.all()}
    # A missing or empty text is treated as "no links" instead of raising
    # KeyError.
    current = set(utils.get_documents(self.data.get('text') or ''))

    removed = old - current
    added = current - old
    if removed:
        self.linked_documents.remove(
            *Document.objects.filter(id__in=list(removed))
        )
    if added:
        self.linked_documents.add(
            *Document.objects.filter(id__in=list(added))
        )
def update_matches(self): def update_matches(self):
import annotation.models
import item.models
urls = self.urls() urls = self.urls()
matches = self.items.count() + self.entities.count() matches = self.items.count() + self.entities.count()
for url in urls: for url in urls:
matches += annotation.models.Annotation.objects.filter(value__contains=url).count() matches += Annotation.objects.filter(value__contains=url).count()
matches += item.models.Item.objects.filter(data__contains=url).count() matches += Item.objects.filter(data__contains=url).count()
matches += Document.objects.filter(extension='html', data__contains=url).count() matches += Document.objects.filter(extension='html', data__contains=url).count()
if matches != self.matches: if matches != self.matches:
Document.objects.filter(id=self.id).update(matches=matches) Document.objects.filter(id=self.id).update(matches=matches)

View file

@ -1,9 +1,10 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import re
import subprocess import subprocess
import ox
from item.utils import sort_title, sort_string, get_by_id from item.utils import sort_title, sort_string, get_by_id
def pdfpages(pdf): def pdfpages(pdf):
@ -28,3 +29,8 @@ def extract_pdfpage(pdf, image, page):
p = subprocess.Popen(cmd, close_fds=True) p = subprocess.Popen(cmd, close_fds=True)
p.wait() p.wait()
return image return image
def get_documents(text):
    """Return the ids of all documents linked from *text*.

    A link is any occurrence of ``/document/<ID>`` or ``/documents/<ID>``
    where ``<ID>`` is a run of upper-case ASCII letters. Each distinct
    ``<ID>`` is decoded with ``ox.fromAZ``; the decoded ids are returned
    once each, in ascending order so the result is deterministic.

    Returns an empty list when *text* is empty or ``None`` or contains no
    links.
    """
    if not text:
        return []
    # ``documents?`` covers both the singular and the plural URL prefix.
    az_ids = set(re.findall(r'/documents?/([A-Z]+)', text))
    return sorted(ox.fromAZ(az_id) for az_id in az_ids)