forked from 0x2620/pandora
store document references in db
This commit is contained in:
parent
df60dca85f
commit
b23ca9df74
5 changed files with 99 additions and 32 deletions
|
@ -183,6 +183,7 @@ class Annotation(models.Model):
|
||||||
# update matches in bulk if called from load_subtitles
|
# update matches in bulk if called from load_subtitles
|
||||||
if not delay_matches:
|
if not delay_matches:
|
||||||
self.update_matches()
|
self.update_matches()
|
||||||
|
self.update_documents()
|
||||||
|
|
||||||
def update_matches(self):
|
def update_matches(self):
|
||||||
from place.models import Place
|
from place.models import Place
|
||||||
|
@ -247,6 +248,20 @@ class Annotation(models.Model):
|
||||||
for e in a_matches.all():
|
for e in a_matches.all():
|
||||||
e.update_matches(Annotation.objects.filter(pk=self.id))
|
e.update_matches(Annotation.objects.filter(pk=self.id))
|
||||||
|
|
||||||
|
def update_documents(self):
    """Synchronise ``self.documents`` with the documents linked in ``self.value``.

    The ids of all documents referenced from the annotation value are
    extracted with ``document.utils.get_documents``. Documents that are no
    longer referenced are removed from the relation, newly referenced ones
    are added. An empty value drops every existing reference.
    """
    # Imported locally because document.models imports Annotation at module
    # level; a top-level import here would be circular.
    from document.models import Document
    from document.utils import get_documents

    # The loop variable used to be misnamed (``d.id for id in ...``), which
    # raised NameError on every call.
    old = set(d.id for d in self.documents.all())
    current = set(get_documents(self.value)) if self.value else set()
    removed = list(old - current)
    added = list(current - old)
    if removed:
        self.documents.remove(*Document.objects.filter(id__in=removed))
    if added:
        # Going through the Document queryset silently skips ids that do
        # not (or no longer) exist instead of failing on a dangling link.
        self.documents.add(*Document.objects.filter(id__in=added))
|
||||||
|
|
||||||
def delete(self, *args, **kwargs):
|
def delete(self, *args, **kwargs):
|
||||||
with transaction.atomic():
|
with transaction.atomic():
|
||||||
super(Annotation, self).delete(*args, **kwargs)
|
super(Annotation, self).delete(*args, **kwargs)
|
||||||
|
|
|
@ -27,3 +27,11 @@ class Command(BaseCommand):
|
||||||
i.save()
|
i.save()
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
import annotation.models
|
||||||
|
ids = [i['id'] for i in annotation.models.Annotation.objects.all().values('id')]
|
||||||
|
for id in ids:
|
||||||
|
try:
|
||||||
|
a = annotation.models.Annotation.objects.get(id=id)
|
||||||
|
a.update_documents()
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
31
pandora/document/migrations/0010_auto_20170126_1528.py
Normal file
31
pandora/document/migrations/0010_auto_20170126_1528.py
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Generated by Django 1.9.4 on 2017-01-26 15:28
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """Add stored document references to the ``document`` model.

    Adds two many-to-many relations (to annotations and to other documents)
    and changes ``ratio`` to a float field defaulting to -1.
    """

    # The annotation app must be migrated first, since the new
    # ``annotations`` relation points at ``annotation.Annotation``.
    dependencies = [
        ('annotation', '0003_auto_20160219_1537'),
        ('document', '0009_add_group'),
    ]

    operations = [
        # Annotations whose value links to this document.
        migrations.AddField(
            model_name='document',
            name='annotations',
            field=models.ManyToManyField(
                related_name='documents',
                to='annotation.Annotation',
            ),
        ),
        # Documents linked from this document; the reverse accessor is
        # ``linking_documents``.
        migrations.AddField(
            model_name='document',
            name='linked_documents',
            field=models.ManyToManyField(
                related_name='linking_documents',
                to='document.Document',
            ),
        ),
        migrations.AlterField(
            model_name='document',
            name='ratio',
            field=models.FloatField(default=-1),
        ),
    ]
|
|
@ -23,6 +23,7 @@ from oxdjango import fields
|
||||||
from oxdjango.sortmodel import get_sort_field
|
from oxdjango.sortmodel import get_sort_field
|
||||||
from person.models import get_name_sort
|
from person.models import get_name_sort
|
||||||
from item.models import Item
|
from item.models import Item
|
||||||
|
from annotation.models import Annotation
|
||||||
from archive.extract import resize_image
|
from archive.extract import resize_image
|
||||||
from archive.chunk import save_chunk
|
from archive.chunk import save_chunk
|
||||||
|
|
||||||
|
@ -59,6 +60,8 @@ class Document(models.Model):
|
||||||
uploading = models.BooleanField(default=False)
|
uploading = models.BooleanField(default=False)
|
||||||
|
|
||||||
items = models.ManyToManyField(Item, through='ItemProperties', related_name='documents')
|
items = models.ManyToManyField(Item, through='ItemProperties', related_name='documents')
|
||||||
|
annotations = models.ManyToManyField(Annotation, related_name='documents')
|
||||||
|
linked_documents = models.ManyToManyField('Document', related_name='linking_documents')
|
||||||
|
|
||||||
rightslevel = models.IntegerField(db_index=True, default=0)
|
rightslevel = models.IntegerField(db_index=True, default=0)
|
||||||
data = fields.DictField(default={})
|
data = fields.DictField(default={})
|
||||||
|
@ -195,10 +198,7 @@ class Document(models.Model):
|
||||||
setattr(s, name, value)
|
setattr(s, name, value)
|
||||||
|
|
||||||
def get_value(source, key):
|
def get_value(source, key):
|
||||||
if 'value' in key and 'layer' in key['value']:
|
value = self.get_value(source)
|
||||||
value = [a.value for a in self.annotations.filter(layer=key['value']['layer']).exclude(value='')]
|
|
||||||
else:
|
|
||||||
value = self.get_value(source)
|
|
||||||
return value
|
return value
|
||||||
|
|
||||||
def get_words(source, key):
|
def get_words(source, key):
|
||||||
|
@ -242,10 +242,7 @@ class Document(models.Model):
|
||||||
set_value(s, name, value)
|
set_value(s, name, value)
|
||||||
elif sort_type in ('length', 'integer', 'time', 'float'):
|
elif sort_type in ('length', 'integer', 'time', 'float'):
|
||||||
# can be length of strings or length of arrays, i.e. keywords
|
# can be length of strings or length of arrays, i.e. keywords
|
||||||
if 'layer' in key.get('value', []):
|
value = self.get_value(source)
|
||||||
value = self.annotations.filter(layer=key['value']['layer']).count()
|
|
||||||
else:
|
|
||||||
value = self.get_value(source)
|
|
||||||
if isinstance(value, list):
|
if isinstance(value, list):
|
||||||
value = len(value)
|
value = len(value)
|
||||||
set_value(s, name, value)
|
set_value(s, name, value)
|
||||||
|
@ -284,6 +281,7 @@ class Document(models.Model):
|
||||||
self.update_find()
|
self.update_find()
|
||||||
self.update_facets()
|
self.update_facets()
|
||||||
self.update_matches()
|
self.update_matches()
|
||||||
|
self.update_linked_documents()
|
||||||
|
|
||||||
def __unicode__(self):
|
def __unicode__(self):
|
||||||
return self.get_id()
|
return self.get_id()
|
||||||
|
@ -580,35 +578,44 @@ class Document(models.Model):
|
||||||
return urls
|
return urls
|
||||||
|
|
||||||
def referenced(self):
|
def referenced(self):
|
||||||
import annotation.models
|
|
||||||
import item.models
|
|
||||||
result = {}
|
result = {}
|
||||||
result['items'] = [i.get_json(keys=['id', 'title']) for i in self.items.all().order_by('sort__title')]
|
result['items'] = [
|
||||||
urls = self.urls()
|
i.get_json(keys=['id', 'title'])
|
||||||
# annotations
|
for i in self.items.all().order_by('sort__title')
|
||||||
q = Q()
|
]
|
||||||
for url in urls:
|
result['annotations'] = [
|
||||||
q |= Q(value__contains=url)
|
a.json(keys=['id', 'title', 'in'])
|
||||||
qs = annotation.models.Annotation.objects.filter(q)
|
for a in self.annotations.all().order_by('start', 'end')
|
||||||
result['annotations'] = [a.json(keys=['id', 'title', 'in']) for a in qs]
|
]
|
||||||
# documents
|
result['documents'] = [
|
||||||
q = Q()
|
d.json(keys=['id', 'title'])
|
||||||
for url in urls:
|
for d in self.linking_documents.all().order_by('sort__title')
|
||||||
q |= Q(data__contains=url)
|
]
|
||||||
qs = Document.objects.filter(q)
|
result['entities'] = [
|
||||||
result['documents'] = [d.json(keys=['id', 'title']) for d in qs]
|
e.json(keys=['id', 'name'])
|
||||||
|
for e in self.entities.all()
|
||||||
result['entities'] = [e.json(keys=['id', 'name']) for e in self.entities.all()]
|
]
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def update_linked_documents(self):
    """Synchronise ``self.linked_documents`` with links found in the HTML text.

    Only documents with the ``html`` extension are processed; for any other
    extension this is a no-op. The ids of documents referenced from
    ``self.data['text']`` are extracted with ``utils.get_documents``; stale
    links are removed from the relation and new ones are added.
    """
    if self.extension != 'html':
        return
    # The loop variable used to be misnamed (``d.id for id in ...``), which
    # raised NameError for every html document.
    old = set(d.id for d in self.linked_documents.all())
    # An html document without text simply links to nothing; do not fail
    # with KeyError (or pass None to the regex) in that case.
    current = set(utils.get_documents(self.data.get('text') or ''))
    removed = list(old - current)
    added = list(current - old)
    if removed:
        self.linked_documents.remove(*Document.objects.filter(id__in=removed))
    if added:
        # Going through the Document queryset silently skips ids that do
        # not (or no longer) exist instead of failing on a dangling link.
        self.linked_documents.add(*Document.objects.filter(id__in=added))
|
||||||
|
|
||||||
def update_matches(self):
|
def update_matches(self):
|
||||||
import annotation.models
|
|
||||||
import item.models
|
|
||||||
urls = self.urls()
|
urls = self.urls()
|
||||||
matches = self.items.count() + self.entities.count()
|
matches = self.items.count() + self.entities.count()
|
||||||
for url in urls:
|
for url in urls:
|
||||||
matches += annotation.models.Annotation.objects.filter(value__contains=url).count()
|
matches += Annotation.objects.filter(value__contains=url).count()
|
||||||
matches += item.models.Item.objects.filter(data__contains=url).count()
|
matches += Item.objects.filter(data__contains=url).count()
|
||||||
matches += Document.objects.filter(extension='html', data__contains=url).count()
|
matches += Document.objects.filter(extension='html', data__contains=url).count()
|
||||||
if matches != self.matches:
|
if matches != self.matches:
|
||||||
Document.objects.filter(id=self.id).update(matches=matches)
|
Document.objects.filter(id=self.id).update(matches=matches)
|
||||||
|
|
|
@ -1,9 +1,10 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# vi:si:et:sw=4:sts=4:ts=4
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
import re
|
||||||
|
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
|
import ox
|
||||||
|
|
||||||
from item.utils import sort_title, sort_string, get_by_id
|
from item.utils import sort_title, sort_string, get_by_id
|
||||||
|
|
||||||
def pdfpages(pdf):
|
def pdfpages(pdf):
|
||||||
|
@ -28,3 +29,8 @@ def extract_pdfpage(pdf, image, page):
|
||||||
p = subprocess.Popen(cmd, close_fds=True)
|
p = subprocess.Popen(cmd, close_fds=True)
|
||||||
p.wait()
|
p.wait()
|
||||||
return image
|
return image
|
||||||
|
|
||||||
|
def get_documents(text):
    """Return the ids of all documents linked from ``text``.

    Links are recognised by the paths ``/documents/<ID>`` and
    ``/document/<ID>``, where ``<ID>`` is a run of upper-case letters.
    Every distinct ``<ID>`` is decoded with ``ox.fromAZ``; the order of the
    returned list is unspecified.
    """
    patterns = ('/documents/([A-Z]+)', '/document/([A-Z]+)')
    found = set()
    for pattern in patterns:
        found.update(re.findall(pattern, text))
    return [ox.fromAZ(az) for az in found]
|
||||||
|
|
Loading…
Reference in a new issue