fulltext search in pages

This commit is contained in:
j 2021-11-15 15:20:08 +00:00
commit 950bec609d
13 changed files with 608 additions and 22 deletions

View file

@ -1,14 +1,31 @@
import subprocess
import tempfile
from django.conf import settings
def extract_text(pdf, page=None):
    """Return the text of a PDF file, or of one page of it.

    ``pdftotext`` is tried first.  When it produces no text the function
    falls back to OCR: for a single page, that page is split out with
    ``pdfseparate`` into a temporary PDF which is handed to ``ocr_image``;
    without a page the whole file is handed to ``ocr_image``.

    pdf:  path of the PDF file
    page: page number to extract, or None for the whole document
    """
    # local import: the module header does not import os
    import os

    if page is not None:
        page = str(page)
        cmd = ['pdftotext', '-f', page, '-l', page, pdf, '-']
    else:
        cmd = ['pdftotext', pdf, '-']
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    stdout = stdout.decode().strip()
    if stdout:
        return stdout
    if not page:
        return ocr_image(pdf)
    # split page from pdf and ocr
    fd, page_pdf = tempfile.mkstemp('.pdf')
    # mkstemp returns an open descriptor; only the path is needed here
    os.close(fd)
    try:
        cmd = ['pdfseparate', '-f', page, '-l', page, pdf, page_pdf]
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        p.communicate()
        return ocr_image(page_pdf)
    finally:
        # remove the temporary page even if OCR fails
        os.unlink(page_pdf)
def ocr_image(path):
cmd = ['tesseract', path, '-', 'txt']
@ -19,6 +36,7 @@ def ocr_image(path):
class FulltextMixin:
_ES_INDEX = "document-index"
_ES_DOC_TYPE = "document"
@classmethod
def elasticsearch(cls):
@ -43,7 +61,7 @@ class FulltextMixin:
if self.has_fulltext_key():
from elasticsearch.exceptions import NotFoundError
try:
res = self.elasticsearch().delete(index=self._ES_INDEX, doc_type='document', id=self.id)
res = self.elasticsearch().delete(index=self._ES_INDEX, doc_type=self._ES_DOC_TYPE, id=self.id)
except NotFoundError:
pass
@ -54,7 +72,7 @@ class FulltextMixin:
doc = {
'text': text.lower()
}
res = self.elasticsearch().index(index=self._ES_INDEX, doc_type='document', id=self.id, body=doc)
res = self.elasticsearch().index(index=self._ES_INDEX, doc_type=self._ES_DOC_TYPE, id=self.id, body=doc)
@classmethod
def find_fulltext(cls, query):
@ -95,3 +113,19 @@ class FulltextMixin:
ids += [int(r['_id']) for r in res['hits']['hits']]
from_ += len(res['hits']['hits'])
return ids
class FulltextPageMixin(FulltextMixin):
    """Full-text support for a single page of a document.

    The host object must provide ``document``, ``page`` and ``data``.
    """
    _ES_INDEX = "document-page-index"
    # FulltextMixin reads _ES_DOC_TYPE when indexing and deleting, so that is
    # the attribute that has to be overridden here.
    _ES_DOC_TYPE = 'page'
    # previous attribute name, kept so existing references keep working
    _DOC_TYPE = _ES_DOC_TYPE

    def extract_fulltext(self):
        """Return the text of this page, or '' if nothing can be extracted."""
        document = self.document
        if document.file:
            # the file type is a property of the document, not of the page
            extension = document.extension
            if extension == 'pdf':
                return extract_text(document.file.path, self.page)
            elif extension in ('png', 'jpg'):
                return ocr_image(document.file.path)
            elif extension == 'html':
                # FIXME: is there a nice way to split that into pages
                # NOTE(review): reads the page's own data, confirm that this
                # is where the html text is stored
                return self.data.get('text', '')
        return ''

View file

@ -14,6 +14,7 @@ from documentcollection.models import Collection
from item import utils
from user.models import Group
from .pages import PageManager
keymap = {
'item': 'items__public_id',
@ -61,7 +62,7 @@ def parseCondition(condition, user, item=None, owner=None):
def buildCondition(k, op, v, user, exclude=False, owner=None):
import entity.models
from . import models
from .. import models
# fixme: frontend should never call with list
if k == 'list':
@ -299,3 +300,4 @@ class DocumentManager(Manager):
qs = qs.filter(q)
return qs

View file

@ -0,0 +1,304 @@
# -*- coding: utf-8 -*-
import unicodedata
from six import string_types
from django.db.models import Q, Manager
from django.conf import settings
import ox
from oxdjango.query import QuerySet
import entity.managers
from oxdjango.managers import get_operator
from documentcollection.models import Collection
from item import utils
from user.models import Group
# Translates keys used in query conditions into the lookup used for filtering.
keymap = {
    'item': 'items__public_id',
}
# Key used by parseCondition when a condition has no (or an empty) key.
default_key = 'title'
def get_key_type(k):
    """Return the normalized type name configured for document key ``k``.

    The key is looked up in settings.CONFIG['documentKeys']; keys without a
    configuration count as 'string'.  If the configured type is a list, its
    first element is used.  A few type names are then folded onto
    'string' / 'list'; any other name is returned unchanged.
    """
    config = utils.get_by_id(settings.CONFIG['documentKeys'], k) or {'type': 'string'}
    configured = config.get('type')
    if isinstance(configured, list):
        configured = configured[0]
    normalized = {
        'title': 'string',
        'person': 'string',
        'text': 'string',
        'year': 'string',
        'length': 'string',
        'layer': 'string',
        'list': 'list',
    }
    return normalized.get(configured, configured)
def parseCondition(condition, user, item=None, owner=None):
    '''
    Turn a single condition dict into a Q object.

    condition: dict with 'value' (required) and optional 'key' and
               'operator'; a missing key falls back to default_key,
               a missing operator to '='
    user:      user the query is run for
    item:      if set, a 'description' condition also matches
               items__itemproperties__description
    owner:     passed through to buildCondition

    An operator starting with '!' negates the condition.
    Raises KeyError if the condition has no 'value'.
    '''
    # local import: logging is not imported at module level
    import logging

    k = condition.get('key', default_key)
    k = keymap.get(k, k)
    if not k:
        k = default_key
    if item and k == 'description':
        item_conditions = condition.copy()
        item_conditions['key'] = 'items__itemproperties__description'
        return parseCondition(condition, user) | parseCondition(item_conditions, user)

    v = condition['value']
    op = condition.get('operator')
    if not op:
        op = '='

    # diagnostic output goes to the logger instead of stdout
    logging.getLogger(__name__).debug('parseCondition: %s %s %s', k, op, v)
    if op.startswith('!'):
        return buildCondition(k, op[1:], v, user, True, owner=owner)
    return buildCondition(k, op, v, user, owner=owner)
def buildCondition(k, op, v, user, exclude=False, owner=None):
    '''
    Build the Q object for one key / operator / value triple.

    k:       key of the condition (after keymap translation)
    op:      operator, without a leading '!'
    v:       value to compare against
    user:    user the query is run for (used for '$my' groups and for the
             collection access check)
    exclude: if True the condition is negated
    owner:   user whose groups are used for groups == '$my'; defaults to user

    Returns a django Q object.
    '''
    # local imports: neither datetime nor logging is imported at module level
    import logging
    from datetime import datetime

    import entity.models
    from .. import models

    logger = logging.getLogger(__name__)

    # fixme: frontend should never call with list
    if k == 'list':
        logger.warning('fixme: frontend should never call with list %s %s %s', k, op, v)
        k = 'collection'

    key_type = get_key_type(k)

    key_config = (utils.get_by_id(settings.CONFIG['documentKeys'], k) or {'type': 'string'})

    facet_keys = models.Document.facet_keys
    if k == 'document':
        k = 'document__id'
        if op == '&' and isinstance(v, list):
            v = [ox.fromAZ(id_) for id_ in v]
            k += get_operator(op)
        else:
            v = ox.fromAZ(v)
        q = Q(**{k: v})
        if exclude:
            # NOTE(review): q uses a document__ lookup but is applied to
            # Document here, confirm this is intended
            q = ~Q(document__id__in=models.Document.objects.filter(q))
        return q
    elif k == 'rightslevel':
        q = Q(document__rightslevel=v)
        if exclude:
            q = ~Q(document__rightslevel=v)
        return q
    elif k == 'groups':
        if op == '==' and v == '$my':
            if not owner:
                owner = user
            groups = owner.groups.all()
        else:
            key = 'name' + get_operator(op)
            groups = Group.objects.filter(**{key: v})
        if not groups.count():
            # no matching group: return a condition on id 0
            return Q(id=0)
        q = Q(document__groups__in=groups)
        if exclude:
            q = ~q
        return q
    elif k in ('oshash', 'items__public_id'):
        q = Q(**{k: v})
        if exclude:
            q = ~Q(id__in=models.Document.objects.filter(q))
        return q
    elif isinstance(v, bool):
        # handled by the generic Q(**{key: v}) at the end of the function
        key = k
    elif k == 'entity':
        entity_key, entity_v = entity.managers.namePredicate(op, v)
        key = 'id__in'
        v = entity.models.DocumentProperties.objects.filter(**{
            'entity__' + entity_key: entity_v
        }).values_list('document_id', flat=True)
    elif k == 'collection':
        # value has the form "username:collection name"
        q = Q(id=0)
        l = v.split(":", 1)
        if len(l) >= 2:
            lqs = list(Collection.objects.filter(name=l[1], user__username=l[0]))
            if len(lqs) == 1 and lqs[0].accessible(user):
                l = lqs[0]
                if l.query.get('static', False) is False:
                    # non-static collection: evaluate its stored query
                    data = l.query
                    q = parseConditions(data.get('conditions', []),
                                        data.get('operator', '&'),
                                        user, owner=l.user)
                else:
                    q = Q(id__in=l.documents.all())
            else:
                q = Q(id=0)
        return q
    elif key_config.get('fulltext'):
        logger.debug('fulltext?')
        qs = models.Page.find_fulltext_ids(v)
        q = Q(id__in=qs)
        if exclude:
            q = ~Q(id__in=qs)
        return q
    elif key_type == 'boolean':
        q = Q(**{'find__key': k, 'find__value': v})
        if exclude:
            q = ~Q(id__in=models.Document.objects.filter(q))
        return q
    elif key_type == "string":
        in_find = True
        if in_find:
            value_key = 'find__value'
        else:
            value_key = k
        if isinstance(v, string_types):
            v = unicodedata.normalize('NFKD', v).lower()
        if k in facet_keys:
            # facet keys are matched through the facets relation
            in_find = False
            facet_value = 'facets__value' + get_operator(op, 'istr')
            v = models.Document.objects.filter(**{'facets__key': k, facet_value: v})
            value_key = 'id__in'
        else:
            value_key = value_key + get_operator(op)
        k = str(k)
        value_key = str(value_key)
        if k == '*':
            q = Q(**{'find__value' + get_operator(op): v}) | \
                Q(**{'facets__value' + get_operator(op, 'istr'): v})
        elif in_find:
            q = Q(**{'find__key': k, value_key: v})
        else:
            q = Q(**{value_key: v})
        if exclude:
            q = ~Q(id__in=models.Document.objects.filter(q))
        return q
    elif key_type == 'date':
        def parse_date(d):
            # pad missing month / day with 1
            while len(d) < 3:
                d.append(1)
            return datetime(*[int(i) for i in d])

        # using sort here since find only contains strings
        v = parse_date(v.split('-'))
        vk = 'sort__%s%s' % (k, get_operator(op, 'int'))
        vk = str(vk)
        q = Q(**{vk: v})
        if exclude:
            q = ~q
        return q
    else:  # integer, float, list, time
        # use sort table here
        if key_type == 'time':
            v = int(utils.parse_time(v))

        vk = 'sort__%s%s' % (k, get_operator(op, 'int'))
        vk = str(vk)
        q = Q(**{vk: v})
        if exclude:
            q = ~q
        return q
    # only reached from the bool and entity branches, which set ``key``
    key = str(key)
    q = Q(**{key: v})
    if exclude:
        q = ~q
    return q
def parseConditions(conditions, operator, user, item=None, owner=None):
    '''
    Combine a list of conditions into a single Q object.

    Entries that carry their own 'conditions' list are parsed recursively
    (with their own 'operator', '&' if missing); all other entries are
    handed to parseCondition.  The results are joined with '|' if
    operator is '|', otherwise with '&'.  Returns None if there is
    nothing to combine.

    conditions: [
        {
            value: "war"
        }
        {
            key: "year",
            value: "1970-1980",
            operator: "!="
        },
        {
            key: "country",
            value: "f",
            operator: "^"
        }
    ],
    operator: "&"
    '''
    parts = []
    for condition in conditions:
        if 'conditions' not in condition:
            parts.append(parseCondition(condition, user, item, owner=owner))
            continue
        nested = parseConditions(condition['conditions'],
                                 condition.get('operator', '&'), user, item, owner=owner)
        # empty groups contribute nothing
        if nested:
            parts.append(nested)
    if not parts:
        return None
    combined = parts[0]
    for part in parts[1:]:
        combined = (combined | part) if operator == '|' else (combined & part)
    return combined
class PageManager(Manager):
    """Manager that finds pages by query and restricts them by document access."""

    def get_query_set(self):
        """Return the base QuerySet for the managed model."""
        return QuerySet(self.model)

    def find(self, data, user, item=None):
        '''
        Return the pages matching data['query'] that user may see.

        query: {
            conditions: [
                {
                    value: "war"
                }
                {
                    key: "year",
                    value: "1970-1980",
                    operator: "!="
                },
                {
                    key: "country",
                    value: "f",
                    operator: "^"
                }
            ],
            operator: "&"
        }
        '''
        # join query with operator
        qs = self.get_query_set()
        query = data.get('query', {})
        conditions = parseConditions(query.get('conditions', []),
                                     query.get('operator', '&'),
                                     user, item)
        if conditions:
            qs = qs.filter(conditions)
            qs = qs.distinct()

        capabilities = settings.CONFIG['capabilities']['canSeeDocument']
        # anonymous can only see public items
        if not user or user.is_anonymous:
            allowed_level = capabilities['guest']
            qs = qs.filter(document__rightslevel__lte=allowed_level)
        # users can see public items, their own items and items of their groups
        else:
            allowed_level = capabilities[user.profile.get_level()]
            q = Q(document__rightslevel__lte=allowed_level) | Q(document__user=user)
            if user.groups.count():
                q |= Q(document__groups__in=user.groups.all())
            qs = qs.filter(q)
        return qs

View file

@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.11.22 on 2020-05-13 00:01
from __future__ import unicode_literals
import django.core.serializers.json
from django.db import migrations, models
import django.db.models.deletion
import document.fulltext
import oxdjango.fields
class Migration(migrations.Migration):
    # Adds the Page model together with its foreign key to document.Document.

    dependencies = [
        ('document', '0011_jsonfield'),
    ]

    operations = [
        # one row per document page; the page number defaults to 1
        migrations.CreateModel(
            name='Page',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('created', models.DateTimeField(auto_now_add=True)),
                ('modified', models.DateTimeField(auto_now=True)),
                ('page', models.IntegerField(default=1)),
                ('data', oxdjango.fields.JSONField(default=dict, editable=False, encoder=django.core.serializers.json.DjangoJSONEncoder)),
            ],
            bases=(models.Model, document.fulltext.FulltextPageMixin),
        ),
        # pages are deleted together with their document (CASCADE)
        migrations.AddField(
            model_name='page',
            name='document',
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='pages_set', to='document.Document'),
        ),
    ]

View file

@ -29,7 +29,7 @@ from user.utils import update_groups
from . import managers
from . import utils
from . import tasks
from .fulltext import FulltextMixin
from .fulltext import FulltextMixin, FulltextPageMixin
User = get_user_model()
@ -586,6 +586,11 @@ class Document(models.Model, FulltextMixin):
image = os.path.join(os.path.dirname(pdf), '1024p%d.jpg' % page)
utils.extract_pdfpage(pdf, image, page)
def create_pages(self):
    """Make sure a Page row exists for every page number 1..self.pages."""
    for number in range(1, self.pages + 1):
        Page.objects.get_or_create(document=self, page=number)
def get_info(self):
if self.extension == 'pdf':
self.thumbnail(1024)
@ -702,6 +707,41 @@ class ItemProperties(models.Model):
super(ItemProperties, self).save(*args, **kwargs)
class Page(models.Model, FulltextPageMixin):
    """A single page of a Document."""

    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)

    document = models.ForeignKey(Document, related_name='pages_set', on_delete=models.CASCADE)
    page = models.IntegerField(default=1)
    data = JSONField(default=dict, editable=False)

    objects = managers.PageManager()

    def __str__(self):
        return u"%s:%s" % (self.document, self.page)

    def json(self, keys=None, user=None):
        """Return a dict describing this page.

        Without keys the result holds 'document', 'page' and 'id'.
        With keys only the requested ones of those are kept; any key
        containing 'fulltext' adds the extracted text as 'fulltext', and all
        remaining keys are resolved through the document's json().
        """
        result = {
            'document': ox.toAZ(self.document.id),
            'page': self.page,
        }
        result['id'] = '{document}/{page}'.format(**result)
        if not keys:
            return result

        result = {name: value for name, value in result.items() if name in keys}
        own_keys = ('document', 'page', 'id')
        forwarded = []
        for key in keys:
            if 'fulltext' in key:
                result['fulltext'] = self.extract_fulltext()
            elif key not in own_keys:
                forwarded.append(key)
        if forwarded:
            result.update(self.document.json(forwarded, user))
        return result
class Access(models.Model):
class Meta:
unique_together = ("document", "user")

View file

@ -6,6 +6,9 @@ def extract_fulltext(id):
from . import models
d = models.Document.objects.get(id=id)
d.update_fulltext()
d.create_pages()
for page in d.pages_set.all():
page.update_fulltext()
@task(queue='default')

View file

@ -24,6 +24,7 @@ from changelog.models import add_changelog
from . import models
from . import tasks
from . import page_views
def get_document_or_404_json(request, id):
response = {'status': {'code': 404,