forked from 0x2620/pandora
fulltext search in pages
This commit is contained in:
parent
cc2b60453b
commit
950bec609d
13 changed files with 608 additions and 22 deletions
|
|
@ -1,14 +1,31 @@
|
|||
import subprocess
|
||||
import tempfile
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
|
||||
def extract_text(pdf, page=None):
    """Return the text of a PDF, falling back to OCR when it has no text layer.

    pdf:  path of the PDF file.
    page: optional page number; when given, only that page is extracted
          (passed to ``pdftotext`` as both ``-f`` and ``-l``).

    Returns the stripped ``pdftotext`` output. If that output is empty the
    result of ``ocr_image`` is returned instead: for a single page the page
    is first split into a temporary PDF with ``pdfseparate``, otherwise the
    whole file is handed to ``ocr_image``.
    """
    # Local import: the module header only imports subprocess, tempfile and
    # settings, so the os.unlink() call below used to raise NameError.
    import os

    if page is not None:
        page = str(page)
        cmd = ['pdftotext', '-f', page, '-l', page, pdf, '-']
    else:
        cmd = ['pdftotext', pdf, '-']
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    stdout = stdout.decode().strip()
    if stdout:
        return stdout

    # No text layer found: fall back to OCR.
    if not page:
        return ocr_image(pdf)

    # split page from pdf and ocr
    fd, page_pdf = tempfile.mkstemp('.pdf')
    # mkstemp returns an open descriptor; pdfseparate writes to the path
    # itself, so close the descriptor right away instead of leaking it.
    os.close(fd)
    try:
        cmd = ['pdfseparate', '-f', page, '-l', page, pdf, page_pdf]
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        p.communicate()
        return ocr_image(page_pdf)
    finally:
        # Remove the temporary page even when OCR raises.
        os.unlink(page_pdf)
|
||||
|
||||
def ocr_image(path):
|
||||
cmd = ['tesseract', path, '-', 'txt']
|
||||
|
|
@ -19,6 +36,7 @@ def ocr_image(path):
|
|||
|
||||
class FulltextMixin:
|
||||
_ES_INDEX = "document-index"
|
||||
_ES_DOC_TYPE = "document"
|
||||
|
||||
@classmethod
|
||||
def elasticsearch(cls):
|
||||
|
|
@ -43,7 +61,7 @@ class FulltextMixin:
|
|||
if self.has_fulltext_key():
|
||||
from elasticsearch.exceptions import NotFoundError
|
||||
try:
|
||||
res = self.elasticsearch().delete(index=self._ES_INDEX, doc_type='document', id=self.id)
|
||||
res = self.elasticsearch().delete(index=self._ES_INDEX, doc_type=self._ES_DOC_TYPE, id=self.id)
|
||||
except NotFoundError:
|
||||
pass
|
||||
|
||||
|
|
@ -54,7 +72,7 @@ class FulltextMixin:
|
|||
doc = {
|
||||
'text': text.lower()
|
||||
}
|
||||
res = self.elasticsearch().index(index=self._ES_INDEX, doc_type='document', id=self.id, body=doc)
|
||||
res = self.elasticsearch().index(index=self._ES_INDEX, doc_type=self._ES_DOC_TYPE, id=self.id, body=doc)
|
||||
|
||||
@classmethod
|
||||
def find_fulltext(cls, query):
|
||||
|
|
@ -95,3 +113,19 @@ class FulltextMixin:
|
|||
ids += [int(r['_id']) for r in res['hits']['hits']]
|
||||
from_ += len(res['hits']['hits'])
|
||||
return ids
|
||||
|
||||
|
||||
class FulltextPageMixin(FulltextMixin):
    """Fulltext support for a single page of a document.

    Uses its own Elasticsearch index so page hits are kept apart from
    whole-document hits.
    """
    _ES_INDEX = "document-page-index"
    # NOTE(review): the base class reads `_ES_DOC_TYPE`, not `_DOC_TYPE`, so
    # this attribute is currently never used and pages are indexed with the
    # inherited doc type. Presumably `_ES_DOC_TYPE = 'page'` was intended;
    # confirm against existing indices before renaming.
    _DOC_TYPE = 'page'

    def extract_fulltext(self):
        """Return the text for this page, or '' when nothing can be extracted."""
        document = self.document
        if document.file:
            # The file type lives on the document; the page object itself
            # has no `extension` attribute.
            extension = document.extension
            if extension == 'pdf':
                return extract_text(document.file.path, self.page)
            elif extension in ('png', 'jpg'):
                return ocr_image(document.file.path)
            elif extension == 'html':
                # FIXME: is there a nice way to split that into pages
                # NOTE(review): this reads the page's own `data`; verify
                # whether the html text is stored there or on the document.
                return self.data.get('text', '')
        return ''
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ from documentcollection.models import Collection
|
|||
from item import utils
|
||||
from user.models import Group
|
||||
|
||||
from .pages import PageManager
|
||||
|
||||
keymap = {
|
||||
'item': 'items__public_id',
|
||||
|
|
@ -61,7 +62,7 @@ def parseCondition(condition, user, item=None, owner=None):
|
|||
|
||||
def buildCondition(k, op, v, user, exclude=False, owner=None):
|
||||
import entity.models
|
||||
from . import models
|
||||
from .. import models
|
||||
|
||||
# fixme: frontend should never call with list
|
||||
if k == 'list':
|
||||
|
|
@ -299,3 +300,4 @@ class DocumentManager(Manager):
|
|||
qs = qs.filter(q)
|
||||
|
||||
return qs
|
||||
|
||||
304
pandora/document/managers/pages.py
Normal file
304
pandora/document/managers/pages.py
Normal file
|
|
@ -0,0 +1,304 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import unicodedata
|
||||
|
||||
from six import string_types
|
||||
from django.db.models import Q, Manager
|
||||
from django.conf import settings
|
||||
|
||||
import ox
|
||||
from oxdjango.query import QuerySet
|
||||
|
||||
import entity.managers
|
||||
from oxdjango.managers import get_operator
|
||||
|
||||
from documentcollection.models import Collection
|
||||
from item import utils
|
||||
from user.models import Group
|
||||
|
||||
|
||||
# Maps keys as they appear in a query condition to the lookup path used when
# building the filter; keys not listed here are used unchanged
# (see parseCondition).
keymap = {
    'item': 'items__public_id',
}
# Key used by parseCondition when a condition has no (or an empty) 'key'.
default_key = 'title'
|
||||
|
||||
def get_key_type(k):
    """Return the normalised type name configured for document key *k*.

    The key is looked up in ``settings.CONFIG['documentKeys']``; unknown keys
    are treated as ``'string'``. When the configured type is a list, its
    first element is used. Several type names are folded into ``'string'``
    (and ``'list'`` stays ``'list'``); any other type is returned as is.
    """
    config = utils.get_by_id(settings.CONFIG['documentKeys'], k)
    if not config:
        config = {'type': 'string'}

    declared = config.get('type')
    if isinstance(declared, list):
        declared = declared[0]

    aliases = {
        'title': 'string',
        'person': 'string',
        'text': 'string',
        'year': 'string',
        'length': 'string',
        'layer': 'string',
        'list': 'list',
    }
    return aliases.get(declared, declared)
|
||||
|
||||
|
||||
def parseCondition(condition, user, item=None, owner=None):
    '''
    Turn a single query condition into a Q object.

    condition: dict with 'value' (required) and optional 'key' and
               'operator'. A missing or empty key falls back to default_key,
               a missing or empty operator to '='.
    user:      user the query is run for, passed on to buildCondition.
    item:      when set and the key is 'description', the result also
               matches 'items__itemproperties__description'.
    owner:     passed on to buildCondition.

    An operator starting with '!' negates the condition: the '!' is stripped
    and buildCondition is called with exclude=True.
    '''
    k = condition.get('key', default_key)
    k = keymap.get(k, k)
    if not k:
        k = default_key
    if item and k == 'description':
        item_conditions = condition.copy()
        item_conditions['key'] = 'items__itemproperties__description'
        # The recursive calls omit `item`, so this branch is not re-entered.
        return parseCondition(condition, user) | parseCondition(item_conditions, user)

    v = condition['value']
    op = condition.get('operator')
    if not op:
        op = '='

    # (debug print of every condition removed: it wrote search terms to stdout)

    if op.startswith('!'):
        return buildCondition(k, op[1:], v, user, True, owner=owner)
    else:
        return buildCondition(k, op, v, user, owner=owner)
|
||||
|
||||
def buildCondition(k, op, v, user, exclude=False, owner=None):
    '''
    Build the Q object for one (key, operator, value) triple.

    k:       key, already mapped through keymap by parseCondition.
    op:      operator without a leading '!'; translated to a lookup suffix
             with get_operator where needed.
    v:       value to compare against.
    user:    user the query is run for (collection access, '$my' groups).
    exclude: negate the resulting condition.
    owner:   used instead of `user` when resolving groups == '$my'.

    Dispatch is by key first (document, rightslevel, groups, oshash,
    items__public_id, entity, collection, fulltext keys) and then by the
    key's configured type (boolean, string, date, everything else).
    '''
    import entity.models
    from .. import models
    # Local import: the module header does not import datetime, so the date
    # branch below used to fail with NameError.
    from datetime import datetime

    # fixme: frontend should never call with list
    if k == 'list':
        print('fixme: frontend should never call with list', k, op, v)
        k = 'collection'

    key_type = get_key_type(k)

    key_config = (utils.get_by_id(settings.CONFIG['documentKeys'], k) or {'type': 'string'})

    facet_keys = models.Document.facet_keys
    if k == 'document':
        k = 'document__id'
        if op == '&' and isinstance(v, list):
            v = [ox.fromAZ(id_) for id_ in v]
            k += get_operator(op)
        else:
            v = ox.fromAZ(v)
        q = Q(**{k: v})
        if exclude:
            # NOTE(review): filters Document with a Q written against page
            # fields (document__id...); verify this subquery is valid.
            q = ~Q(document__id__in=models.Document.objects.filter(q))
        return q
    elif k == 'rightslevel':
        q = Q(document__rightslevel=v)
        if exclude:
            q = ~Q(document__rightslevel=v)
        return q
    elif k == 'groups':
        if op == '==' and v == '$my':
            if not owner:
                owner = user
            groups = owner.groups.all()
        else:
            key = 'name' + get_operator(op)
            groups = Group.objects.filter(**{key: v})
        if not groups.count():
            # No matching group: return a condition on id=0.
            return Q(id=0)
        q = Q(document__groups__in=groups)
        if exclude:
            q = ~q
        return q
    elif k in ('oshash', 'items__public_id'):
        q = Q(**{k: v})
        if exclude:
            # NOTE(review): compares this model's id with Document ids;
            # looks copied from the document manager - confirm.
            q = ~Q(id__in=models.Document.objects.filter(q))
        return q
    elif isinstance(v, bool):
        # Handled by the generic tail at the end of the function.
        key = k
    elif k == 'entity':
        entity_key, entity_v = entity.managers.namePredicate(op, v)
        key = 'id__in'
        v = entity.models.DocumentProperties.objects.filter(**{
            'entity__' + entity_key: entity_v
        }).values_list('document_id', flat=True)
    elif k == 'collection':
        # Value has the form "username:collection name".
        q = Q(id=0)
        l = v.split(":", 1)
        if len(l) >= 2:
            lqs = list(Collection.objects.filter(name=l[1], user__username=l[0]))
            if len(lqs) == 1 and lqs[0].accessible(user):
                l = lqs[0]
                if l.query.get('static', False) is False:
                    # Smart collection: evaluate its stored query as its owner.
                    data = l.query
                    q = parseConditions(data.get('conditions', []),
                                        data.get('operator', '&'),
                                        user, owner=l.user)
                else:
                    q = Q(id__in=l.documents.all())
            else:
                q = Q(id=0)
        return q
    elif key_config.get('fulltext'):
        # (debug print removed)
        qs = models.Page.find_fulltext_ids(v)
        q = Q(id__in=qs)
        if exclude:
            q = ~Q(id__in=qs)
        return q
    elif key_type == 'boolean':
        q = Q(**{'find__key': k, 'find__value': v})
        if exclude:
            q = ~Q(id__in=models.Document.objects.filter(q))
        return q
    elif key_type == "string":
        in_find = True
        if in_find:
            value_key = 'find__value'
        else:
            value_key = k
        if isinstance(v, string_types):
            v = unicodedata.normalize('NFKD', v).lower()
        if k in facet_keys:
            in_find = False
            facet_value = 'facets__value' + get_operator(op, 'istr')
            v = models.Document.objects.filter(**{'facets__key': k, facet_value: v})
            value_key = 'id__in'
        else:
            value_key = value_key + get_operator(op)
        k = str(k)
        value_key = str(value_key)
        if k == '*':
            q = Q(**{'find__value' + get_operator(op): v}) | \
                Q(**{'facets__value' + get_operator(op, 'istr'): v})
        elif in_find:
            q = Q(**{'find__key': k, value_key: v})
        else:
            q = Q(**{value_key: v})
        if exclude:
            q = ~Q(id__in=models.Document.objects.filter(q))
        return q
    elif key_type == 'date':
        def parse_date(d):
            # Pad missing month/day with 1, e.g. ['1970'] -> 1970-01-01.
            while len(d) < 3:
                d.append(1)
            return datetime(*[int(i) for i in d])

        # using sort here since find only contains strings
        v = parse_date(v.split('-'))
        vk = 'sort__%s%s' % (k, get_operator(op, 'int'))
        vk = str(vk)
        q = Q(**{vk: v})
        if exclude:
            q = ~q
        return q
    else:  # integer, float, list, time
        # use sort table here
        if key_type == 'time':
            v = int(utils.parse_time(v))

        vk = 'sort__%s%s' % (k, get_operator(op, 'int'))
        vk = str(vk)
        q = Q(**{vk: v})
        if exclude:
            q = ~q
        return q
    # Only the boolean-value and entity branches fall through to here.
    key = str(key)
    q = Q(**{key: v})
    if exclude:
        q = ~q
    return q
|
||||
|
||||
|
||||
def parseConditions(conditions, operator, user, item=None, owner=None):
    '''
    Combine a list of conditions into a single Q object.

    conditions: [
        {
            value: "war"
        },
        {
            key: "year",
            value: "1970-1980",
            operator: "!="
        },
        {
            key: "country",
            value: "f",
            operator: "^"
        }
    ],
    operator: "&"

    An entry that itself has a 'conditions' key is a nested group and is
    parsed recursively with its own operator (default '&'); groups that
    yield nothing are dropped. The parsed parts are joined with OR when
    operator is '|' and with AND otherwise. Returns None when there is
    nothing to combine.
    '''
    parsed = []
    for condition in conditions:
        if 'conditions' not in condition:
            parsed.append(parseCondition(condition, user, item, owner=owner))
            continue
        nested = parseConditions(condition['conditions'],
                                 condition.get('operator', '&'),
                                 user, item, owner=owner)
        if nested:
            parsed.append(nested)

    if not parsed:
        return None

    combined = parsed[0]
    for part in parsed[1:]:
        if operator == '|':
            combined = combined | part
        else:
            combined = combined & part
    return combined
|
||||
|
||||
|
||||
class PageManager(Manager):
    """Manager that builds page querysets from client query descriptions."""

    def get_query_set(self):
        """Return a fresh oxdjango QuerySet for the managed model."""
        return QuerySet(self.model)

    def find(self, data, user, item=None):
        '''
        Return the queryset of pages matching data['query'] that `user`
        is allowed to see.

        query: {
            conditions: [
                {
                    value: "war"
                },
                {
                    key: "year",
                    value: "1970-1980",
                    operator: "!="
                },
                {
                    key: "country",
                    value: "f",
                    operator: "^"
                }
            ],
            operator: "&"
        }
        '''

        # join query with operator
        qs = self.get_query_set()
        query = data.get('query', {})
        conditions = parseConditions(query.get('conditions', []),
                                     query.get('operator', '&'),
                                     user, item)
        if conditions:
            qs = qs.filter(conditions)
            # NOTE(review): distinct() is assumed to belong inside this
            # branch; the scraped source has lost its indentation - confirm.
            qs = qs.distinct()

        # anonymous can only see public items
        if not user or user.is_anonymous:
            level = 'guest'
            allowed_level = settings.CONFIG['capabilities']['canSeeDocument'][level]
            qs = qs.filter(document__rightslevel__lte=allowed_level)
        # users can see public items, their own items and items of their groups
        else:
            level = user.profile.get_level()
            allowed_level = settings.CONFIG['capabilities']['canSeeDocument'][level]
            q = Q(document__rightslevel__lte=allowed_level) | Q(document__user=user)
            if user.groups.count():
                q |= Q(document__groups__in=user.groups.all())
            qs = qs.filter(q)

        return qs
|
||||
|
||||
35
pandora/document/migrations/0012_auto_20200513_0001.py
Normal file
35
pandora/document/migrations/0012_auto_20200513_0001.py
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# Generated by Django 1.11.22 on 2020-05-13 00:01
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import django.core.serializers.json
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
import document.fulltext
|
||||
import oxdjango.fields
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Create the Page model and link it to Document."""

    # Runs after the previous migration of the document app.
    dependencies = [
        ('document', '0011_jsonfield'),
    ]

    operations = [
        # Page table: timestamps, page number and a JSON data blob.
        migrations.CreateModel(
            name='Page',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('created', models.DateTimeField(auto_now_add=True)),
                ('modified', models.DateTimeField(auto_now=True)),
                ('page', models.IntegerField(default=1)),
                ('data', oxdjango.fields.JSONField(default=dict, editable=False, encoder=django.core.serializers.json.DjangoJSONEncoder)),
            ],
            bases=(models.Model, document.fulltext.FulltextPageMixin),
        ),
        # Each page belongs to one document; pages are deleted with it.
        migrations.AddField(
            model_name='page',
            name='document',
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='pages_set', to='document.Document'),
        ),
    ]
|
||||
|
|
@ -29,7 +29,7 @@ from user.utils import update_groups
|
|||
from . import managers
|
||||
from . import utils
|
||||
from . import tasks
|
||||
from .fulltext import FulltextMixin
|
||||
from .fulltext import FulltextMixin, FulltextPageMixin
|
||||
|
||||
User = get_user_model()
|
||||
|
||||
|
|
@ -586,6 +586,11 @@ class Document(models.Model, FulltextMixin):
|
|||
image = os.path.join(os.path.dirname(pdf), '1024p%d.jpg' % page)
|
||||
utils.extract_pdfpage(pdf, image, page)
|
||||
|
||||
def create_pages(self):
    """Ensure a Page row exists for every page number from 1 to self.pages."""
    for number in range(1, self.pages + 1):
        Page.objects.get_or_create(document=self, page=number)
|
||||
|
||||
def get_info(self):
|
||||
if self.extension == 'pdf':
|
||||
self.thumbnail(1024)
|
||||
|
|
@ -702,6 +707,41 @@ class ItemProperties(models.Model):
|
|||
super(ItemProperties, self).save(*args, **kwargs)
|
||||
|
||||
|
||||
class Page(models.Model, FulltextPageMixin):
    """A single page of a Document, with its own fulltext handling."""

    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)

    # Owning document; pages are deleted together with it.
    document = models.ForeignKey(Document, related_name='pages_set', on_delete=models.CASCADE)
    # Page number within the document.
    page = models.IntegerField(default=1)
    data = JSONField(default=dict, editable=False)

    objects = managers.PageManager()

    def __str__(self):
        return u"%s:%s" % (self.document, self.page)

    def json(self, keys=None, user=None):
        """Return a dict describing this page.

        Without `keys` the result holds 'document' (AZ-encoded document id),
        'page' and 'id' ("<document>/<page>"). With `keys`, only the
        requested ones of those are kept, any key containing 'fulltext' adds
        the extracted page text under 'fulltext', and all remaining keys are
        answered by the document's own json().
        """
        own = {
            'document': ox.toAZ(self.document.id),
            'page': self.page,
        }
        own['id'] = '{document}/{page}'.format(**own)
        if not keys:
            return own

        data = {name: value for name, value in own.items() if name in keys}
        document_keys = []
        for key in keys:
            if 'fulltext' in key:
                data['fulltext'] = self.extract_fulltext()
            elif key not in ('document', 'page', 'id'):
                document_keys.append(key)
        if document_keys:
            data.update(self.document.json(document_keys, user))
        return data
|
||||
|
||||
class Access(models.Model):
|
||||
class Meta:
|
||||
unique_together = ("document", "user")
|
||||
|
|
|
|||
|
|
@ -6,6 +6,9 @@ def extract_fulltext(id):
|
|||
from . import models
|
||||
d = models.Document.objects.get(id=id)
|
||||
d.update_fulltext()
|
||||
d.create_pages()
|
||||
for page in d.pages_set.all():
|
||||
page.update_fulltext()
|
||||
|
||||
|
||||
@task(queue='default')
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ from changelog.models import add_changelog
|
|||
|
||||
from . import models
|
||||
from . import tasks
|
||||
from . import page_views
|
||||
|
||||
def get_document_or_404_json(request, id):
|
||||
response = {'status': {'code': 404,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue