fulltext search in pages

This commit is contained in:
j 2021-11-15 15:20:08 +00:00
parent cc2b60453b
commit 950bec609d
13 changed files with 608 additions and 22 deletions

View file

@ -1,14 +1,31 @@
import subprocess
import tempfile
from django.conf import settings
def extract_text(pdf, page=None):
    """Return the text of a PDF, falling back to OCR when it has no text layer.

    pdf: path to the PDF file.
    page: optional page number; when given only that page is extracted
        (passed to ``pdftotext`` as both ``-f`` and ``-l``).

    Returns the stripped text reported by ``pdftotext``. If that is empty,
    the result of ``ocr_image`` is returned instead: for a single page the
    page is first split out with ``pdfseparate`` into a temporary PDF, for
    the whole document the PDF itself is handed to ``ocr_image``.
    """
    # Local import: the module's visible import block does not provide ``os``.
    import os

    if page is not None:
        page = str(page)
        cmd = ['pdftotext', '-f', page, '-l', page, pdf, '-']
    else:
        cmd = ['pdftotext', pdf, '-']
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    text = stdout.decode().strip()
    if text:
        return text

    if not page:
        return ocr_image(pdf)

    # split page from pdf and ocr
    fd, page_pdf = tempfile.mkstemp('.pdf')
    # mkstemp returns an open descriptor; pdfseparate writes via the path,
    # so close ours right away instead of leaking it.
    os.close(fd)
    try:
        cmd = ['pdfseparate', '-f', page, '-l', page, pdf, page_pdf]
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        p.communicate()
        return ocr_image(page_pdf)
    finally:
        # Remove the temporary page even if OCR fails.
        os.unlink(page_pdf)
def ocr_image(path):
cmd = ['tesseract', path, '-', 'txt']
@ -19,6 +36,7 @@ def ocr_image(path):
class FulltextMixin:
_ES_INDEX = "document-index"
_ES_DOC_TYPE = "document"
@classmethod
def elasticsearch(cls):
@ -43,7 +61,7 @@ class FulltextMixin:
if self.has_fulltext_key():
from elasticsearch.exceptions import NotFoundError
try:
res = self.elasticsearch().delete(index=self._ES_INDEX, doc_type='document', id=self.id)
res = self.elasticsearch().delete(index=self._ES_INDEX, doc_type=self._ES_DOC_TYPE, id=self.id)
except NotFoundError:
pass
@ -54,7 +72,7 @@ class FulltextMixin:
doc = {
'text': text.lower()
}
res = self.elasticsearch().index(index=self._ES_INDEX, doc_type='document', id=self.id, body=doc)
res = self.elasticsearch().index(index=self._ES_INDEX, doc_type=self._ES_DOC_TYPE, id=self.id, body=doc)
@classmethod
def find_fulltext(cls, query):
@ -95,3 +113,19 @@ class FulltextMixin:
ids += [int(r['_id']) for r in res['hits']['hits']]
from_ += len(res['hits']['hits'])
return ids
class FulltextPageMixin(FulltextMixin):
    """Fulltext indexing for a single page of a document.

    Uses its own index and document type so page entries are kept apart
    from the whole-document entries written by ``FulltextMixin``.
    """
    _ES_INDEX = "document-page-index"
    # FulltextMixin reads ``_ES_DOC_TYPE`` when indexing/deleting, so that is
    # the attribute that has to be overridden.
    _ES_DOC_TYPE = 'page'
    # Previous spelling, kept so existing references keep working.
    _DOC_TYPE = _ES_DOC_TYPE

    def extract_fulltext(self):
        """Return the text of this page, or '' when nothing can be extracted."""
        document = self.document
        if document.file:
            # The file type lives on the document; the page itself has no
            # ``extension`` attribute.
            extension = document.extension
            if extension == 'pdf':
                return extract_text(document.file.path, self.page)
            elif extension in ('png', 'jpg'):
                return ocr_image(document.file.path)
            elif extension == 'html':
                # FIXME: is there a nice way to split that into pages
                # NOTE(review): this reads the page's own ``data``; confirm
                # whether the html text is stored there or on the document.
                return self.data.get('text', '')
        return ''

View file

@ -14,6 +14,7 @@ from documentcollection.models import Collection
from item import utils
from user.models import Group
from .pages import PageManager
keymap = {
'item': 'items__public_id',
@ -61,7 +62,7 @@ def parseCondition(condition, user, item=None, owner=None):
def buildCondition(k, op, v, user, exclude=False, owner=None):
import entity.models
from . import models
from .. import models
# fixme: frontend should never call with list
if k == 'list':
@ -299,3 +300,4 @@ class DocumentManager(Manager):
qs = qs.filter(q)
return qs

View file

@ -0,0 +1,304 @@
# -*- coding: utf-8 -*-
import unicodedata
from six import string_types
from django.db.models import Q, Manager
from django.conf import settings
import ox
from oxdjango.query import QuerySet
import entity.managers
from oxdjango.managers import get_operator
from documentcollection.models import Collection
from item import utils
from user.models import Group
# Translates query keys into ORM lookup paths; parseCondition uses a key
# unchanged when it has no entry here.
keymap = {
    'item': 'items__public_id',
}
# Key used by parseCondition when a condition has no (or an empty) key.
default_key = 'title'
def get_key_type(k):
    """Return the normalized type of document key ``k``.

    The type is read from the ``documentKeys`` site config (defaulting to
    'string' for unknown keys); for list-valued types the first entry is
    used. Several specific type names are collapsed to 'string'.
    """
    config = utils.get_by_id(settings.CONFIG['documentKeys'], k)
    if not config:
        config = {'type': 'string'}
    declared = config.get('type')
    if isinstance(declared, list):
        declared = declared[0]
    # Type names that are searched like plain strings.
    aliases = dict.fromkeys(
        ('title', 'person', 'text', 'year', 'length', 'layer'), 'string'
    )
    aliases['list'] = 'list'
    return aliases.get(declared, declared)
def parseCondition(condition, user, item=None, owner=None):
    '''
    Turn a single condition dict into a Q object.

    condition: dict with 'value' and optional 'key' and 'operator';
        a missing/empty key falls back to default_key, a missing/empty
        operator to '='. An operator starting with '!' negates the match.
    user: user the query is run for.
    item: if set, a 'description' condition also matches the item specific
        description (items__itemproperties__description).
    owner: passed through to buildCondition.

    Raises KeyError if the condition has no 'value'.
    '''
    k = condition.get('key', default_key)
    k = keymap.get(k, k)
    if not k:
        k = default_key
    if item and k == 'description':
        item_conditions = condition.copy()
        item_conditions['key'] = 'items__itemproperties__description'
        # item is not passed on, so the recursive calls take the plain path
        return parseCondition(condition, user, owner=owner) | \
            parseCondition(item_conditions, user, owner=owner)

    v = condition['value']
    op = condition.get('operator')
    if not op:
        op = '='
    if op.startswith('!'):
        return buildCondition(k, op[1:], v, user, True, owner=owner)
    return buildCondition(k, op, v, user, owner=owner)
def buildCondition(k, op, v, user, exclude=False, owner=None):
    '''
    Build the Q object for one key/operator/value triple.

    k: query key (already mapped through keymap by parseCondition).
    op: operator without a leading '!'; translated with get_operator.
    v: value to match.
    user: user the query is run for (used for collection access checks
        and as fallback owner for groups == '$my').
    exclude: negate the condition.
    owner: user whose groups '$my' refers to; defaults to user.

    Returns a Q object; Q(id=0) is used as "matches nothing".
    '''
    # Local imports: models are imported lazily here, datetime and logging
    # are not part of the module's import block.
    import logging
    from datetime import datetime

    import entity.models
    from .. import models

    # fixme: frontend should never call with list
    if k == 'list':
        logging.getLogger(__name__).warning(
            'fixme: frontend should never call with list %s %s %s', k, op, v)
        k = 'collection'

    key_type = get_key_type(k)

    key_config = (utils.get_by_id(settings.CONFIG['documentKeys'], k) or {'type': 'string'})

    facet_keys = models.Document.facet_keys
    if k == 'document':
        k = 'document__id'
        if op == '&' and isinstance(v, list):
            v = [ox.fromAZ(id_) for id_ in v]
            k += get_operator(op)
        else:
            v = ox.fromAZ(v)
        q = Q(**{k: v})
        if exclude:
            q = ~Q(document__id__in=models.Document.objects.filter(q))
        return q
    elif k == 'rightslevel':
        q = Q(document__rightslevel=v)
        if exclude:
            q = ~Q(document__rightslevel=v)
        return q
    elif k == 'groups':
        if op == '==' and v == '$my':
            if not owner:
                owner = user
            groups = owner.groups.all()
        else:
            key = 'name' + get_operator(op)
            groups = Group.objects.filter(**{key: v})
        if not groups.count():
            return Q(id=0)
        q = Q(document__groups__in=groups)
        if exclude:
            q = ~q
        return q
    elif k in ('oshash', 'items__public_id'):
        q = Q(**{k: v})
        if exclude:
            q = ~Q(id__in=models.Document.objects.filter(q))
        return q
    elif isinstance(v, bool):
        # handled by the generic lookup at the end of the function
        key = k
    elif k == 'entity':
        entity_key, entity_v = entity.managers.namePredicate(op, v)
        key = 'id__in'
        v = entity.models.DocumentProperties.objects.filter(**{
            'entity__' + entity_key: entity_v
        }).values_list('document_id', flat=True)
    elif k == 'collection':
        # value has the form "username:collection name"
        q = Q(id=0)
        parts = v.split(":", 1)
        if len(parts) >= 2:
            lqs = list(Collection.objects.filter(name=parts[1], user__username=parts[0]))
            if len(lqs) == 1 and lqs[0].accessible(user):
                collection = lqs[0]
                if collection.query.get('static', False) is False:
                    # smart collection: evaluate its stored query
                    data = collection.query
                    q = parseConditions(data.get('conditions', []),
                                        data.get('operator', '&'),
                                        user, owner=collection.user)
                else:
                    q = Q(id__in=collection.documents.all())
            else:
                q = Q(id=0)
        return q
    elif key_config.get('fulltext'):
        qs = models.Page.find_fulltext_ids(v)
        q = Q(id__in=qs)
        if exclude:
            q = ~Q(id__in=qs)
        return q
    elif key_type == 'boolean':
        q = Q(**{'find__key': k, 'find__value': v})
        if exclude:
            q = ~Q(id__in=models.Document.objects.filter(q))
        return q
    elif key_type == "string":
        in_find = True
        value_key = 'find__value'
        if isinstance(v, string_types):
            v = unicodedata.normalize('NFKD', v).lower()
        if k in facet_keys:
            in_find = False
            facet_value = 'facets__value' + get_operator(op, 'istr')
            v = models.Document.objects.filter(**{'facets__key': k, facet_value: v})
            value_key = 'id__in'
        else:
            value_key = value_key + get_operator(op)
        k = str(k)
        value_key = str(value_key)
        if k == '*':
            q = Q(**{'find__value' + get_operator(op): v}) | \
                Q(**{'facets__value' + get_operator(op, 'istr'): v})
        elif in_find:
            q = Q(**{'find__key': k, value_key: v})
        else:
            q = Q(**{value_key: v})
        if exclude:
            q = ~Q(id__in=models.Document.objects.filter(q))
        return q
    elif key_type == 'date':
        def parse_date(d):
            # pad missing month/day with 1
            while len(d) < 3:
                d.append(1)
            return datetime(*[int(i) for i in d])

        # using sort here since find only contains strings
        v = parse_date(v.split('-'))
        vk = 'sort__%s%s' % (k, get_operator(op, 'int'))
        vk = str(vk)
        q = Q(**{vk: v})
        if exclude:
            q = ~q
        return q
    else:  # integer, float, list, time
        # use sort table here
        if key_type == 'time':
            v = int(utils.parse_time(v))

        vk = 'sort__%s%s' % (k, get_operator(op, 'int'))
        vk = str(vk)
        q = Q(**{vk: v})
        if exclude:
            q = ~q
        return q
    # only reached from the bool and entity branches, which set key
    key = str(key)
    q = Q(**{key: v})
    if exclude:
        q = ~q
    return q
def parseConditions(conditions, operator, user, item=None, owner=None):
    '''
    Combine a list of conditions into one Q object.

    Entries that have a 'conditions' key are nested groups and are parsed
    recursively with their own operator (default '&'); all other entries
    are handed to parseCondition. The results are joined with '|' if
    operator is '|', otherwise with '&'.

    Returns None if nothing was parsed.

    conditions: [
        {
            value: "war"
        }
        {
            key: "year",
            value: "1970-1980",
            operator: "!="
        },
        {
            key: "country",
            value: "f",
            operator: "^"
        }
    ],
    operator: "&"
    '''
    parsed = []
    for condition in conditions:
        if 'conditions' not in condition:
            parsed.append(parseCondition(condition, user, item, owner=owner))
            continue
        nested = parseConditions(
            condition['conditions'],
            condition.get('operator', '&'),
            user, item, owner=owner
        )
        # empty groups contribute nothing
        if nested:
            parsed.append(nested)

    if not parsed:
        return None

    remaining = iter(parsed)
    combined = next(remaining)
    for part in remaining:
        combined = (combined | part) if operator == '|' else (combined & part)
    return combined
class PageManager(Manager):
    """Manager for Page objects with a query-based ``find``."""

    def get_query_set(self):
        """Return the base QuerySet for the managed model."""
        return QuerySet(self.model)

    def find(self, data, user, item=None):
        '''
        Return the pages matching data['query'] that user is allowed to see.

        data: dict, only its 'query' entry is read here.
        user: requesting user; None or anonymous users are treated as
            level 'guest'.
        item: passed on to parseConditions.

            query: {
                conditions: [
                    {
                        value: "war"
                    }
                    {
                        key: "year",
                        value: "1970-1980",
                        operator: "!="
                    },
                    {
                        key: "country",
                        value: "f",
                        operator: "^"
                    }
                ],
                operator: "&"
            }
        '''
        # join query with operator
        qs = self.get_query_set()
        query = data.get('query', {})
        conditions = parseConditions(query.get('conditions', []),
                                     query.get('operator', '&'),
                                     user, item)
        if conditions:
            qs = qs.filter(conditions)
        qs = qs.distinct()

        can_see = settings.CONFIG['capabilities']['canSeeDocument']
        if not user or user.is_anonymous:
            # anonymous can only see public documents
            qs = qs.filter(document__rightslevel__lte=can_see['guest'])
        else:
            # users can see public documents, their own documents and
            # documents of their groups
            allowed_level = can_see[user.profile.get_level()]
            q = Q(document__rightslevel__lte=allowed_level) | Q(document__user=user)
            if user.groups.count():
                q |= Q(document__groups__in=user.groups.all())
            qs = qs.filter(q)

        return qs

View file

@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.11.22 on 2020-05-13 00:01
from __future__ import unicode_literals
import django.core.serializers.json
from django.db import migrations, models
import django.db.models.deletion
import document.fulltext
import oxdjango.fields
class Migration(migrations.Migration):
    # Adds the Page model: one row per page of a document.

    # Must run after the document app's 0011_jsonfield migration.
    dependencies = [
        ('document', '0011_jsonfield'),
    ]

    operations = [
        # Create the table with the page number and a JSON data field.
        migrations.CreateModel(
            name='Page',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('created', models.DateTimeField(auto_now_add=True)),
                ('modified', models.DateTimeField(auto_now=True)),
                ('page', models.IntegerField(default=1)),
                ('data', oxdjango.fields.JSONField(default=dict, editable=False, encoder=django.core.serializers.json.DjangoJSONEncoder)),
            ],
            bases=(models.Model, document.fulltext.FulltextPageMixin),
        ),
        # Link each page to its document; pages are deleted with the
        # document (CASCADE) and reachable as document.pages_set.
        migrations.AddField(
            model_name='page',
            name='document',
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='pages_set', to='document.Document'),
        ),
    ]

View file

@ -29,7 +29,7 @@ from user.utils import update_groups
from . import managers
from . import utils
from . import tasks
from .fulltext import FulltextMixin
from .fulltext import FulltextMixin, FulltextPageMixin
User = get_user_model()
@ -586,6 +586,11 @@ class Document(models.Model, FulltextMixin):
image = os.path.join(os.path.dirname(pdf), '1024p%d.jpg' % page)
utils.extract_pdfpage(pdf, image, page)
def create_pages(self):
    """Make sure a Page row exists for every page number 1..self.pages."""
    for number in range(1, self.pages + 1):
        Page.objects.get_or_create(document=self, page=number)
def get_info(self):
if self.extension == 'pdf':
self.thumbnail(1024)
@ -702,6 +707,41 @@ class ItemProperties(models.Model):
super(ItemProperties, self).save(*args, **kwargs)
class Page(models.Model, FulltextPageMixin):
    """A single page of a Document, searchable via FulltextPageMixin."""

    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)

    document = models.ForeignKey(Document, related_name='pages_set', on_delete=models.CASCADE)
    page = models.IntegerField(default=1)
    data = JSONField(default=dict, editable=False)

    objects = managers.PageManager()

    def __str__(self):
        return u"%s:%s" % (self.document, self.page)

    def json(self, keys=None, user=None):
        """Return a dict describing this page.

        Without keys the result holds 'document', 'page' and 'id'
        ('<document>/<page>'). With keys only the requested ones are
        returned: any key containing 'fulltext' adds the extracted text as
        'fulltext', keys that are not page keys are looked up on the
        document via document.json(keys, user).
        """
        document_id = ox.toAZ(self.document.id)
        own = {
            'document': document_id,
            'page': self.page,
        }
        own['id'] = '{document}/{page}'.format(**own)
        if not keys:
            return own

        result = {name: value for name, value in own.items() if name in keys}
        inherited = []
        for key in keys:
            if 'fulltext' in key:
                result['fulltext'] = self.extract_fulltext()
            elif key not in ('document', 'page', 'id'):
                inherited.append(key)
        if inherited:
            result.update(self.document.json(inherited, user))
        return result
class Access(models.Model):
class Meta:
unique_together = ("document", "user")

View file

@ -6,6 +6,9 @@ def extract_fulltext(id):
from . import models
d = models.Document.objects.get(id=id)
d.update_fulltext()
d.create_pages()
for page in d.pages_set.all():
page.update_fulltext()
@task(queue='default')

View file

@ -24,6 +24,7 @@ from changelog.models import add_changelog
from . import models
from . import tasks
from . import page_views
def get_document_or_404_json(request, id):
response = {'status': {'code': 404,

View file

@ -399,13 +399,14 @@ pandora.URL = (function() {
// Documents
views['documents'] = {
list: ['grid', 'list'],
list: ['grid', 'list', 'pages'],
item: ['view', 'info']
};
sortKeys['documents'] = {
list: {
list: pandora.site.documentKeys,
grid: pandora.site.documentKeys
grid: pandora.site.documentKeys,
pages: pandora.site.documentKeys
},
item: {}
};

View file

@ -124,6 +124,67 @@ pandora.ui.collection = function() {
unique: 'id'
})
.addClass('OxMedia');
} else if (view == 'pages') {
that = Ox.InfoList({
borderRadius: 0,
defaultRatio: 640/1024,
draggable: true,
id: 'list',
item: function(data, sort, size) {
size = 128;
var sortKey = sort[0].key,
infoKey = sortKey == 'title' ? 'extension' : sortKey,
key = Ox.getObjectById(pandora.site.documentKeys, infoKey),
info = pandora.formatDocumentKey(key, data, size);
return {
icon: {
height: Math.round(data.ratio > 1 ? size / data.ratio : size),
id: data.id,
info: info,
title: data.title,
url: pandora.getMediaURL('/documents/' + data.id + '/256p.jpg?' + data.modified),
width: Math.round(data.ratio >= 1 ? size : size * data.ratio)
},
info: {
css: {marginTop: '2px'},
element: pandora.ui.documentPages,
id: data.id,
options: {
id: data.id,
pages: data.pages,
query: ui.findDocuments,
ratio: data.ratio
}
}
};
},
items: function(data, callback) {
pandora.api.findDocuments(Ox.extend(data, {
query: ui.findDocuments
}), callback);
return Ox.clone(data, true);
},
keys: ['id', 'pages', 'title', 'ratio', 'modified'],
selected: ui.listSelection,
size: 192,
sort: ui.collectionSort.concat([
{key: 'extension', operator: '+'},
{key: 'title', operator: '+'}
]),
unique: 'id',
width: window.innerWidth
- ui.showSidebar * ui.sidebarSize - 1
- Ox.UI.SCROLLBAR_SIZE
})
.addClass('OxMedia')
.bindEvent({
key_left: function() {
// ...
},
key_right: function() {
// ...
}
});
}
if (['list', 'grid'].indexOf(view) > -1) {
@ -138,7 +199,7 @@ pandora.ui.collection = function() {
});
}
if (['list', 'grid'].indexOf(view) > -1) {
if (['list', 'grid', 'pages'].indexOf(view) > -1) {
//fixme

106
static/js/documentPages.js Normal file
View file

@ -0,0 +1,106 @@
'use strict';
/*
    Row of page thumbnails for one document.

    options:
        id      document id
        pages   number of pages of the document
        query   find query; if it has a 'fulltext' condition, only the
                pages matching that condition are shown
        ratio   aspect ratio used to size the thumbnails

    Without a fulltext condition a preview of up to five pages, starting
    at page 2, is shown. Double-clicking a thumbnail opens that page.
*/
pandora.ui.documentPages = function(options) {

    var self = {},
        that = Ox.Element()
            .css({
                height: '192px',
                margin: '4px',
                display: 'flex'
            })
            .bindEvent({
                doubleclick: doubleclick,
                singleclick: singleclick
            }),
        query;

    self.options = Ox.extend({
        id: '',
        pages: 1,
        query: null,
        ratio: 8/5
    }, options);

    self.size = 128;
    self.width = self.options.ratio > 1 ? self.size : Math.round(self.size * self.options.ratio);
    self.height = self.options.ratio > 1 ? Math.round(self.size / self.options.ratio) : self.size;

    function renderPage(page) {
        var url = `/documents/${self.options.id}/${self.size}p${page}.jpg`;
        var $item = Ox.IconItem({
            imageHeight: self.height,
            imageWidth: self.width,
            id: `${self.options.id}/${page}`,
            info: '',
            title: `Page ${page}`,
            url: url
        })
        .addClass('OxInfoIcon')
        .css({
        })
        .data({
            page: page
        });
        $item.find('.OxTarget').addClass('OxSpecialTarget');
        that.append($item);
    }

    function renderPages(pages) {
        var count;
        if (pages) {
            pages.forEach(page => {
                renderPage(page.page);
            });
        } else if (self.options.pages > 1) {
            // Preview starts at page 2; show at most five pages and never
            // go past the last page of the document.
            count = Math.min(self.options.pages - 1, 5);
            Ox.range(count).forEach(index => {
                renderPage(index + 2);
            });
        }
    }

    if (self.options.query) {
        // conditions may be missing on an empty query
        var condition = (self.options.query.conditions || []).filter(condition => {
            return condition.key == 'fulltext';
        });
        if (condition.length) {
            query = {
                'conditions': [
                    {'key': 'document', 'operator': '==', 'value': self.options.id},
                    {'key': 'fulltext', 'operator': '=', 'value': condition[0].value}
                ]
            };
        }
    }
    if (query) {
        pandora.api.findPages({
            query: query,
            range: [0, 100],
            keys: ['page']
        }, function(result) {
            renderPages(result.data.items);
        });
    } else {
        renderPages();
    }

    function doubleclick(data) {
        var $item, $target = $(data.target), page;
        if ($target.parent().parent().is('.OxSpecialTarget')) {
            $target = $target.parent().parent();
        }
        if ($target.is('.OxSpecialTarget')) {
            $item = $target.parent().parent();
            page = $item.data('page');
            // only navigate if the click really hit a page thumbnail
            if (page !== void 0) {
                pandora.URL.push(`/documents/${self.options.id}/${page}`);
            }
        }
    }

    function singleclick(data) {
        // ..
    }

    return that;

};

View file

@ -64,10 +64,7 @@ pandora.ui.documentSortSelect = function() {
pandora.ui.documentViewSelect = function() {
var ui = pandora.user.ui,
that = Ox.Select({
items: [
{id: 'list', title: Ox._('View as List')},
{id: 'grid', title: Ox._('View as Grid')}
],
items: pandora.site.collectionViews,
value: ui.documentsView,
width: 128
})

View file

@ -995,7 +995,7 @@ pandora.ui.mainMenu = function() {
if (ui.document && i < 2) {
pandora.UI.set({documentView: ['info', 'view'][i]});
} else if (i < 2) {
pandora.UI.set({collectionView: ['list', 'grid'][i]});
pandora.UI.set({collectionView: ['list', 'grid', 'pages'][i]});
}
}
});
@ -1498,7 +1498,7 @@ pandora.ui.mainMenu = function() {
return [
{ id: 'documents', title: Ox._('View Documents'), items: [
{ group: 'collectionview', min: 1, max: 1, items: pandora.site.listViews.filter(function(view) {
return Ox.contains(['list', 'grid'], view.id)
return Ox.contains(['list', 'grid', 'pages'], view.id)
}).map(function(view) {
return Ox.extend({
checked: ui.collectionView == view.id

View file

@ -419,11 +419,13 @@ appPanel
]
},
sortKeys: pandora.getSortKeys(),
documentSortKeys: pandora.getDocumentSortKeys(),
collectionViews: [
{id: 'list', title: Ox._('View as List')},
{id: 'grid', title: Ox._('View as Grid')}
]
documentSortKeys: pandora.getDocumentSortKeys()
});
pandora.site.collectionViews = (pandora.site.collectionViews || [
{id: 'list', title: 'as List'},
{id: 'grid', title: 'as Grid'}
]).map(view => {
return {id: view.id, title: Ox._('View {0}', [Ox._(view.title)])};
});
pandora.site.listSettings = {};
Ox.forEach(pandora.site.user.ui, function(val, key) {