From 950bec609d27153b4b51e14d63a5ed24b191fe92 Mon Sep 17 00:00:00 2001 From: j Date: Mon, 15 Nov 2021 15:20:08 +0000 Subject: [PATCH] fulltext search in pages --- pandora/document/fulltext.py | 46 ++- .../{managers.py => managers/__init__.py} | 4 +- pandora/document/managers/pages.py | 304 ++++++++++++++++++ .../migrations/0012_auto_20200513_0001.py | 35 ++ pandora/document/models.py | 42 ++- pandora/document/tasks.py | 3 + pandora/document/views.py | 1 + static/js/URL.js | 5 +- static/js/collection.js | 63 +++- static/js/documentPages.js | 106 ++++++ static/js/documentsPanel.js | 5 +- static/js/mainMenu.js | 4 +- static/js/pandora.js | 12 +- 13 files changed, 608 insertions(+), 22 deletions(-) rename pandora/document/{managers.py => managers/__init__.py} (99%) create mode 100644 pandora/document/managers/pages.py create mode 100644 pandora/document/migrations/0012_auto_20200513_0001.py create mode 100644 static/js/documentPages.js diff --git a/pandora/document/fulltext.py b/pandora/document/fulltext.py index 040658ce9..d58e3751e 100644 --- a/pandora/document/fulltext.py +++ b/pandora/document/fulltext.py @@ -1,14 +1,31 @@ import subprocess +import tempfile from django.conf import settings -def extract_text(pdf): - cmd = ['pdftotext', pdf, '-'] +def extract_text(pdf, page=None): + if page is not None: + page = str(page) + cmd = ['pdftotext', '-f', page, '-l', page, pdf, '-'] + else: + cmd = ['pdftotext', pdf, '-'] p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, stderr = p.communicate() - stdout = stdout.decode() - return stdout.strip() + stdout = stdout.decode().strip() + if not stdout: + if page: + # split page from pdf and ocr + fd, page_pdf = tempfile.mkstemp('.pdf') + cmd = ['pdfseparate', '-f', page, '-l', page, pdf, page_pdf] + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = p.communicate() + text = ocr_image(page_pdf) + os.unlink(page_pdf) + return text + else: + return ocr_image(pdf) + return stdout def ocr_image(path): cmd = ['tesseract', path, '-', 'txt'] @@ -19,6 +36,7 @@ def ocr_image(path): class FulltextMixin: _ES_INDEX = "document-index" + _ES_DOC_TYPE = "document" @classmethod def elasticsearch(cls): @@ -43,7 +61,7 @@ class FulltextMixin: if self.has_fulltext_key(): from elasticsearch.exceptions import NotFoundError try: - res = self.elasticsearch().delete(index=self._ES_INDEX, doc_type='document', id=self.id) + res = self.elasticsearch().delete(index=self._ES_INDEX, doc_type=self._ES_DOC_TYPE, id=self.id) except NotFoundError: pass @@ -54,7 +72,7 @@ class FulltextMixin: doc = { 'text': text.lower() } - res = self.elasticsearch().index(index=self._ES_INDEX, doc_type='document', id=self.id, body=doc) + res = self.elasticsearch().index(index=self._ES_INDEX, doc_type=self._ES_DOC_TYPE, id=self.id, body=doc) @classmethod def find_fulltext(cls, query): @@ -95,3 +113,19 @@ class FulltextMixin: ids += [int(r['_id']) for r in res['hits']['hits']] from_ += len(res['hits']['hits']) return ids + + +class FulltextPageMixin(FulltextMixin): + _ES_INDEX = "document-page-index" + _DOC_TYPE = 'page' + + def extract_fulltext(self): + if self.document.file: + if self.document.extension == 'pdf': + return extract_text(self.document.file.path, self.page) + elif self.extension in ('png', 'jpg'): + return ocr_image(self.document.file.path) + elif self.extension == 'html': + # FIXME: is there a nice way to split that into pages + return self.data.get('text', '') + return '' diff --git a/pandora/document/managers.py b/pandora/document/managers/__init__.py similarity index 99% rename from pandora/document/managers.py rename to pandora/document/managers/__init__.py index 4210cd622..61a4ba9ab 100644 --- a/pandora/document/managers.py +++ b/pandora/document/managers/__init__.py @@ -14,6 +14,7 @@ from documentcollection.models import Collection from item import utils from user.models import Group +from .pages import PageManager keymap = { 'item': 'items__public_id', @@ -61,7 +62,7 @@ def parseCondition(condition, user, item=None, owner=None): def buildCondition(k, op, v, user, exclude=False, owner=None): import entity.models - from . import models + from .. import models # fixme: frontend should never call with list if k == 'list': @@ -299,3 +300,4 @@ class DocumentManager(Manager): qs = qs.filter(q) return qs + diff --git a/pandora/document/managers/pages.py b/pandora/document/managers/pages.py new file mode 100644 index 000000000..82ac7454c --- /dev/null +++ b/pandora/document/managers/pages.py @@ -0,0 +1,304 @@ +# -*- coding: utf-8 -*- +import unicodedata + +from six import string_types +from django.db.models import Q, Manager +from django.conf import settings + +import ox +from oxdjango.query import QuerySet + +import entity.managers +from oxdjango.managers import get_operator + +from documentcollection.models import Collection +from item import utils +from user.models import Group + + +keymap = { + 'item': 'items__public_id', +} +default_key = 'title' + +def get_key_type(k): + key_type = (utils.get_by_id(settings.CONFIG['documentKeys'], k) or {'type': 'string'}).get('type') + if isinstance(key_type, list): + key_type = key_type[0] + key_type = { + 'title': 'string', + 'person': 'string', + 'text': 'string', + 'year': 'string', + 'length': 'string', + 'layer': 'string', + 'list': 'list', + }.get(key_type, key_type) + return key_type + + +def parseCondition(condition, user, item=None, owner=None): + ''' + ''' + k = condition.get('key', default_key) + k = keymap.get(k, k) + if not k: + k = default_key + if item and k == 'description': + item_conditions = condition.copy() + item_conditions['key'] = 'items__itemproperties__description' + return parseCondition(condition, user) | parseCondition(item_conditions, user) + + v = condition['value'] + op = condition.get('operator') + if not op: + op = '=' + + print(k, op, v) + + if op.startswith('!'): + return buildCondition(k, op[1:], v, user, True, owner=owner) + else: + return buildCondition(k, op, v, user, owner=owner) + +def buildCondition(k, op, v, user, exclude=False, owner=None): + import entity.models + from .. import models + + # fixme: frontend should never call with list + if k == 'list': + print('fixme: frontend should never call with list', k, op, v) + k = 'collection' + + key_type = get_key_type(k) + + key_config = (utils.get_by_id(settings.CONFIG['documentKeys'], k) or {'type': 'string'}) + + facet_keys = models.Document.facet_keys + if k == 'document': + k = 'document__id' + if op == '&' and isinstance(v, list): + v = [ox.fromAZ(id_) for id_ in v] + k += get_operator(op) + else: + v = ox.fromAZ(v) + q = Q(**{k: v}) + if exclude: + q = ~Q(document__id__in=models.Document.objects.filter(q)) + return q + elif k == 'rightslevel': + q = Q(document__rightslevel=v) + if exclude: + q = ~Q(document__rightslevel=v) + return q + elif k == 'groups': + if op == '==' and v == '$my': + if not owner: + owner = user + groups = owner.groups.all() + else: + key = 'name' + get_operator(op) + groups = Group.objects.filter(**{key: v}) + if not groups.count(): + return Q(id=0) + q = Q(document__groups__in=groups) + if exclude: + q = ~q + return q + elif k in ('oshash', 'items__public_id'): + q = Q(**{k: v}) + if exclude: + q = ~Q(id__in=models.Document.objects.filter(q)) + return q + elif isinstance(v, bool): + key = k + elif k == 'entity': + entity_key, entity_v = entity.managers.namePredicate(op, v) + key = 'id__in' + v = entity.models.DocumentProperties.objects.filter(**{ + 'entity__' + entity_key: entity_v + }).values_list('document_id', flat=True) + elif k == 'collection': + q = Q(id=0) + l = v.split(":", 1) + if len(l) >= 2: + lqs = list(Collection.objects.filter(name=l[1], user__username=l[0])) + if len(lqs) == 1 and lqs[0].accessible(user): + l = lqs[0] + if l.query.get('static', False) is False: + data = l.query + q = parseConditions(data.get('conditions', []), + data.get('operator', '&'), + user, owner=l.user) + else: + q = Q(id__in=l.documents.all()) + else: + q = Q(id=0) + return q + elif key_config.get('fulltext'): + print('fulltext?') + qs = models.Page.find_fulltext_ids(v) + q = Q(id__in=qs) + if exclude: + q = ~Q(id__in=qs) + return q + elif key_type == 'boolean': + q = Q(**{'find__key': k, 'find__value': v}) + if exclude: + q = ~Q(id__in=models.Document.objects.filter(q)) + return q + elif key_type == "string": + in_find = True + if in_find: + value_key = 'find__value' + else: + value_key = k + if isinstance(v, string_types): + v = unicodedata.normalize('NFKD', v).lower() + if k in facet_keys: + in_find = False + facet_value = 'facets__value' + get_operator(op, 'istr') + v = models.Document.objects.filter(**{'facets__key': k, facet_value: v}) + value_key = 'id__in' + else: + value_key = value_key + get_operator(op) + k = str(k) + value_key = str(value_key) + if k == '*': + q = Q(**{'find__value' + get_operator(op): v}) | \ + Q(**{'facets__value' + get_operator(op, 'istr'): v}) + elif in_find: + q = Q(**{'find__key': k, value_key: v}) + else: + q = Q(**{value_key: v}) + if exclude: + q = ~Q(id__in=models.Document.objects.filter(q)) + return q + elif key_type == 'date': + def parse_date(d): + while len(d) < 3: + d.append(1) + return datetime(*[int(i) for i in d]) + + #using sort here since find only contains strings + v = parse_date(v.split('-')) + vk = 'sort__%s%s' % (k, get_operator(op, 'int')) + vk = str(vk) + q = Q(**{vk: v}) + if exclude: + q = ~q + return q + else: # integer, float, list, time + #use sort table here + if key_type == 'time': + v = int(utils.parse_time(v)) + + vk = 'sort__%s%s' % (k, get_operator(op, 'int')) + vk = str(vk) + q = Q(**{vk: v}) + if exclude: + q = ~q + return q + key = str(key) + q = Q(**{key: v}) + if exclude: + q = ~q + return q + + +def parseConditions(conditions, operator, user, item=None, owner=None): + ''' + conditions: [ + { + value: "war" + } + { + key: "year", + value: "1970-1980, + operator: "!=" + }, + { + key: "country", + value: "f", + operator: "^" + } + ], + operator: "&" + ''' + conn = [] + for condition in conditions: + if 'conditions' in condition: + q = parseConditions(condition['conditions'], + condition.get('operator', '&'), user, item, owner=owner) + if q: + conn.append(q) + pass + else: + conn.append(parseCondition(condition, user, item, owner=owner)) + if conn: + q = conn[0] + for c in conn[1:]: + if operator == '|': + q = q | c + else: + q = q & c + return q + return None + + +class PageManager(Manager): + + def get_query_set(self): + return QuerySet(self.model) + + def find(self, data, user, item=None): + ''' + query: { + conditions: [ + { + value: "war" + } + { + key: "year", + value: "1970-1980, + operator: "!=" + }, + { + key: "country", + value: "f", + operator: "^" + } + ], + operator: "&" + } + ''' + + #join query with operator + qs = self.get_query_set() + query = data.get('query', {}) + conditions = parseConditions(query.get('conditions', []), + query.get('operator', '&'), + user, item) + if conditions: + qs = qs.filter(conditions) + qs = qs.distinct() + + #anonymous can only see public items + if not user or user.is_anonymous: + level = 'guest' + allowed_level = settings.CONFIG['capabilities']['canSeeDocument'][level] + qs = qs.filter(document__rightslevel__lte=allowed_level) + rendered_q = Q(rendered=True) + #users can see public items, there own items and items of there groups + else: + level = user.profile.get_level() + allowed_level = settings.CONFIG['capabilities']['canSeeDocument'][level] + q = Q(document__rightslevel__lte=allowed_level) | Q(document__user=user) + rendered_q = Q(rendered=True) | Q(document__user=user) + if user.groups.count(): + q |= Q(document__groups__in=user.groups.all()) + rendered_q |= Q(document__groups__in=user.groups.all()) + qs = qs.filter(q) + + return qs + diff --git a/pandora/document/migrations/0012_auto_20200513_0001.py b/pandora/document/migrations/0012_auto_20200513_0001.py new file mode 100644 index 000000000..2bf7b0abe --- /dev/null +++ b/pandora/document/migrations/0012_auto_20200513_0001.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.22 on 2020-05-13 00:01 +from __future__ import unicode_literals + +import django.core.serializers.json +from django.db import migrations, models +import django.db.models.deletion +import document.fulltext +import oxdjango.fields + + +class Migration(migrations.Migration): + + dependencies = [ + ('document', '0011_jsonfield'), + ] + + operations = [ + migrations.CreateModel( + name='Page', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('created', models.DateTimeField(auto_now_add=True)), + ('modified', models.DateTimeField(auto_now=True)), + ('page', models.IntegerField(default=1)), + ('data', oxdjango.fields.JSONField(default=dict, editable=False, encoder=django.core.serializers.json.DjangoJSONEncoder)), + ], + bases=(models.Model, document.fulltext.FulltextPageMixin), + ), + migrations.AddField( + model_name='page', + name='document', + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='pages_set', to='document.Document'), + ), + ] diff --git a/pandora/document/models.py b/pandora/document/models.py index e25012440..05c4c089d 100644 --- a/pandora/document/models.py +++ b/pandora/document/models.py @@ -29,7 +29,7 @@ from user.utils import update_groups from . import managers from . import utils from . import tasks -from .fulltext import FulltextMixin +from .fulltext import FulltextMixin, FulltextPageMixin User = get_user_model() @@ -586,6 +586,11 @@ class Document(models.Model, FulltextMixin): image = os.path.join(os.path.dirname(pdf), '1024p%d.jpg' % page) utils.extract_pdfpage(pdf, image, page) + def create_pages(self): + for page in range(self.pages): + page += 1 + p, c = Page.objects.get_or_create(document=self, page=page) + def get_info(self): if self.extension == 'pdf': self.thumbnail(1024) @@ -702,6 +707,41 @@ class ItemProperties(models.Model): super(ItemProperties, self).save(*args, **kwargs) +class Page(models.Model, FulltextPageMixin): + + created = models.DateTimeField(auto_now_add=True) + modified = models.DateTimeField(auto_now=True) + + document = models.ForeignKey(Document, related_name='pages_set', on_delete=models.CASCADE) + page = models.IntegerField(default=1) + data = JSONField(default=dict, editable=False) + + objects = managers.PageManager() + + def __str__(self): + return u"%s:%s" % (self.document, self.page) + + def json(self, keys=None, user=None): + data = {} + data['document'] = ox.toAZ(self.document.id) + data['page'] = self.page + data['id'] = '{document}/{page}'.format(**data) + document_keys = [] + if keys: + for key in list(data): + if key not in keys: + del data[key] + for key in keys: + if 'fulltext' in key: + data['fulltext'] = self.extract_fulltext() + elif key in ('document', 'page', 'id'): + pass + else: + document_keys.append(key) + if document_keys: + data.update(self.document.json(document_keys, user)) + return data + class Access(models.Model): class Meta: unique_together = ("document", "user") diff --git a/pandora/document/tasks.py b/pandora/document/tasks.py index 7bede1a90..462dab963 100644 --- a/pandora/document/tasks.py +++ b/pandora/document/tasks.py @@ -6,6 +6,9 @@ def extract_fulltext(id): from . import models d = models.Document.objects.get(id=id) d.update_fulltext() + d.create_pages() + for page in d.pages_set.all(): + page.update_fulltext() @task(queue='default') diff --git a/pandora/document/views.py b/pandora/document/views.py index 5fc474661..388bbd436 100644 --- a/pandora/document/views.py +++ b/pandora/document/views.py @@ -24,6 +24,7 @@ from changelog.models import add_changelog from . import models from . import tasks +from . import page_views def get_document_or_404_json(request, id): response = {'status': {'code': 404, diff --git a/static/js/URL.js b/static/js/URL.js index 4a08db119..c21a6ee67 100644 --- a/static/js/URL.js +++ b/static/js/URL.js @@ -399,13 +399,14 @@ pandora.URL = (function() { // Documents views['documents'] = { - list: ['grid', 'list'], + list: ['grid', 'list', 'pages'], item: ['view', 'info'] }; sortKeys['documents'] = { list: { list: pandora.site.documentKeys, - grid: pandora.site.documentKeys + grid: pandora.site.documentKeys, + pages: pandora.site.documentKeys }, item: {} }; diff --git a/static/js/collection.js b/static/js/collection.js index 5b188f7f2..c3c29167b 100644 --- a/static/js/collection.js +++ b/static/js/collection.js @@ -124,6 +124,67 @@ pandora.ui.collection = function() { unique: 'id' }) .addClass('OxMedia'); + } else if (view == 'pages') { + that = Ox.InfoList({ + borderRadius: 0, + defaultRatio: 640/1024, + draggable: true, + id: 'list', + item: function(data, sort, size) { + size = 128; + var sortKey = sort[0].key, + infoKey = sortKey == 'title' ? 'extension' : sortKey, + key = Ox.getObjectById(pandora.site.documentKeys, infoKey), + info = pandora.formatDocumentKey(key, data, size); + return { + icon: { + height: Math.round(data.ratio > 1 ? size / data.ratio : size), + id: data.id, + info: info, + title: data.title, + url: pandora.getMediaURL('/documents/' + data.id + '/256p.jpg?' + data.modified), + width: Math.round(data.ratio >= 1 ? size : size * data.ratio) + }, + info: { + css: {marginTop: '2px'}, + element: pandora.ui.documentPages, + id: data.id, + options: { + id: data.id, + pages: data.pages, + query: ui.findDocuments, + ratio: data.ratio + } + } + }; + }, + items: function(data, callback) { + pandora.api.findDocuments(Ox.extend(data, { + query: ui.findDocuments + }), callback); + return Ox.clone(data, true); + }, + keys: ['id', 'pages', 'title', 'ratio', 'modified'], + selected: ui.listSelection, + size: 192, + sort: ui.collectionSort.concat([ + {key: 'extension', operator: '+'}, + {key: 'title', operator: '+'} + ]), + unique: 'id', + width: window.innerWidth + - ui.showSidebar * ui.sidebarSize - 1 + - Ox.UI.SCROLLBAR_SIZE + }) + .addClass('OxMedia') + .bindEvent({ + key_left: function() { + // ... + }, + key_right: function() { + // ... + } + }); } if (['list', 'grid'].indexOf(view) > -1) { @@ -138,7 +199,7 @@ pandora.ui.collection = function() { }); } - if (['list', 'grid'].indexOf(view) > -1) { + if (['list', 'grid', 'pages'].indexOf(view) > -1) { //fixme diff --git a/static/js/documentPages.js b/static/js/documentPages.js new file mode 100644 index 000000000..9ec8b49a6 --- /dev/null +++ b/static/js/documentPages.js @@ -0,0 +1,106 @@ +'use strict'; + +pandora.ui.documentPages = function(options) { + + var self = {}, + that = Ox.Element() + .css({ + height: '192px', + margin: '4px', + display: 'flex' + }) + .bindEvent({ + doubleclick: doubleclick, + singleclick: singleclick + }); + + self.options = Ox.extend({ + id: '', + pages: 1, + query: null, + ratio: 8/5 + }, options); + + self.size = 128; + self.width = self.options.ratio > 1 ? self.size : Math.round(self.size * self.options.ratio); + self.height = self.options.ratio > 1 ? Math.round(self.size / self.options.ratio) : self.size; + + function renderPage(page) { + var url = `/documents/${self.options.id}/${self.size}p${page}.jpg` + var $item = Ox.IconItem({ + imageHeight: self.height, + imageWidth: self.width, + id: `${self.options.id}/${page}`, + info: '', + title: `Page ${page}`, + url: url + }) + .addClass('OxInfoIcon') + .css({ + }) + .data({ + page: page + }); + $item.find('.OxTarget').addClass('OxSpecialTarget'); + that.append($item); + } + + function renderPages(pages) { + console.log('renderPages', pages, self.options.pages) + if (pages) { + console.log('renderPages', pages) + pages.forEach(page => { + renderPage(page.page) + }) + } else { + if (self.options.pages > 1) { + Ox.range(Ox.min([self.options.pages, 5])).forEach(page => { renderPage(page + 2) }) + } + } + } + var query + if (self.options.query) { + var condition = self.options.query.conditions.filter(condition => { + return condition.key == 'fulltext' + }) + if (condition.length) { + query = { + 'conditions': [ + {'key': 'document', 'operator': '==', 'value': self.options.id}, + {'key': 'fulltext', 'operator': '=', 'value': condition[0].value} + ] + } + } + } + if (query) { + pandora.api.findPages({ + query: query, + range: [0, 100], + keys: ['page'] + }, function(result) { + renderPages(result.data.items) + }) + } else { + renderPages() + } + + function doubleclick(data) { + var $item, $target = $(data.target), annotation, item, points, set; + if ($target.parent().parent().is('.OxSpecialTarget')) { + $target = $target.parent().parent(); + } + if ($target.is('.OxSpecialTarget')) { + $item = $target.parent().parent(); + var page = $item.data('page') + pandora.URL.push(`/documents/${self.options.id}/${page}`); + } + } + + function singleclick(data) { + // .. + } + + return that; + +}; + diff --git a/static/js/documentsPanel.js b/static/js/documentsPanel.js index 2337352b5..73cbdf225 100644 --- a/static/js/documentsPanel.js +++ b/static/js/documentsPanel.js @@ -64,10 +64,7 @@ pandora.ui.documentSortSelect = function() { pandora.ui.documentViewSelect = function() { var ui = pandora.user.ui, that = Ox.Select({ - items: [ - {id: 'list', title: Ox._('View as List')}, - {id: 'grid', title: Ox._('View as Grid')} - ], + items: pandora.site.collectionViews, value: ui.documentsView, width: 128 }) diff --git a/static/js/mainMenu.js b/static/js/mainMenu.js index 30ead3652..dc7a0abe9 100644 --- a/static/js/mainMenu.js +++ b/static/js/mainMenu.js @@ -995,7 +995,7 @@ pandora.ui.mainMenu = function() { if (ui.document && i < 2) { pandora.UI.set({documentView: ['info', 'view'][i]}); } else if (i < 2) { - pandora.UI.set({collectionView: ['list', 'grid'][i]}); + pandora.UI.set({collectionView: ['list', 'grid', 'pages'][i]}); } } }); @@ -1498,7 +1498,7 @@ pandora.ui.mainMenu = function() { return [ { id: 'documents', title: Ox._('View Documents'), items: [ { group: 'collectionview', min: 1, max: 1, items: pandora.site.listViews.filter(function(view) { - return Ox.contains(['list', 'grid'], view.id) + return Ox.contains(['list', 'grid', 'pages'], view.id) }).map(function(view) { return Ox.extend({ checked: ui.collectionView == view.id diff --git a/static/js/pandora.js b/static/js/pandora.js index 8770be61c..96a6ece9d 100644 --- a/static/js/pandora.js +++ b/static/js/pandora.js @@ -419,11 +419,13 @@ appPanel ] }, sortKeys: pandora.getSortKeys(), - documentSortKeys: pandora.getDocumentSortKeys(), - collectionViews: [ - {id: 'list', title: Ox._('View as List')}, - {id: 'grid', title: Ox._('View as Grid')} - ] + documentSortKeys: pandora.getDocumentSortKeys() + }); + pandora.site.collectionViews = (pandora.site.collectionViews || [ + {id: 'list', title: 'as List'}, + {id: 'grid', title: 'as Grid'} + ]).map(view => { + return {id: view.id, title: Ox._('View {0}', [Ox._(view.title)])}; }); pandora.site.listSettings = {}; Ox.forEach(pandora.site.user.ui, function(val, key) {