From 95c08e929e5d29aed2a53da28cc923b5bc0a3a77 Mon Sep 17 00:00:00 2001 From: j Date: Fri, 15 Nov 2019 15:21:29 +0100 Subject: [PATCH 1/4] show remaining document keys --- static/js/documentInfoView.js | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/static/js/documentInfoView.js b/static/js/documentInfoView.js index 9b3cd79e..c9d65fd2 100644 --- a/static/js/documentInfoView.js +++ b/static/js/documentInfoView.js @@ -28,6 +28,13 @@ pandora.ui.documentInfoView = function(data, isMixed) { }).map(function(key){ return key.id; }), + displayedKeys = [ // FIXME: can this be a flag in the config? + 'title', 'notes', 'name', 'description', 'id', + 'user', 'rightslevel', 'timesaccessed', + 'extension', 'dimensions', 'size', 'matches', + 'created', 'modified', 'accessed', + 'random', 'entity' + ], statisticsWidth = 128, $bar = Ox.Bar({size: 16}) @@ -234,6 +241,10 @@ pandora.ui.documentInfoView = function(data, isMixed) { Ox.getObjectById(pandora.site.documentKeys, 'keywords') && renderGroup(['keywords']) + // Render any remaining keys defined in config + + renderRemainingKeys(); + // Description ------------------------------------------------------------- @@ -321,6 +332,7 @@ pandora.ui.documentInfoView = function(data, isMixed) { } + // Extension, Dimensions, Size --------------------------------------------- ['extension', 'dimensions', 'size'].forEach(function(key) { @@ -533,6 +545,7 @@ pandora.ui.documentInfoView = function(data, isMixed) { function renderGroup(keys) { var $element; + keys.forEach(function(key) { displayedKeys.push(key) }); if (canEdit || keys.filter(function(key) { return data[key]; }).length) { @@ -565,6 +578,17 @@ pandora.ui.documentInfoView = function(data, isMixed) { return $element; } + function renderRemainingKeys() { + var keys = pandora.site.documentKeys.filter(function(item) { + return item.id != '*' && !Ox.contains(displayedKeys, item.id); + }).map(function(item) { + return item.id; + }); + if (keys.length) { + 
renderGroup(keys) + } + } + function renderRightsLevel() { var $rightsLevelElement = getRightsLevelElement(data.rightslevel), $rightsLevelSelect; From e06b263237b6b6c1e635f79963b25e87bb4c32b6 Mon Sep 17 00:00:00 2001 From: j Date: Fri, 15 Nov 2019 16:44:43 +0100 Subject: [PATCH 2/4] always enable default filters --- pandora/app/config.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pandora/app/config.py b/pandora/app/config.py index 3a9dd36d..bbe15994 100644 --- a/pandora/app/config.py +++ b/pandora/app/config.py @@ -71,7 +71,7 @@ def load_config(init=False): if getattr(settings, 'SITEURL', False): config['site']['url'] = settings.SITEURL settings.URL = config['site']['url'] - settings.EMAIL_SUBJECT_PREFIX = '[%s]'%settings.SITENAME + settings.EMAIL_SUBJECT_PREFIX = '[%s]' % settings.SITENAME settings.DEFAULT_FROM_EMAIL = config['site']['email']['system'] settings.SERVER_EMAIL = config['site']['email']['system'] config['site']['videoprefix'] = settings.VIDEO_PREFIX @@ -79,9 +79,9 @@ def load_config(init=False): config['site']['googleapikey'] = getattr(settings, 'GOOGLE_API_KEY') config['site']['version'] = get_version() config['site']['dontValidateUser'] = not settings.AUTH_CHECK_USERNAME - if not 'folderdepth' in config['site']: + if 'folderdepth' not in config['site']: config['site']['folderdepth'] = settings.USE_IMDB and 4 or 3 - if 'sendReferrer' in config and not 'sendReferrer' in config['site']: + if 'sendReferrer' in config and 'sendReferrer' not in config['site']: config['site']['sendReferrer'] = config.pop('sendReferrer') # enable default filters if needed @@ -91,6 +91,13 @@ def load_config(init=False): key['filter'] = True sys.stderr.write('enabled filter for "%s" since its used as default filter.\n' % (key['id'])) + # enable default document filters if needed + default_filters = [f['id'] for f in config['user']['ui']['documentFilters']] + for key in config['documentKeys']: + if key['id'] in default_filters and not 
key.get('filter'): + key['filter'] = True + sys.stderr.write('enabled filter for documeny key "%s" since its used as default filter.\n' % (key['id'])) + config['keys'] = {} for key in config['itemKeys']: config['keys'][key['id']] = key From f8c1c3e328e69f478618c24080877ea3c6b84cec Mon Sep 17 00:00:00 2001 From: j Date: Sat, 16 Nov 2019 00:18:06 +0100 Subject: [PATCH 3/4] reuse lookup --- static/js/documentFilter.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/static/js/documentFilter.js b/static/js/documentFilter.js index 960eca09..ab7f39fe 100644 --- a/static/js/documentFilter.js +++ b/static/js/documentFilter.js @@ -4,7 +4,7 @@ pandora.ui.documentFilter = function(id) { var i = Ox.getIndexById(pandora.user.ui.documentFilters, id), filter = Ox.getObjectById(pandora.site.documentFilters, id), panelWidth = Ox.$document.width() - (pandora.user.ui.showSidebar * pandora.user.ui.sidebarSize) - 1, - title = Ox._(Ox.getObjectById(pandora.site.documentFilters, id).title), + title = Ox._(filter.title), //width = pandora.getFilterWidth(i, panelWidth), that = Ox.TableList({ _selected: !pandora.user.ui.showFilters From fe023c2f974631fae41eee2ae8ec3efd83930971 Mon Sep 17 00:00:00 2001 From: j Date: Sun, 17 Nov 2019 13:02:12 +0100 Subject: [PATCH 4/4] fulltext search for documents optional fulltext search for documents using elasticsearch text is extracted from pdfs and via ocr from images --- pandora/document/fulltext.py | 85 ++++++++++++++++++++++++++++++++++++ pandora/document/managers.py | 6 +++ pandora/document/models.py | 6 ++- pandora/settings.py | 3 ++ requirements.txt | 1 + vm/pandora_install.sh | 3 ++ 6 files changed, 102 insertions(+), 2 deletions(-) create mode 100644 pandora/document/fulltext.py diff --git a/pandora/document/fulltext.py b/pandora/document/fulltext.py new file mode 100644 index 00000000..3990b63d --- /dev/null +++ b/pandora/document/fulltext.py @@ -0,0 +1,85 @@ +import subprocess + +from django.conf import settings + + +def 
extract_text(pdf): + cmd = ['pdftotext', pdf, '-'] + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = p.communicate() + stdout = stdout.decode() + return stdout.strip() + +def ocr_image(path): + cmd = ['tesseract', path, '-', 'txt'] + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = p.communicate() + stdout = stdout.decode() + return stdout.strip() + +class FulltextMixin: + _ES_INDEX = "document-index" + + @classmethod + def elasticsearch(cls): + from elasticsearch import Elasticsearch + es = Elasticsearch(settings.ELASTICSEARCH_HOST) + return es + + def extract_fulltext(self): + if self.extension == 'pdf': + return extract_text(self.file.path) + elif self.extension in ('png', 'jpg'): + return ocr_image(self.file.path) + elif self.extension == 'html': + return self.data.get('text', '') + return '' + + def delete_fulltext(self): + res = self.elasticsearch().delete(index=self._ES_INDEX, doc_type='document', id=self.id) + + def update_fulltext(self): + text = self.extract_fulltext() + if text: + doc = { + 'text': text.lower() + } + res = self.elasticsearch().index(index=self._ES_INDEX, doc_type='document', id=self.id, body=doc) + + @classmethod + def find_fulltext(cls, query): + ids = cls.find_fulltext_ids(query) + return cls.objects.filter(id__in=ids) + + @classmethod + def find_fulltext_ids(cls, query): + if query[0] == '"' and query[-1] == '"': + query = { + "match_phrase": { + "text": query.lower()[1:-1] + }, + } + else: + query = { + "match": { + "text": { + "query": query.lower(), + "operator": "and" + } + } + } + ids = [] + res = None + from_ = 0 + es = cls.elasticsearch() + while not res or len(ids) < res['hits']['total']['value']: + res = es.search(index=cls._ES_INDEX, body={ + "from": from_, + "_source": False, + "query": query + }) + if not res['hits']['hits']: + break + ids += [int(r['_id']) for r in res['hits']['hits']] + from_ += len(res['hits']['hits']) + return ids 
diff --git a/pandora/document/managers.py b/pandora/document/managers.py index 7c97b0f1..b55afeea 100644 --- a/pandora/document/managers.py +++ b/pandora/document/managers.py @@ -128,6 +128,12 @@ def buildCondition(k, op, v, user, exclude=False, owner=None): else: q = Q(id=0) return q + elif key_type == 'fulltext': + qs = models.Document.find_fulltext_ids(v) + q = Q(id__in=qs) + if exclude: + q = ~Q(id__in=qs) + return q elif key_type == 'boolean': q = Q(**{'find__key': k, 'find__value': v}) if exclude: diff --git a/pandora/document/models.py b/pandora/document/models.py index 81fd4dd1..a436360a 100644 --- a/pandora/document/models.py +++ b/pandora/document/models.py @@ -30,6 +30,7 @@ from user.models import Group from . import managers from . import utils +from .fulltext import FulltextMixin User = get_user_model() @@ -40,7 +41,7 @@ def get_path(f, x): return f.path(x) @python_2_unicode_compatible -class Document(models.Model): +class Document(models.Model, FulltextMixin): created = models.DateTimeField(auto_now_add=True) modified = models.DateTimeField(auto_now=True) @@ -153,7 +154,7 @@ class Document(models.Model): i = key['id'] if i == 'rightslevel': save(i, self.rightslevel) - elif i not in ('*', 'dimensions') and i not in self.facet_keys: + elif i not in ('*', 'dimensions', 'fulltext') and i not in self.facet_keys: value = data.get(i) if isinstance(value, list): value = u'\n'.join(value) @@ -277,6 +278,7 @@ class Document(models.Model): self.update_sort() self.update_find() self.update_facets() + self.update_fulltext() new = False else: new = True diff --git a/pandora/settings.py b/pandora/settings.py index 1e8f1d56..e4f46698 100644 --- a/pandora/settings.py +++ b/pandora/settings.py @@ -204,6 +204,9 @@ CELERY_BROKER_URL = 'amqp://pandora:box@localhost:5672//pandora' SEND_CELERY_ERROR_EMAILS = False +# Elasticsearch +ELASTICSEARCH_HOST = None + #with apache x-sendfile or lighttpd set this to True XSENDFILE = False diff --git a/requirements.txt 
b/requirements.txt index d26e9bd9..ba40eaae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ tornado<5 geoip2==2.9.0 youtube-dl>=2019.4.30 python-memcached +elasticsearch diff --git a/vm/pandora_install.sh b/vm/pandora_install.sh index 3a62e90d..1c635773 100755 --- a/vm/pandora_install.sh +++ b/vm/pandora_install.sh @@ -91,6 +91,7 @@ apt-get install -y \ python3-lxml \ python3-html5lib \ python3-ox \ + python3-elasticsearch \ oxframe \ ffmpeg \ mkvtoolnix \ @@ -98,6 +99,8 @@ apt-get install -y \ imagemagick \ poppler-utils \ ipython3 \ + tesseract-ocr \ + tesseract-ocr-eng \ postfix \ postgresql-client $EXTRA