From fe023c2f974631fae41eee2ae8ec3efd83930971 Mon Sep 17 00:00:00 2001
From: j
Date: Sun, 17 Nov 2019 13:02:12 +0100
Subject: [PATCH] fulltext search for documents

optional fulltext search for documents using elasticsearch
text is extracted from pdfs and via ocr from images
---
Review notes (this section is ignored by git am):
- line breaks and indentation were restored from a whitespace-collapsed copy
  of this patch; the indentation of context lines is a best guess and has to
  be checked against the target tree before applying
- fulltext.py differs from the original commit (the blob id in its index line
  is therefore stale): empty queries return no ids, deleting a document that
  was never indexed is not an error, search results are fetched 1000 at a
  time, and command output is decoded tolerantly

 pandora/document/fulltext.py | 112 +++++++++++++++++++++++++++++++++++
 pandora/document/managers.py |   6 +++
 pandora/document/models.py   |   6 ++-
 pandora/settings.py          |   3 ++
 requirements.txt             |   1 +
 vm/pandora_install.sh        |   3 ++
 6 files changed, 129 insertions(+), 2 deletions(-)
 create mode 100644 pandora/document/fulltext.py

diff --git a/pandora/document/fulltext.py b/pandora/document/fulltext.py
new file mode 100644
index 00000000..3990b63d
--- /dev/null
+++ b/pandora/document/fulltext.py
@@ -0,0 +1,112 @@
+import subprocess
+
+from django.conf import settings
+
+
+def _run_text_command(cmd):
+    """Run cmd and return its standard output as stripped text."""
+    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    stdout, _ = p.communicate()
+    # replace undecodable bytes instead of failing the whole extraction
+    return stdout.decode('utf-8', 'replace').strip()
+
+
+def extract_text(pdf):
+    """Return the text pdftotext extracts from the PDF file at pdf."""
+    return _run_text_command(['pdftotext', pdf, '-'])
+
+
+def ocr_image(path):
+    """Return the text tesseract recognizes in the image file at path."""
+    return _run_text_command(['tesseract', path, '-', 'txt'])
+
+
+class FulltextMixin:
+    """Elasticsearch backed fulltext index for a Django model.
+
+    Uses id, extension, file, data and objects of the class it is mixed into.
+    """
+    _ES_INDEX = "document-index"
+    # hits fetched per search request, Elasticsearch defaults to only 10
+    _ES_PAGE_SIZE = 1000
+
+    @classmethod
+    def elasticsearch(cls):
+        """Return a new client for settings.ELASTICSEARCH_HOST."""
+        # local import, elasticsearch is only loaded once the index is used
+        from elasticsearch import Elasticsearch
+        return Elasticsearch(settings.ELASTICSEARCH_HOST)
+
+    def extract_fulltext(self):
+        """Return the text of this document, '' for unsupported extensions."""
+        if self.extension == 'pdf':
+            return extract_text(self.file.path)
+        elif self.extension in ('png', 'jpg'):
+            return ocr_image(self.file.path)
+        elif self.extension == 'html':
+            return self.data.get('text', '')
+        return ''
+
+    def delete_fulltext(self):
+        """Remove this document from the index."""
+        # documents without text are never indexed, do not fail on a 404
+        self.elasticsearch().delete(index=self._ES_INDEX, doc_type='document',
+                                    id=self.id, ignore=404)
+
+    def update_fulltext(self):
+        """Extract the text of this document and store it in the index."""
+        text = self.extract_fulltext()
+        if text:
+            doc = {
+                'text': text.lower()
+            }
+            self.elasticsearch().index(index=self._ES_INDEX, doc_type='document',
+                                       id=self.id, body=doc)
+
+    @classmethod
+    def find_fulltext(cls, query):
+        """Return a queryset of the objects whose indexed text matches query."""
+        ids = cls.find_fulltext_ids(query)
+        return cls.objects.filter(id__in=ids)
+
+    @classmethod
+    def find_fulltext_ids(cls, query):
+        """Return the ids of all indexed documents matching query.
+
+        A query wrapped in double quotes is matched as a phrase, otherwise
+        all words of the query have to occur in the text.
+        """
+        if not query:
+            # nothing to search for, query[0] below would raise IndexError
+            return []
+        if len(query) > 1 and query[0] == '"' and query[-1] == '"':
+            query = {
+                "match_phrase": {
+                    "text": query.lower()[1:-1]
+                },
+            }
+        else:
+            query = {
+                "match": {
+                    "text": {
+                        "query": query.lower(),
+                        "operator": "and"
+                    }
+                }
+            }
+        ids = []
+        res = None
+        from_ = 0
+        es = cls.elasticsearch()
+        while not res or len(ids) < res['hits']['total']['value']:
+            res = es.search(index=cls._ES_INDEX, body={
+                "from": from_,
+                "size": cls._ES_PAGE_SIZE,
+                "_source": False,
+                "query": query
+            })
+            if not res['hits']['hits']:
+                break
+            ids += [int(r['_id']) for r in res['hits']['hits']]
+            from_ += len(res['hits']['hits'])
+        return ids
diff --git a/pandora/document/managers.py b/pandora/document/managers.py
index 7c97b0f1..b55afeea 100644
--- a/pandora/document/managers.py
+++ b/pandora/document/managers.py
@@ -128,6 +128,12 @@ def buildCondition(k, op, v, user, exclude=False, owner=None):
             else:
                 q = Q(id=0)
         return q
+    elif key_type == 'fulltext':
+        qs = models.Document.find_fulltext_ids(v)
+        q = Q(id__in=qs)
+        if exclude:
+            q = ~Q(id__in=qs)
+        return q
     elif key_type == 'boolean':
         q = Q(**{'find__key': k, 'find__value': v})
         if exclude:
diff --git a/pandora/document/models.py b/pandora/document/models.py
index 81fd4dd1..a436360a 100644
--- a/pandora/document/models.py
+++ b/pandora/document/models.py
@@ -30,6 +30,7 @@ from user.models import Group

 from . import managers
 from . import utils
+from .fulltext import FulltextMixin

 User = get_user_model()

@@ -40,7 +41,7 @@ def get_path(f, x):
     return f.path(x)

 @python_2_unicode_compatible
-class Document(models.Model):
+class Document(models.Model, FulltextMixin):

     created = models.DateTimeField(auto_now_add=True)
     modified = models.DateTimeField(auto_now=True)
@@ -153,7 +154,7 @@ class Document(models.Model):
             i = key['id']
             if i == 'rightslevel':
                 save(i, self.rightslevel)
-            elif i not in ('*', 'dimensions') and i not in self.facet_keys:
+            elif i not in ('*', 'dimensions', 'fulltext') and i not in self.facet_keys:
                 value = data.get(i)
                 if isinstance(value, list):
                     value = u'\n'.join(value)
@@ -277,6 +278,7 @@ class Document(models.Model):
             self.update_sort()
             self.update_find()
             self.update_facets()
+            self.update_fulltext()
             new = False
         else:
             new = True
diff --git a/pandora/settings.py b/pandora/settings.py
index 1e8f1d56..e4f46698 100644
--- a/pandora/settings.py
+++ b/pandora/settings.py
@@ -204,6 +204,9 @@ CELERY_BROKER_URL = 'amqp://pandora:box@localhost:5672//pandora'

 SEND_CELERY_ERROR_EMAILS = False

+# Elasticsearch
+ELASTICSEARCH_HOST = None
+
 #with apache x-sendfile or lighttpd set this to True
 XSENDFILE = False

diff --git a/requirements.txt b/requirements.txt
index d26e9bd9..ba40eaae 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,3 +11,4 @@ tornado<5
 geoip2==2.9.0
 youtube-dl>=2019.4.30
 python-memcached
+elasticsearch
diff --git a/vm/pandora_install.sh b/vm/pandora_install.sh
index 3a62e90d..1c635773 100755
--- a/vm/pandora_install.sh
+++ b/vm/pandora_install.sh
@@ -91,6 +91,7 @@ apt-get install -y \
     python3-lxml \
     python3-html5lib \
     python3-ox \
+    python3-elasticsearch \
     oxframe \
     ffmpeg \
     mkvtoolnix \
@@ -98,6 +99,8 @@ apt-get install -y \
     imagemagick \
     poppler-utils \
     ipython3 \
+    tesseract-ocr \
+    tesseract-ocr-eng \
     postfix \
     postgresql-client $EXTRA
