From fe023c2f974631fae41eee2ae8ec3efd83930971 Mon Sep 17 00:00:00 2001
From: j <j@mailb.org>
Date: Sun, 17 Nov 2019 13:02:12 +0100
Subject: [PATCH] fulltext search for documents

Optional fulltext search for documents, using Elasticsearch.
Text is extracted from PDFs with pdftotext, from PNG/JPG images via OCR (tesseract), and read from the stored text of HTML documents.
---
 pandora/document/fulltext.py | 85 ++++++++++++++++++++++++++++++++++++
 pandora/document/managers.py |  6 +++
 pandora/document/models.py   |  6 ++-
 pandora/settings.py          |  3 ++
 requirements.txt             |  1 +
 vm/pandora_install.sh        |  3 ++
 6 files changed, 102 insertions(+), 2 deletions(-)
 create mode 100644 pandora/document/fulltext.py

diff --git a/pandora/document/fulltext.py b/pandora/document/fulltext.py
new file mode 100644
index 00000000..3990b63d
--- /dev/null
+++ b/pandora/document/fulltext.py
@@ -0,0 +1,85 @@
+import subprocess
+
+from django.conf import settings
+
+
+def extract_text(pdf):
+    """Return the stripped text that pdftotext extracts from the file pdf."""
+    # '-' as the output file makes pdftotext write the text to stdout
+    process = subprocess.Popen(['pdftotext', pdf, '-'],
+                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    return process.communicate()[0].decode().strip()
+
+def ocr_image(path):
+    """Return the stripped text that tesseract recognizes in the image path."""
+    # '-' as the output base makes tesseract print the text to stdout
+    process = subprocess.Popen(['tesseract', path, '-', 'txt'],
+                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    return process.communicate()[0].decode().strip()
+
+class FulltextMixin:
+    """Fulltext index and search for documents, backed by Elasticsearch."""
+    _ES_INDEX = "document-index"
+
+    @classmethod
+    def elasticsearch(cls):
+        """Return a client for settings.ELASTICSEARCH_HOST."""
+        from elasticsearch import Elasticsearch
+        return Elasticsearch(settings.ELASTICSEARCH_HOST)
+
+    def extract_fulltext(self):
+        """Return the text of this document, '' if its type is unsupported."""
+        if self.extension == 'pdf':
+            return extract_text(self.file.path)
+        elif self.extension in ('png', 'jpg'):
+            return ocr_image(self.file.path)
+        elif self.extension == 'html':
+            return self.data.get('text', '')
+        return ''
+
+    def delete_fulltext(self):
+        """Remove this document from the index, if it is in there."""
+        from elasticsearch import NotFoundError
+        try:
+            self.elasticsearch().delete(index=self._ES_INDEX, doc_type='document', id=self.id)
+        except NotFoundError:
+            pass  # documents without text are never indexed, nothing to remove
+
+    def update_fulltext(self):
+        """Index the lowercased text of this document, if it has any."""
+        text = self.extract_fulltext()
+        if text:
+            doc = {'text': text.lower()}
+            self.elasticsearch().index(index=self._ES_INDEX, doc_type='document', id=self.id, body=doc)
+
+    @classmethod
+    def find_fulltext(cls, query):
+        """Return a queryset of the documents whose text matches query."""
+        return cls.objects.filter(id__in=cls.find_fulltext_ids(query))
+
+    @classmethod
+    def find_fulltext_ids(cls, query):
+        """Return the ids of all indexed documents matching query.
+
+        A query in double quotes is matched as a phrase, otherwise all of
+        its terms have to match. An empty query matches nothing.
+        """
+        if not query:
+            # nothing to search for, and query[0] below would raise
+            return []
+        query = query.lower()
+        if len(query) > 1 and query[0] == '"' and query[-1] == '"':
+            query = {"match_phrase": {"text": query[1:-1]}}
+        else:
+            query = {"match": {"text": {"query": query, "operator": "and"}}}
+        ids = []
+        es = cls.elasticsearch()
+        while True:
+            # page through the hits, "from" is the number of ids seen so far
+            res = es.search(index=cls._ES_INDEX, body={
+                "from": len(ids), "_source": False, "query": query
+            })
+            hits = res['hits']['hits']
+            ids += [int(r['_id']) for r in hits]
+            if not hits or len(ids) >= res['hits']['total']['value']:
+                return ids
diff --git a/pandora/document/managers.py b/pandora/document/managers.py
index 7c97b0f1..b55afeea 100644
--- a/pandora/document/managers.py
+++ b/pandora/document/managers.py
@@ -128,6 +128,12 @@ def buildCondition(k, op, v, user, exclude=False, owner=None):
             else:
                 q = Q(id=0)
         return q
+    elif key_type == 'fulltext':
+        qs = models.Document.find_fulltext_ids(v)
+        q = Q(id__in=qs)
+        if exclude:
+            q = ~Q(id__in=qs)
+        return q
     elif key_type == 'boolean':
         q = Q(**{'find__key': k, 'find__value': v})
         if exclude:
diff --git a/pandora/document/models.py b/pandora/document/models.py
index 81fd4dd1..a436360a 100644
--- a/pandora/document/models.py
+++ b/pandora/document/models.py
@@ -30,6 +30,7 @@ from user.models import Group
 
 from . import managers
 from . import utils
+from .fulltext import FulltextMixin
 
 User = get_user_model()
 
@@ -40,7 +41,7 @@ def get_path(f, x):
     return f.path(x)
 
 @python_2_unicode_compatible
-class Document(models.Model):
+class Document(models.Model, FulltextMixin):
 
     created = models.DateTimeField(auto_now_add=True)
     modified = models.DateTimeField(auto_now=True)
@@ -153,7 +154,7 @@ class Document(models.Model):
                 i = key['id']
                 if i == 'rightslevel':
                     save(i, self.rightslevel)
-                elif i not in ('*', 'dimensions') and i not in self.facet_keys:
+                elif i not in ('*', 'dimensions', 'fulltext') and i not in self.facet_keys:
                     value = data.get(i)
                     if isinstance(value, list):
                         value = u'\n'.join(value)
@@ -277,6 +278,7 @@ class Document(models.Model):
                 self.update_sort()
                 self.update_find()
                 self.update_facets()
+                self.update_fulltext()
             new = False
         else:
             new = True
diff --git a/pandora/settings.py b/pandora/settings.py
index 1e8f1d56..e4f46698 100644
--- a/pandora/settings.py
+++ b/pandora/settings.py
@@ -204,6 +204,9 @@ CELERY_BROKER_URL = 'amqp://pandora:box@localhost:5672//pandora'
 
 SEND_CELERY_ERROR_EMAILS = False
 
+# Elasticsearch host used for document fulltext search (None: client default, localhost:9200)
+ELASTICSEARCH_HOST = None
+
 #with apache x-sendfile or lighttpd set this to True
 XSENDFILE = False
 
diff --git a/requirements.txt b/requirements.txt
index d26e9bd9..ba40eaae 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,3 +11,4 @@ tornado<5
 geoip2==2.9.0
 youtube-dl>=2019.4.30
 python-memcached
+elasticsearch
diff --git a/vm/pandora_install.sh b/vm/pandora_install.sh
index 3a62e90d..1c635773 100755
--- a/vm/pandora_install.sh
+++ b/vm/pandora_install.sh
@@ -91,6 +91,7 @@ apt-get install -y \
     python3-lxml \
     python3-html5lib \
     python3-ox \
+    python3-elasticsearch \
     oxframe \
     ffmpeg \
     mkvtoolnix \
@@ -98,6 +99,8 @@ apt-get install -y \
     imagemagick \
     poppler-utils \
     ipython3 \
+    tesseract-ocr \
+    tesseract-ocr-eng \
     postfix \
     postgresql-client $EXTRA