fulltext search for documents

Optional fulltext search for documents using Elasticsearch. Text is extracted from PDFs and, via OCR, from images.
2019-11-17 13:02:12 +01:00 · 2019-11-17 13:02:12 +01:00 · fe023c2f97
commit fe023c2f97
parent f8c1c3e328
6 changed files with 102 additions and 2 deletions
--- a/pandora/document/fulltext.py
+++ b/pandora/document/fulltext.py
@ -0,0 +1,85 @@
+import subprocess
+
+from django.conf import settings
+
+
+def extract_text(pdf):
+    """Return the text that ``pdftotext`` extracts from the file at *pdf*.
+
+    Runs ``pdftotext <pdf> -`` and reads the extracted text from the
+    command's stdout. The exit status and stderr are not inspected, so a
+    failed run that prints nothing yields an empty string.
+
+    :param pdf: path of the PDF file to read
+    :return: extracted text with surrounding whitespace stripped
+    """
+    cmd = ['pdftotext', pdf, '-']
+    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    stdout, _ = p.communicate()
+    # Replace undecodable bytes instead of raising UnicodeDecodeError, so a
+    # file producing stray non-UTF-8 output still yields usable text.
+    return stdout.decode(errors='replace').strip()
+
+def ocr_image(path):
+    """Return the text that ``tesseract`` recognises in the image at *path*.
+
+    Runs ``tesseract <path> - txt`` and reads the recognised text from the
+    command's stdout. The exit status and stderr are not inspected, so a
+    failed run that prints nothing yields an empty string.
+
+    :param path: path of the image file to run OCR on
+    :return: recognised text with surrounding whitespace stripped
+    """
+    cmd = ['tesseract', path, '-', 'txt']
+    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    stdout, _ = p.communicate()
+    # Replace undecodable bytes instead of raising UnicodeDecodeError, so
+    # stray non-UTF-8 output does not turn into an exception.
+    return stdout.decode(errors='replace').strip()
+
+class FulltextMixin:
+    """Mixin adding Elasticsearch-backed fulltext indexing and search.
+
+    The methods below read ``self.id``, ``self.extension``,
+    ``self.file.path`` and ``self.data`` as well as the class-level
+    ``objects`` manager; the class this is mixed into must provide them.
+    """
+
+    # Name of the Elasticsearch index all fulltext entries are stored in.
+    _ES_INDEX = "document-index"
+
+    @classmethod
+    def elasticsearch(cls):
+        """Return a new client connected to ``settings.ELASTICSEARCH_HOST``."""
+        # Imported inside the method, so the elasticsearch package is only
+        # required once fulltext features are actually used.
+        from elasticsearch import Elasticsearch
+        return Elasticsearch(settings.ELASTICSEARCH_HOST)
+
+    def extract_fulltext(self):
+        """Return the searchable text of this document, or ``''`` if none.
+
+        PDFs go through ``extract_text``, ``png``/``jpg`` images through
+        ``ocr_image``, and ``html`` documents use the ``'text'`` entry of
+        ``self.data``. Any other extension yields an empty string.
+        """
+        if self.extension == 'pdf':
+            return extract_text(self.file.path)
+        elif self.extension in ('png', 'jpg'):
+            return ocr_image(self.file.path)
+        elif self.extension == 'html':
+            return self.data.get('text', '')
+        return ''
+
+    def delete_fulltext(self):
+        """Remove this document's entry from the fulltext index.
+
+        A missing entry is not treated as an error: ``update_fulltext``
+        skips documents without text, so such documents were never indexed.
+        """
+        self.elasticsearch().delete(
+            index=self._ES_INDEX, doc_type='document', id=self.id,
+            ignore=404
+        )
+
+    def update_fulltext(self):
+        """Extract this document's text and store it, lowercased, in the index.
+
+        Nothing is written when no text could be extracted.
+        """
+        text = self.extract_fulltext()
+        if text:
+            doc = {
+                'text': text.lower()
+            }
+            self.elasticsearch().index(index=self._ES_INDEX, doc_type='document', id=self.id, body=doc)
+
+    @classmethod
+    def find_fulltext(cls, query):
+        """Return a queryset of the objects whose indexed text matches *query*."""
+        ids = cls.find_fulltext_ids(query)
+        return cls.objects.filter(id__in=ids)
+
+    @staticmethod
+    def _fulltext_query(query):
+        """Build the Elasticsearch query clause for the search string *query*.
+
+        A string wrapped in double quotes becomes a ``match_phrase`` on the
+        quoted content; anything else becomes a ``match`` with operator
+        ``and``. The search text is lowercased in both cases.
+        """
+        text = query.lower()
+        # Require at least two characters so a lone '"' is not mistaken for
+        # an (empty) quoted phrase.
+        if len(text) >= 2 and text.startswith('"') and text.endswith('"'):
+            return {
+                "match_phrase": {
+                    "text": text[1:-1]
+                },
+            }
+        return {
+            "match": {
+                "text": {
+                    "query": text,
+                    "operator": "and"
+                }
+            }
+        }
+
+    @classmethod
+    def find_fulltext_ids(cls, query):
+        """Return the ids, as ints, of all index entries matching *query*.
+
+        An empty query returns an empty list without contacting
+        Elasticsearch. Otherwise results are fetched page by page, using the
+        number of ids collected so far as the ``from`` offset, until the
+        reported total is reached or a page comes back empty.
+        """
+        if not query:
+            return []
+        clause = cls._fulltext_query(query)
+        ids = []
+        es = cls.elasticsearch()
+        # NOTE(review): no "size" is sent, so each page has the server's
+        # default size; the reported total may also be capped by the
+        # server - confirm whether very large result sets matter here.
+        while True:
+            res = es.search(index=cls._ES_INDEX, body={
+                "from": len(ids),
+                "_source": False,
+                "query": clause
+            })
+            hits = res['hits']['hits']
+            if not hits:
+                break
+            ids.extend(int(hit['_id']) for hit in hits)
+            if len(ids) >= res['hits']['total']['value']:
+                break
+        return ids