fulltext search for documents

Optional fulltext search for documents using Elasticsearch. Text is extracted from PDFs and, via OCR, from images.
2019-11-17 13:02:12 +01:00 · 2019-11-17 13:02:12 +01:00 · fe023c2f97
commit fe023c2f97
parent f8c1c3e328
6 changed files with 102 additions and 2 deletions
--- a/pandora/document/fulltext.py
+++ b/pandora/document/fulltext.py
@ -0,0 +1,85 @@
 import subprocess
 from django.conf import settings
 def extract_text(pdf):
    """Return the text that ``pdftotext`` extracts from the file at *pdf*.

    The tool is asked to write to ``-`` and its standard output is captured,
    decoded with the default codec and stripped of surrounding whitespace.
    Standard error is captured and discarded; the exit status is not checked.
    """
    result = subprocess.run(
        ['pdftotext', pdf, '-'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    return result.stdout.decode().strip()
 def ocr_image(path):
    """Return the text that ``tesseract`` recognises in the image at *path*.

    The command's standard output is captured, decoded with the default
    codec and stripped of surrounding whitespace. Standard error is captured
    and discarded; the exit status is not checked.
    """
    command = ['tesseract', path, '-', 'txt']
    completed = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    recognised = completed.stdout.decode()
    return recognised.strip()
 class FulltextMixin:
    """Mixin adding Elasticsearch-backed fulltext indexing and search.

    The methods read ``self.extension``, ``self.file.path``, ``self.data`` and
    ``self.id`` and use ``cls.objects``; the class this is mixed into has to
    provide them.
    """

    # Name of the Elasticsearch index that stores one entry per document.
    _ES_INDEX = "document-index"
    # Hits requested per search round trip. Without an explicit size the
    # server's small default page is used, which means many requests for
    # large result sets.
    _ES_PAGE_SIZE = 1000

    @classmethod
    def elasticsearch(cls):
        """Return a new Elasticsearch client for ``settings.ELASTICSEARCH_HOST``."""
        # Imported lazily; presumably so this module can be imported when the
        # optional elasticsearch package is not installed.
        from elasticsearch import Elasticsearch
        return Elasticsearch(settings.ELASTICSEARCH_HOST)

    def extract_fulltext(self):
        """Return the searchable text of this document, or '' if there is none.

        PDFs go through ``extract_text``, png/jpg images through ``ocr_image``
        and html documents use the ``text`` entry of ``self.data``. Any other
        extension yields an empty string.
        """
        if self.extension == 'pdf':
            return extract_text(self.file.path)
        elif self.extension in ('png', 'jpg'):
            return ocr_image(self.file.path)
        elif self.extension == 'html':
            return self.data.get('text', '')
        return ''

    def delete_fulltext(self):
        """Remove this document's entry from the fulltext index.

        A missing entry is not treated as an error: update_fulltext() never
        indexes documents without extractable text, so there may be nothing
        to delete.
        """
        from elasticsearch.exceptions import NotFoundError
        try:
            self.elasticsearch().delete(index=self._ES_INDEX, doc_type='document', id=self.id)
        except NotFoundError:
            pass

    def update_fulltext(self):
        """Extract this document's text and store it, lowercased, in the index.

        Documents for which no text can be extracted are left untouched.
        """
        text = self.extract_fulltext()
        if not text:
            return
        doc = {
            'text': text.lower()
        }
        self.elasticsearch().index(index=self._ES_INDEX, doc_type='document', id=self.id, body=doc)

    @classmethod
    def find_fulltext(cls, query):
        """Return a queryset of the objects whose indexed text matches *query*."""
        ids = cls.find_fulltext_ids(query)
        return cls.objects.filter(id__in=ids)

    @classmethod
    def find_fulltext_ids(cls, query):
        """Return the ids (as ints) of all indexed documents matching *query*.

        A query wrapped in double quotes is run as a phrase search on the text
        between the quotes; any other query must match all of its terms. The
        query is lowercased, mirroring how text is stored by update_fulltext().
        An empty query matches nothing and returns an empty list without
        contacting Elasticsearch.
        """
        if not query:
            # Nothing to search for; also avoids indexing into an empty string.
            return []
        text = query.lower()
        # Require at least two characters so a lone '"' is not mistaken for an
        # (empty) quoted phrase.
        if len(text) >= 2 and text[0] == '"' and text[-1] == '"':
            es_query = {
                "match_phrase": {
                    "text": text[1:-1]
                },
            }
        else:
            es_query = {
                "match": {
                    "text": {
                        "query": text,
                        "operator": "and"
                    }
                }
            }
        ids = []
        es = cls.elasticsearch()
        while True:
            # The number of ids collected so far is the offset of the next page.
            res = es.search(index=cls._ES_INDEX, body={
                "from": len(ids),
                "size": cls._ES_PAGE_SIZE,
                "_source": False,
                "query": es_query
            })
            hits = res['hits']['hits']
            if not hits:
                break
            ids += [int(r['_id']) for r in hits]
            if len(ids) >= res['hits']['total']['value']:
                break
        return ids
--- a/pandora/document/managers.py
+++ b/pandora/document/managers.py
@ -128,6 +128,12 @@ def buildCondition(k, op, v, user, exclude=False, owner=None):
            else:
                q = Q(id=0)
        return q
    elif key_type == 'fulltext':
        qs = models.Document.find_fulltext_ids(v)
        q = Q(id__in=qs)
        if exclude:
            q = ~Q(id__in=qs)
        return q
    elif key_type == 'boolean':
        q = Q(**{'find__key': k, 'find__value': v})
        if exclude:
--- a/pandora/document/models.py
+++ b/pandora/document/models.py
@ -30,6 +30,7 @@ from user.models import Group
 from . import managers
 from . import utils
 from .fulltext import FulltextMixin
 User = get_user_model()
@ -40,7 +41,7 @@ def get_path(f, x):
    return f.path(x)
@python_2_unicode_compatible
-class Document(models.Model):
+class Document(models.Model, FulltextMixin):
    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)
@ -153,7 +154,7 @@ class Document(models.Model):
                i = key['id']
                if i == 'rightslevel':
                    save(i, self.rightslevel)
-                elif i not in ('*', 'dimensions') and i not in self.facet_keys:
+                elif i not in ('*', 'dimensions', 'fulltext') and i not in self.facet_keys:
                    value = data.get(i)
                    if isinstance(value, list):
                        value = u'\n'.join(value)
@ -277,6 +278,7 @@ class Document(models.Model):
                self.update_sort()
                self.update_find()
                self.update_facets()
                self.update_fulltext()
            new = False
        else:
            new = True
--- a/pandora/settings.py
+++ b/pandora/settings.py
@ -204,6 +204,9 @@ CELERY_BROKER_URL = 'amqp://pandora:box@localhost:5672//pandora'
 SEND_CELERY_ERROR_EMAILS = False
 # Elasticsearch
 ELASTICSEARCH_HOST = None
 #with apache x-sendfile or lighttpd set this to True
 XSENDFILE = False
--- a/requirements.txt
+++ b/requirements.txt
@ -11,3 +11,4 @@ tornado<5
 geoip2==2.9.0
 youtube-dl>=2019.4.30
 python-memcached
 elasticsearch
--- a/vm/pandora_install.sh
+++ b/vm/pandora_install.sh
@ -91,6 +91,7 @@ apt-get install -y \
    python3-lxml \
    python3-html5lib \
    python3-ox \
    python3-elasticsearch \
    oxframe \
    ffmpeg \
    mkvtoolnix \
@ -98,6 +99,8 @@ apt-get install -y \
    imagemagick \
    poppler-utils \
    ipython3 \
    tesseract-ocr \
    tesseract-ocr-eng \
    postfix \
    postgresql-client $EXTRA