Compare commits

..

4 commits

Author SHA1 Message Date
j
fe023c2f97 fulltext search for documents
optional fulltext search for documents using elasticsearch
text is extracted from pdfs and via ocr from images
2019-11-17 13:02:12 +01:00
j
f8c1c3e328 reuse lookup 2019-11-16 00:18:06 +01:00
j
e06b263237 always enable default filters 2019-11-15 16:44:43 +01:00
j
95c08e929e show remaining document keys 2019-11-15 15:21:29 +01:00
9 changed files with 137 additions and 6 deletions

View file

@ -71,7 +71,7 @@ def load_config(init=False):
if getattr(settings, 'SITEURL', False): if getattr(settings, 'SITEURL', False):
config['site']['url'] = settings.SITEURL config['site']['url'] = settings.SITEURL
settings.URL = config['site']['url'] settings.URL = config['site']['url']
settings.EMAIL_SUBJECT_PREFIX = '[%s]'%settings.SITENAME settings.EMAIL_SUBJECT_PREFIX = '[%s]' % settings.SITENAME
settings.DEFAULT_FROM_EMAIL = config['site']['email']['system'] settings.DEFAULT_FROM_EMAIL = config['site']['email']['system']
settings.SERVER_EMAIL = config['site']['email']['system'] settings.SERVER_EMAIL = config['site']['email']['system']
config['site']['videoprefix'] = settings.VIDEO_PREFIX config['site']['videoprefix'] = settings.VIDEO_PREFIX
@ -79,9 +79,9 @@ def load_config(init=False):
config['site']['googleapikey'] = getattr(settings, 'GOOGLE_API_KEY') config['site']['googleapikey'] = getattr(settings, 'GOOGLE_API_KEY')
config['site']['version'] = get_version() config['site']['version'] = get_version()
config['site']['dontValidateUser'] = not settings.AUTH_CHECK_USERNAME config['site']['dontValidateUser'] = not settings.AUTH_CHECK_USERNAME
if not 'folderdepth' in config['site']: if 'folderdepth' not in config['site']:
config['site']['folderdepth'] = settings.USE_IMDB and 4 or 3 config['site']['folderdepth'] = settings.USE_IMDB and 4 or 3
if 'sendReferrer' in config and not 'sendReferrer' in config['site']: if 'sendReferrer' in config and 'sendReferrer' not in config['site']:
config['site']['sendReferrer'] = config.pop('sendReferrer') config['site']['sendReferrer'] = config.pop('sendReferrer')
# enable default filters if needed # enable default filters if needed
@ -91,6 +91,13 @@ def load_config(init=False):
key['filter'] = True key['filter'] = True
sys.stderr.write('enabled filter for "%s" since its used as default filter.\n' % (key['id'])) sys.stderr.write('enabled filter for "%s" since its used as default filter.\n' % (key['id']))
# enable default document filters if needed
default_filters = [f['id'] for f in config['user']['ui']['documentFilters']]
for key in config['documentKeys']:
if key['id'] in default_filters and not key.get('filter'):
key['filter'] = True
sys.stderr.write('enabled filter for documeny key "%s" since its used as default filter.\n' % (key['id']))
config['keys'] = {} config['keys'] = {}
for key in config['itemKeys']: for key in config['itemKeys']:
config['keys'][key['id']] = key config['keys'][key['id']] = key

View file

@ -0,0 +1,85 @@
import subprocess
from django.conf import settings
def extract_text(pdf):
    """Return the text of a PDF file as extracted by the ``pdftotext`` tool.

    ``pdf`` is the path handed to ``pdftotext``. The tool's standard output
    is decoded with the default codec and returned with surrounding
    whitespace removed. Standard error is captured and discarded; the exit
    status is not checked.
    """
    # NOTE(review): '-' as the output argument presumably makes pdftotext
    # write to stdout (that is where the text is read from) -- confirm
    # against the poppler-utils man page.
    process = subprocess.Popen(
        ['pdftotext', pdf, '-'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    output, _ = process.communicate()
    return output.decode().strip()
def ocr_image(path):
    """Return the text recognised in an image by the ``tesseract`` OCR tool.

    ``path`` is the image file handed to ``tesseract``. The tool's standard
    output is decoded with the default codec and returned with surrounding
    whitespace removed. Standard error is captured and discarded; the exit
    status is not checked.
    """
    # NOTE(review): '-' as the output base presumably sends the recognised
    # text to stdout and 'txt' selects plain-text output -- confirm against
    # the tesseract command-line documentation.
    process = subprocess.Popen(
        ['tesseract', path, '-', 'txt'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    output, _ = process.communicate()
    return output.decode().strip()
class FulltextMixin:
    """Mixin adding Elasticsearch-backed fulltext indexing and search.

    The methods below read ``id``, ``extension``, ``file.path`` and ``data``
    from the instance and ``objects`` from the class, so the host class has
    to provide them.
    """

    # Name of the Elasticsearch index that stores the extracted text.
    _ES_INDEX = "document-index"

    @classmethod
    def elasticsearch(cls):
        """Return a new Elasticsearch client for settings.ELASTICSEARCH_HOST."""
        # Imported here rather than at module level, so importing this
        # module does not require the elasticsearch package.
        from elasticsearch import Elasticsearch
        return Elasticsearch(settings.ELASTICSEARCH_HOST)

    def extract_fulltext(self):
        """Return the searchable text of this document.

        PDFs go through extract_text(), png/jpg images through ocr_image(),
        html documents use the 'text' entry of ``self.data``. Any other
        extension yields an empty string.
        """
        if self.extension == 'pdf':
            return extract_text(self.file.path)
        elif self.extension in ('png', 'jpg'):
            return ocr_image(self.file.path)
        elif self.extension == 'html':
            return self.data.get('text', '')
        return ''

    def delete_fulltext(self):
        """Remove this document from the fulltext index.

        A document that is not in the index is silently ignored.
        """
        from elasticsearch.exceptions import NotFoundError
        try:
            self.elasticsearch().delete(index=self._ES_INDEX, doc_type='document', id=self.id)
        except NotFoundError:
            # update_fulltext() never indexes documents without text, so
            # there may be nothing to delete for this id.
            pass

    def update_fulltext(self):
        """Extract this document's text and store it, lower-cased, in the index.

        Nothing is sent to Elasticsearch when no text could be extracted.
        """
        text = self.extract_fulltext()
        if text:
            doc = {
                'text': text.lower()
            }
            self.elasticsearch().index(index=self._ES_INDEX, doc_type='document', id=self.id, body=doc)

    @classmethod
    def find_fulltext(cls, query):
        """Return a queryset of the objects whose indexed text matches ``query``."""
        ids = cls.find_fulltext_ids(query)
        return cls.objects.filter(id__in=ids)

    @classmethod
    def find_fulltext_ids(cls, query):
        """Return the ids (as ints) of all indexed documents matching ``query``.

        A query wrapped in double quotes is searched as a phrase; otherwise
        every term of the query has to match. An empty query matches
        nothing and returns ``[]`` without contacting Elasticsearch.
        """
        if not query:
            # query[0] below would raise IndexError on an empty string
            return []
        if query[0] == '"' and query[-1] == '"':
            es_query = {
                "match_phrase": {
                    "text": query.lower()[1:-1]
                },
            }
        else:
            es_query = {
                "match": {
                    "text": {
                        "query": query.lower(),
                        "operator": "and"
                    }
                }
            }
        ids = []
        res = None
        from_ = 0
        es = cls.elasticsearch()
        # Page through the results until all reported hits are collected.
        while not res or len(ids) < res['hits']['total']['value']:
            res = es.search(index=cls._ES_INDEX, body={
                "from": from_,
                "_source": False,
                "query": es_query
            })
            if not res['hits']['hits']:
                # No further results even though the total is not reached;
                # stop instead of looping forever.
                break
            ids += [int(r['_id']) for r in res['hits']['hits']]
            from_ += len(res['hits']['hits'])
        return ids

View file

@ -128,6 +128,12 @@ def buildCondition(k, op, v, user, exclude=False, owner=None):
else: else:
q = Q(id=0) q = Q(id=0)
return q return q
elif key_type == 'fulltext':
qs = models.Document.find_fulltext_ids(v)
q = Q(id__in=qs)
if exclude:
q = ~Q(id__in=qs)
return q
elif key_type == 'boolean': elif key_type == 'boolean':
q = Q(**{'find__key': k, 'find__value': v}) q = Q(**{'find__key': k, 'find__value': v})
if exclude: if exclude:

View file

@ -30,6 +30,7 @@ from user.models import Group
from . import managers from . import managers
from . import utils from . import utils
from .fulltext import FulltextMixin
User = get_user_model() User = get_user_model()
@ -40,7 +41,7 @@ def get_path(f, x):
return f.path(x) return f.path(x)
@python_2_unicode_compatible @python_2_unicode_compatible
class Document(models.Model): class Document(models.Model, FulltextMixin):
created = models.DateTimeField(auto_now_add=True) created = models.DateTimeField(auto_now_add=True)
modified = models.DateTimeField(auto_now=True) modified = models.DateTimeField(auto_now=True)
@ -153,7 +154,7 @@ class Document(models.Model):
i = key['id'] i = key['id']
if i == 'rightslevel': if i == 'rightslevel':
save(i, self.rightslevel) save(i, self.rightslevel)
elif i not in ('*', 'dimensions') and i not in self.facet_keys: elif i not in ('*', 'dimensions', 'fulltext') and i not in self.facet_keys:
value = data.get(i) value = data.get(i)
if isinstance(value, list): if isinstance(value, list):
value = u'\n'.join(value) value = u'\n'.join(value)
@ -277,6 +278,7 @@ class Document(models.Model):
self.update_sort() self.update_sort()
self.update_find() self.update_find()
self.update_facets() self.update_facets()
self.update_fulltext()
new = False new = False
else: else:
new = True new = True

View file

@ -204,6 +204,9 @@ CELERY_BROKER_URL = 'amqp://pandora:box@localhost:5672//pandora'
SEND_CELERY_ERROR_EMAILS = False SEND_CELERY_ERROR_EMAILS = False
# Elasticsearch
ELASTICSEARCH_HOST = None
#with apache x-sendfile or lighttpd set this to True #with apache x-sendfile or lighttpd set this to True
XSENDFILE = False XSENDFILE = False

View file

@ -11,3 +11,4 @@ tornado<5
geoip2==2.9.0 geoip2==2.9.0
youtube-dl>=2019.4.30 youtube-dl>=2019.4.30
python-memcached python-memcached
elasticsearch

View file

@ -4,7 +4,7 @@ pandora.ui.documentFilter = function(id) {
var i = Ox.getIndexById(pandora.user.ui.documentFilters, id), var i = Ox.getIndexById(pandora.user.ui.documentFilters, id),
filter = Ox.getObjectById(pandora.site.documentFilters, id), filter = Ox.getObjectById(pandora.site.documentFilters, id),
panelWidth = Ox.$document.width() - (pandora.user.ui.showSidebar * pandora.user.ui.sidebarSize) - 1, panelWidth = Ox.$document.width() - (pandora.user.ui.showSidebar * pandora.user.ui.sidebarSize) - 1,
title = Ox._(Ox.getObjectById(pandora.site.documentFilters, id).title), title = Ox._(filter.title),
//width = pandora.getFilterWidth(i, panelWidth), //width = pandora.getFilterWidth(i, panelWidth),
that = Ox.TableList({ that = Ox.TableList({
_selected: !pandora.user.ui.showFilters _selected: !pandora.user.ui.showFilters

View file

@ -28,6 +28,13 @@ pandora.ui.documentInfoView = function(data, isMixed) {
}).map(function(key){ }).map(function(key){
return key.id; return key.id;
}), }),
displayedKeys = [ // FIXME: can tis be a flag in the config?
'title', 'notes', 'name', 'description', 'id',
'user', 'rightslevel', 'timesaccessed',
'extension', 'dimensions', 'size', 'matches',
'created', 'modified', 'accessed',
'random', 'entity'
],
statisticsWidth = 128, statisticsWidth = 128,
$bar = Ox.Bar({size: 16}) $bar = Ox.Bar({size: 16})
@ -234,6 +241,10 @@ pandora.ui.documentInfoView = function(data, isMixed) {
Ox.getObjectById(pandora.site.documentKeys, 'keywords') && renderGroup(['keywords']) Ox.getObjectById(pandora.site.documentKeys, 'keywords') && renderGroup(['keywords'])
// Render any remaining keys defined in config
renderRemainingKeys();
// Description ------------------------------------------------------------- // Description -------------------------------------------------------------
@ -321,6 +332,7 @@ pandora.ui.documentInfoView = function(data, isMixed) {
} }
// Extension, Dimensions, Size --------------------------------------------- // Extension, Dimensions, Size ---------------------------------------------
['extension', 'dimensions', 'size'].forEach(function(key) { ['extension', 'dimensions', 'size'].forEach(function(key) {
@ -533,6 +545,7 @@ pandora.ui.documentInfoView = function(data, isMixed) {
function renderGroup(keys) { function renderGroup(keys) {
var $element; var $element;
keys.forEach(function(key) { displayedKeys.push(key) });
if (canEdit || keys.filter(function(key) { if (canEdit || keys.filter(function(key) {
return data[key]; return data[key];
}).length) { }).length) {
@ -565,6 +578,17 @@ pandora.ui.documentInfoView = function(data, isMixed) {
return $element; return $element;
} }
function renderRemainingKeys() {
var keys = pandora.site.documentKeys.filter(function(item) {
return item.id != '*' && !Ox.contains(displayedKeys, item.id);
}).map(function(item) {
return item.id;
});
if (keys.length) {
renderGroup(keys)
}
}
function renderRightsLevel() { function renderRightsLevel() {
var $rightsLevelElement = getRightsLevelElement(data.rightslevel), var $rightsLevelElement = getRightsLevelElement(data.rightslevel),
$rightsLevelSelect; $rightsLevelSelect;

View file

@ -91,6 +91,7 @@ apt-get install -y \
python3-lxml \ python3-lxml \
python3-html5lib \ python3-html5lib \
python3-ox \ python3-ox \
python3-elasticsearch \
oxframe \ oxframe \
ffmpeg \ ffmpeg \
mkvtoolnix \ mkvtoolnix \
@ -98,6 +99,8 @@ apt-get install -y \
imagemagick \ imagemagick \
poppler-utils \ poppler-utils \
ipython3 \ ipython3 \
tesseract-ocr \
tesseract-ocr-eng \
postfix \ postfix \
postgresql-client $EXTRA postgresql-client $EXTRA