minimal support for txt documents

2025-07-06 21:40:31 +01:00 · 2025-07-06 21:40:31 +01:00 · 94d57028cd
commit 94d57028cd
parent 3c69c0c101
10 changed files with 761 additions and 5 deletions
--- a/pandora/document/fulltext.py
+++ b/pandora/document/fulltext.py
@ -59,6 +59,12 @@ class FulltextMixin:
                return extract_text(self.file.path)
            elif self.extension == 'epub':
                return epub.extract_text(self.file.path)
+            elif self.extension == 'txt':
+                data = ''
+                if os.path.exists(self.file.path):
+                    with open(self.file.path) as fd:
+                        data = fd.read()
+                return data
            elif self.extension in IMAGE_EXTENSIONS:
                return ocr_image(self.file.path)
            elif self.extension in CONVERT_EXTENSIONS:
@ -191,6 +197,12 @@ class FulltextPageMixin(FulltextMixin):
            elif self.extension == 'epub':
                # FIXME: is there a nice way to split that into pages
                return epub.extract_text(self.file.path)
+            elif self.extension == 'txt':
+                data = ''
+                if os.path.exists(self.file.path):
+                    with open(self.file.path) as fd:
+                        data = fd.read()
+                return data
            elif self.extension in IMAGE_EXTENSIONS:
                return ocr_image(self.document.file.path)
        elif self.extension == 'html':
--- a/pandora/document/models.py
+++ b/pandora/document/models.py
@ -31,6 +31,7 @@ from . import managers
 from . import utils
 from . import tasks
 from . import epub
+from . import txt
 from .fulltext import FulltextMixin, FulltextPageMixin

 User = get_user_model()
@ -178,6 +179,9 @@ class Document(models.Model, FulltextMixin):
        elif self.extension == 'epub':
            prefix = 3
            value = self.pages
+        elif self.extension == 'txt':
+            prefix = 4
+            value = self.pages
        elif self.extension == 'html':
            prefix = 1
            value = self.dimensions
@ -393,7 +397,7 @@ class Document(models.Model, FulltextMixin):

    @property
    def dimensions(self):
-        if self.extension in ('pdf', 'epub'):
+        if self.extension in ('pdf', 'epub', 'txt'):
            return self.pages
        elif self.extension == 'html':
            return len(self.data.get('text', '').split(' '))
@ -574,6 +578,10 @@ class Document(models.Model, FulltextMixin):
                if data:
                    with open(path, "wb") as fd:
                        fd.write(data)
+        elif self.extension == 'txt':
+            path = os.path.join(folder, '1024.jpg')
+            if os.path.exists(src) and not os.path.exists(path):
+                txt.render(src, path)
        elif self.extension in ('jpg', 'png', 'gif', 'webp', 'heic', 'heif', 'cr2'):
            if os.path.exists(src):
                if size and page:
@ -622,19 +630,22 @@ class Document(models.Model, FulltextMixin):
            if thumb:
                self.width, self.height = open_image_rgb(thumb).size
                self.pages = 1
+        elif self.extension == 'txt':
+            thumb = self.thumbnail(1024)
+            if thumb:
+                self.width, self.height = open_image_rgb(thumb).size
+                self.pages = 1
        elif self.width == -1:
            self.pages = -1
            self.width, self.height = open_image_rgb(self.file.path).size

    def get_ratio(self):
-        if self.extension in ('pdf', 'epub'):
+        if self.extension in ('pdf', 'epub', 'txt'):
            image = self.thumbnail(1024)
            try:
                size = Image.open(image).size
            except:
                size = [1, 1]
-        elif self.extension == 'epub':
-            size = [1, 1]
        else:
            if self.width > 0:
                size = self.resolution
--- a/pandora/document/txt.py
+++ b/pandora/document/txt.py
@ -0,0 +1,71 @@
+import os
+
+from PIL import Image
+from argparse import ArgumentParser
+from ox.image import drawText, wrapText
+
+from django.conf import settings
+
+
+def decode_line(line):
+    """Decode one raw line of bytes into text.
+
+    UTF-8 is tried first. Input that is not valid UTF-8 is decoded as
+    latin-1 instead; latin-1 maps every possible byte to a character, so
+    that fallback cannot fail and no further fallback is needed.
+
+    :param line: a ``bytes`` object, e.g. one line read from a file opened
+        in binary mode.
+    :return: the decoded ``str``.
+    """
+    try:
+        return line.decode('utf-8')
+    except UnicodeDecodeError:
+        return line.decode('latin-1')
+
+def _draw_lines(image, f, margin, max_lines, line_height, font_file, font_size):
+    """Draw lines read from the binary file ``f`` onto ``image`` until
+    ``max_lines`` lines have been drawn or the file is exhausted."""
+    if max_lines <= 0:
+        return
+    text_width = image.size[0] - 2 * margin
+    offset = margin
+    for raw_line in f:
+        # a single line may still contain bare carriage returns
+        for part in decode_line(raw_line).strip().split('\r'):
+            wrapped = wrapText(
+                part,
+                text_width,
+                # we don't want the last line that ends with an ellipsis:
+                # ask for one line more than is left, it is never drawn
+                max_lines + 1,
+                font_file,
+                font_size
+            )
+            for text in wrapped:
+                drawText(
+                    image,
+                    (margin, offset),
+                    text,
+                    font_file,
+                    font_size,
+                    0
+                )
+                offset += line_height
+                max_lines -= 1
+                # <= rather than == so the loop is guaranteed to stop
+                if max_lines <= 0:
+                    return
+
+
+def render(infile, outfile):
+    """Render the beginning of a text file as a preview image.
+
+    Lines are read from ``infile`` in binary mode, decoded with
+    ``decode_line``, wrapped to the page width and drawn in black on a
+    white 768x1024 grayscale image using the font at ``settings.TXT_TTF``.
+    Drawing stops once the area inside the margins is full. The image is
+    then saved to ``outfile``.
+
+    :param infile: path of the text file to read.
+    :param outfile: path the image is written to.
+    """
+    image_size = (768, 1024)
+    margin = 64
+    font_file = settings.TXT_TTF
+    font_size = 24
+    line_height = 32
+    # integer division: a float line count might never reach exactly zero
+    # when the usable height is not a multiple of line_height
+    max_lines = (image_size[1] - 2 * margin) // line_height
+
+    image = Image.new('L', image_size, 255)
+
+    with open(infile, 'rb') as f:
+        _draw_lines(
+            image, f, margin, max_lines, line_height, font_file, font_size
+        )
+
+    image.save(outfile, quality=50)
+
+
--- a/pandora/settings.py
+++ b/pandora/settings.py
@ -313,6 +313,9 @@ EMPTY_CLIPS = True

 YT_DLP_EXTRA = []

+# TrueType font used to render preview images of txt documents.
+# Alternative: "/usr/share/fonts/truetype/msttcorefonts/Georgia.ttf"
+TXT_TTF = "/usr/share/fonts/truetype/noto/NotoSansMono-Regular.ttf"
+
 #you can ignore things below this line
 #=========================================================================
 LOCAL_APPS = []