minimal support for txt documents

This commit is contained in:
j 2025-07-06 21:40:31 +01:00
commit 94d57028cd
10 changed files with 761 additions and 5 deletions

View file

@ -59,6 +59,12 @@ class FulltextMixin:
return extract_text(self.file.path)
elif self.extension == 'epub':
return epub.extract_text(self.file.path)
elif self.extension == 'txt':
data = ''
if os.path.exists(self.file.path):
with open(self.file.path) as fd:
data = fd.read()
return data
elif self.extension in IMAGE_EXTENSIONS:
return ocr_image(self.file.path)
elif self.extension in CONVERT_EXTENSIONS:
@ -191,6 +197,12 @@ class FulltextPageMixin(FulltextMixin):
elif self.extension == 'epub':
# FIXME: is there a nice way to split that into pages
return epub.extract_text(self.file.path)
elif self.extension == 'txt':
data = ''
if os.path.exists(self.file.path):
with open(self.file.path) as fd:
data = fd.read()
return data
elif self.extension in IMAGE_EXTENSIONS:
return ocr_image(self.document.file.path)
elif self.extension == 'html':

View file

@ -31,6 +31,7 @@ from . import managers
from . import utils
from . import tasks
from . import epub
from . import txt
from .fulltext import FulltextMixin, FulltextPageMixin
User = get_user_model()
@ -178,6 +179,9 @@ class Document(models.Model, FulltextMixin):
elif self.extension == 'epub':
prefix = 3
value = self.pages
elif self.extension == 'txt':
prefix = 4
value = self.pages
elif self.extension == 'html':
prefix = 1
value = self.dimensions
@ -393,7 +397,7 @@ class Document(models.Model, FulltextMixin):
@property
def dimensions(self):
if self.extension in ('pdf', 'epub'):
if self.extension in ('pdf', 'epub', 'txt'):
return self.pages
elif self.extension == 'html':
return len(self.data.get('text', '').split(' '))
@ -574,6 +578,10 @@ class Document(models.Model, FulltextMixin):
if data:
with open(path, "wb") as fd:
fd.write(data)
elif self.extension == 'txt':
path = os.path.join(folder, '1024.jpg')
if os.path.exists(src) and not os.path.exists(path):
txt.render(src, path)
elif self.extension in ('jpg', 'png', 'gif', 'webp', 'heic', 'heif', 'cr2'):
if os.path.exists(src):
if size and page:
@ -622,19 +630,22 @@ class Document(models.Model, FulltextMixin):
if thumb:
self.width, self.height = open_image_rgb(thumb).size
self.pages = 1
elif self.extension == 'txt':
thumb = self.thumbnail(1024)
if thumb:
self.width, self.height = open_image_rgb(thumb).size
self.pages = 1
elif self.width == -1:
self.pages = -1
self.width, self.height = open_image_rgb(self.file.path).size
def get_ratio(self):
if self.extension in ('pdf', 'epub'):
if self.extension in ('pdf', 'epub', 'txt'):
image = self.thumbnail(1024)
try:
size = Image.open(image).size
except:
size = [1, 1]
elif self.extension == 'epub':
size = [1, 1]
else:
if self.width > 0:
size = self.resolution

71
pandora/document/txt.py Executable file
View file

@ -0,0 +1,71 @@
import os
from PIL import Image
from argparse import ArgumentParser
from ox.image import drawText, wrapText
from django.conf import settings
def decode_line(line):
    """Decode one raw line of a text file to ``str``.

    UTF-8 is tried first; input that is not valid UTF-8 is decoded as
    latin-1 instead.

    :param line: raw ``bytes`` as read from a file opened in binary mode.
    :returns: the decoded ``str``.
    """
    try:
        return line.decode('utf-8')
    except UnicodeDecodeError:
        # latin-1 assigns a character to every possible byte value, so this
        # fallback cannot fail for bytes input and no further fallback is
        # needed.
        return line.decode('latin-1')
def render(infile, outfile):
    """Render the beginning of a text file as a single page image.

    Lines are read from ``infile``, decoded with ``decode_line``, stripped,
    split on carriage returns, wrapped to the printable width and drawn in
    black on a white 768x1024 grayscale image until the page is full or the
    file ends. The image is then saved to ``outfile`` with ``quality=50``.

    :param infile: path of the text file to render.
    :param outfile: path the rendered image is written to.
    """
    image_size = (768, 1024)
    margin = 64
    font_file = settings.TXT_TTF
    font_size = 24
    line_height = 32
    text_width = image_size[0] - 2 * margin
    # Integer division keeps the line budget an int; with true division a
    # page height that is not a multiple of line_height would give a
    # fractional budget that never compares equal to zero.
    remaining = (image_size[1] - 2 * margin) // line_height

    image = Image.new('L', image_size, 255)
    offset = margin
    with open(infile, 'rb') as f:
        for raw_line in f:
            if remaining <= 0:
                break
            for segment in decode_line(raw_line).strip().split('\r'):
                if remaining <= 0:
                    break
                wrapped = wrapText(
                    segment,
                    text_width,
                    # we don't want the last line that ends with an ellipsis
                    remaining + 1,
                    font_file,
                    font_size
                )
                for text in wrapped:
                    drawText(
                        image,
                        (margin, offset),
                        text,
                        font_file,
                        font_size,
                        0
                    )
                    offset += line_height
                    remaining -= 1
                    if remaining <= 0:
                        # page is full; the guards above unwind the
                        # enclosing loops
                        break
    image.save(outfile, quality=50)

View file

@ -313,6 +313,9 @@ EMPTY_CLIPS = True
YT_DLP_EXTRA = []
TXT_TTF = "/usr/share/fonts/truetype/msttcorefonts/Georgia.ttf"
TXT_TTF = "/usr/share/fonts/truetype/noto/NotoSansMono-Regular.ttf"
#you can ignore things below this line
#=========================================================================
LOCAL_APPS = []