minimal support for txt documents
This commit is contained in:
parent
3c69c0c101
commit
94d57028cd
10 changed files with 761 additions and 5 deletions
|
|
@ -59,6 +59,12 @@ class FulltextMixin:
|
|||
return extract_text(self.file.path)
|
||||
elif self.extension == 'epub':
|
||||
return epub.extract_text(self.file.path)
|
||||
elif self.extension == 'txt':
|
||||
data = ''
|
||||
if os.path.exists(self.file.path):
|
||||
with open(self.file.path) as fd:
|
||||
data = fd.read()
|
||||
return data
|
||||
elif self.extension in IMAGE_EXTENSIONS:
|
||||
return ocr_image(self.file.path)
|
||||
elif self.extension in CONVERT_EXTENSIONS:
|
||||
|
|
@ -191,6 +197,12 @@ class FulltextPageMixin(FulltextMixin):
|
|||
elif self.extension == 'epub':
|
||||
# FIXME: is there a nice way to split that into pages
|
||||
return epub.extract_text(self.file.path)
|
||||
elif self.extension == 'txt':
|
||||
data = ''
|
||||
if os.path.exists(self.file.path):
|
||||
with open(self.file.path) as fd:
|
||||
data = fd.read()
|
||||
return data
|
||||
elif self.extension in IMAGE_EXTENSIONS:
|
||||
return ocr_image(self.document.file.path)
|
||||
elif self.extension == 'html':
|
||||
|
|
|
|||
|
|
@ -31,6 +31,7 @@ from . import managers
|
|||
from . import utils
|
||||
from . import tasks
|
||||
from . import epub
|
||||
from . import txt
|
||||
from .fulltext import FulltextMixin, FulltextPageMixin
|
||||
|
||||
User = get_user_model()
|
||||
|
|
@ -178,6 +179,9 @@ class Document(models.Model, FulltextMixin):
|
|||
elif self.extension == 'epub':
|
||||
prefix = 3
|
||||
value = self.pages
|
||||
elif self.extension == 'txt':
|
||||
prefix = 4
|
||||
value = self.pages
|
||||
elif self.extension == 'html':
|
||||
prefix = 1
|
||||
value = self.dimensions
|
||||
|
|
@ -393,7 +397,7 @@ class Document(models.Model, FulltextMixin):
|
|||
|
||||
@property
|
||||
def dimensions(self):
|
||||
if self.extension in ('pdf', 'epub'):
|
||||
if self.extension in ('pdf', 'epub', 'txt'):
|
||||
return self.pages
|
||||
elif self.extension == 'html':
|
||||
return len(self.data.get('text', '').split(' '))
|
||||
|
|
@ -574,6 +578,10 @@ class Document(models.Model, FulltextMixin):
|
|||
if data:
|
||||
with open(path, "wb") as fd:
|
||||
fd.write(data)
|
||||
elif self.extension == 'txt':
|
||||
path = os.path.join(folder, '1024.jpg')
|
||||
if os.path.exists(src) and not os.path.exists(path):
|
||||
txt.render(src, path)
|
||||
elif self.extension in ('jpg', 'png', 'gif', 'webp', 'heic', 'heif', 'cr2'):
|
||||
if os.path.exists(src):
|
||||
if size and page:
|
||||
|
|
@ -622,19 +630,22 @@ class Document(models.Model, FulltextMixin):
|
|||
if thumb:
|
||||
self.width, self.height = open_image_rgb(thumb).size
|
||||
self.pages = 1
|
||||
elif self.extension == 'txt':
|
||||
thumb = self.thumbnail(1024)
|
||||
if thumb:
|
||||
self.width, self.height = open_image_rgb(thumb).size
|
||||
self.pages = 1
|
||||
elif self.width == -1:
|
||||
self.pages = -1
|
||||
self.width, self.height = open_image_rgb(self.file.path).size
|
||||
|
||||
def get_ratio(self):
|
||||
if self.extension in ('pdf', 'epub'):
|
||||
if self.extension in ('pdf', 'epub', 'txt'):
|
||||
image = self.thumbnail(1024)
|
||||
try:
|
||||
size = Image.open(image).size
|
||||
except:
|
||||
size = [1, 1]
|
||||
elif self.extension == 'epub':
|
||||
size = [1, 1]
|
||||
else:
|
||||
if self.width > 0:
|
||||
size = self.resolution
|
||||
|
|
|
|||
71
pandora/document/txt.py
Executable file
71
pandora/document/txt.py
Executable file
|
|
@ -0,0 +1,71 @@
|
|||
import os
|
||||
|
||||
from PIL import Image
|
||||
from argparse import ArgumentParser
|
||||
from ox.image import drawText, wrapText
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
|
||||
def decode_line(line):
    """Decode one raw line of a text file into ``str``.

    UTF-8 is tried first. If the bytes are not valid UTF-8 the line is
    decoded as Latin-1 instead; Latin-1 maps every possible byte value to a
    code point, so that fallback cannot fail and no further fallback is
    needed.

    :param line: the raw line as ``bytes``
    :returns: the decoded line as ``str``
    """
    try:
        return line.decode('utf-8')
    except UnicodeDecodeError:
        # Only a decoding failure should trigger the fallback; anything else
        # (e.g. being handed a non-bytes object) is a caller bug and is
        # allowed to propagate.
        return line.decode('latin-1')
|
||||
|
||||
def render(infile, outfile):
    """Render the beginning of a text file as a preview image.

    The file is read line by line as bytes, each line is decoded with
    ``decode_line``, split on carriage returns, wrapped to the printable
    width with ``wrapText`` and drawn with ``drawText`` onto a white
    768x1024 single-channel ('L') image. Drawing stops as soon as the
    vertical space between the top and bottom margins is used up; the rest
    of the file is ignored.

    :param infile: path of the text file to render
    :param outfile: path the image is saved to (saved with ``quality=50``;
        the file format is whatever PIL derives from this path)
    """
    image_size = (768, 1024)
    margin = 64
    offset = margin
    font_file = settings.TXT_TTF
    font_size = 24
    line_height = 32
    # Floor division keeps the line budget an int. True division would
    # produce a float that is handed to wrapText and would only ever hit
    # exactly 0 when the printable height is a multiple of line_height.
    max_lines = (image_size[1] - 2 * margin) // line_height

    image = Image.new('L', image_size, (255))

    with open(infile, 'rb') as f:
        for line in f:
            if max_lines <= 0:
                break

            for line_ in decode_line(line).strip().split('\r'):
                if max_lines <= 0:
                    break

                lines = wrapText(
                    line_,
                    image_size[0] - 2 * margin,
                    # we don't want the last line that ends with an ellipsis
                    max_lines + 1,
                    font_file,
                    font_size
                )

                for line__ in lines:
                    if max_lines <= 0:
                        break
                    drawText(
                        image,
                        (margin, offset),
                        line__,
                        font_file,
                        font_size,
                        (0)
                    )
                    offset += line_height
                    max_lines -= 1

    # The input file is closed by now; only the image is still needed.
    image.save(outfile, quality=50)
|
||||
|
||||
|
||||
|
|
@ -313,6 +313,9 @@ EMPTY_CLIPS = True
|
|||
|
||||
YT_DLP_EXTRA = []
|
||||
|
||||
TXT_TTF = "/usr/share/fonts/truetype/msttcorefonts/Georgia.ttf"
|
||||
TXT_TTF = "/usr/share/fonts/truetype/noto/NotoSansMono-Regular.ttf"
|
||||
|
||||
#you can ignore things below this line
|
||||
#=========================================================================
|
||||
LOCAL_APPS = []
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue