From 5bd561e64fa88e61d89fb3beab0925f7bdcd5f13 Mon Sep 17 00:00:00 2001
From: j
Date: Sun, 9 Jun 2024 14:47:36 +0100
Subject: [PATCH] extract detail from pdf

---
 oml/item/handlers.py | 20 ++++++++++++++++
 oml/media/pdf.py     | 62 ++++++++++++++++++++++++++++++++++++++++----
 oml/server.py        | 12 +++++++++-
 3 files changed, 89 insertions(+), 5 deletions(-)

diff --git a/oml/item/handlers.py b/oml/item/handlers.py
index b20001a..8f892b5 100644
--- a/oml/item/handlers.py
+++ b/oml/item/handlers.py
@@ -93,6 +93,26 @@ class EpubHandler(OMLHandler):
                     self.set_header('Content-Type', content_type)
                     self.write(z.read(filename))
 
+class CropHandler(OMLHandler):
+    '''Serve a JPEG detail cropped from one rendered page of a PDF item.'''
+
+    def get(self, id, page, left, top, right, bottom):
+        from media.pdf import crop
+        with db.session():
+            item = Item.get(id)
+            # unknown item or item without a file: answer 404 below
+            path = item.get_path() if item else None
+            data = None
+            if path:
+                data = crop(path, page, left, top, right, bottom)
+            if data:
+                self.set_header('Content-Type', 'image/jpeg')
+                self.set_header('Content-Length', str(len(data)))
+                self.write(data)
+                return
+            self.set_status(404)
+
+
 def serve_static(handler, path, mimetype, include_body=True, disposition=None):
     handler.set_header('Content-Type', mimetype)
     size = os.stat(path).st_size
diff --git a/oml/media/pdf.py b/oml/media/pdf.py
index 90e76cf..96d27ed 100644
--- a/oml/media/pdf.py
+++ b/oml/media/pdf.py
@@ -10,6 +10,7 @@ from glob import glob
 from datetime import datetime
 
 from PyPDF2 import PdfFileReader
+from PIL import Image
 import ox
 
 import settings
@@ -24,13 +25,13 @@ def cover(pdf):
     else:
         return page(pdf, 1)
 
-def ql_cover(pdf):
+def ql_cover(pdf, size=1024):
     tmp = tempfile.mkdtemp()
     cmd = [
         'qlmanage',
         '-t',
         '-s',
-        '1024',
+        str(size),
         '-o',
         tmp,
         pdf
@@ -48,7 +49,7 @@ def ql_cover(pdf):
     shutil.rmtree(tmp)
     return data
 
-def page(pdf, page):
+def page(pdf, page, size=1024):
     tmp = tempfile.mkdtemp()
     if sys.platform == 'win32':
         pdf = get_short_path_name(pdf)
@@ -57,7 +58,7 @@ def page(pdf, page):
         pdf,
         '-jpeg',
         '-f', str(page), '-l', str(page),
-        '-scale-to', '1024', '-cropbox',
+        '-scale-to', str(size), '-cropbox',
         os.path.join(tmp, 'page')
     ]
     if sys.platform == 'win32':
@@ -79,6 +80,58 @@ def page(pdf, page):
     shutil.rmtree(tmp)
     return data
 
+def crop(pdf, page, left, top, right, bottom, size=2048):
+    '''
+    Render one page of a PDF with pdftocairo and return a JPEG crop of it.
+
+    page is the page number passed to pdftocairo -f/-l; left, top, right and
+    bottom are pixel coordinates on the page rendered with -scale-to size.
+    All of them may be ints or digit strings. Returns the JPEG data as bytes,
+    or None if the arguments are invalid or the page could not be rendered.
+    '''
+    try:
+        page = int(page)
+        box = tuple(int(value) for value in (left, top, right, bottom))
+    except (TypeError, ValueError):
+        # e.g. an empty string matched by (\d*) in the url pattern
+        return None
+    if box[2] <= box[0] or box[3] <= box[1]:
+        # an empty region can not be saved as JPEG
+        return None
+    tmp = tempfile.mkdtemp()
+    try:
+        if sys.platform == 'win32':
+            pdf = get_short_path_name(pdf)
+        cmd = [
+            'pdftocairo',
+            pdf,
+            '-jpeg',
+            '-f', str(page), '-l', str(page),
+            '-scale-to', str(size), '-cropbox',
+            os.path.join(tmp, 'page')
+        ]
+        if sys.platform == 'win32':
+            startupinfo = subprocess.STARTUPINFO()
+            startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
+            startupinfo.wShowWindow = subprocess.SW_HIDE
+            p = subprocess.Popen(cmd, close_fds=True, startupinfo=startupinfo)
+        else:
+            p = subprocess.Popen(cmd, close_fds=True)
+        p.wait()
+        images = glob('%s/*' % tmp)
+        if not images:
+            logger.debug('pdftocairo %s %s', pdf, ' '.join(cmd))
+            return None
+        image = images[0]
+        # overwrite the rendered page with the cropped region
+        Image.open(image).crop(box).save(image)
+        with open(image, 'rb') as fd:
+            return fd.read()
+    finally:
+        # remove the temporary directory even if rendering or cropping fails
+        shutil.rmtree(tmp)
+
+
 '''
 def page(pdf, page):
     image = tempfile.mkstemp('.jpg')[1]
@@ -281,3 +334,4 @@ def extract_isbn(text):
     isbns = find_isbns(text)
     if isbns:
         return isbns[0]
+
diff --git a/oml/server.py b/oml/server.py
index a557054..1993ae1 100644
--- a/oml/server.py
+++ b/oml/server.py
@@ -7,11 +7,13 @@ import signal
 import time
 
 from tornado.ioloop import IOLoop
-from tornado.web import StaticFileHandler, Application
+import tornado.web
+from tornado.web import Application
 
 from cache import Cache
 from item.handlers import EpubHandler, ReaderHandler, FileHandler
 from item.handlers import OMLHandler, UploadHandler
+from item.handlers import CropHandler
 from item.icons import IconHandler
 import db
 import node.server
@@ -29,6 +31,13 @@ import logging
 logger = logging.getLogger(__name__)
 
 
+class StaticFileHandler(tornado.web.StaticFileHandler):
+    '''Static file handler that serves .mjs files as application/javascript.'''
+    def get_content_type(self):
+        if self.request.path.split('?')[0].endswith('.mjs'):
+            return 'application/javascript'
+        return super().get_content_type()
+
 class MainHandler(OMLHandler):
 
     def get(self, path):
@@ -126,6 +135,7 @@ def run():
         (r'/(.*?)/get/', FileHandler, {
             'attachment': True
         }),
+        (r'/(.*)/2048p(\d*),(\d*),(\d*),(\d*),(\d*).jpg', CropHandler),
         (r'/(.*)/(cover|preview)(\d*).jpg', IconHandler),
     ]
     handlers = common_handlers + [