Extract detail (cropped page region as JPEG) from PDF

This commit is contained in:
j 2024-06-09 14:47:36 +01:00
parent fd34ba305c
commit 5bd561e64f
3 changed files with 75 additions and 5 deletions

View file

@ -93,6 +93,24 @@ class EpubHandler(OMLHandler):
self.set_header('Content-Type', content_type)
self.write(z.read(filename))
class CropHandler(OMLHandler):
    """Serve a cropped region of one rendered PDF page as a JPEG image."""

    def get(self, id, page, left, top, right, bottom):
        """Write the cropped JPEG, or answer with status 404 if there is none.

        ``id`` is looked up with ``Item.get``; ``page``, ``left``, ``top``,
        ``right`` and ``bottom`` are passed through unchanged to
        ``media.pdf.crop`` together with the item's path.
        """
        # Imported at call time, as the handler did before.
        from media.pdf import crop

        data = None
        with db.session():
            item = Item.get(id)
            # NOTE(review): assumes Item.get returns None for an unknown id
            # and get_path may return an empty value - confirm against Item.
            path = item.get_path() if item is not None else None
            if path:
                data = crop(path, page, left, top, right, bottom)
        if not data:
            # Unknown item, missing file or failed render: nothing to serve.
            self.set_status(404)
            return
        self.set_header('Content-Type', 'image/jpeg')
        self.set_header('Content-Length', str(len(data)))
        self.write(data)
def serve_static(handler, path, mimetype, include_body=True, disposition=None):
handler.set_header('Content-Type', mimetype)
size = os.stat(path).st_size

View file

@ -10,6 +10,7 @@ from glob import glob
from datetime import datetime
from PyPDF2 import PdfFileReader
from PIL import Image
import ox
import settings
@ -24,13 +25,13 @@ def cover(pdf):
else:
return page(pdf, 1)
def ql_cover(pdf):
def ql_cover(pdf, size=1024):
tmp = tempfile.mkdtemp()
cmd = [
'qlmanage',
'-t',
'-s',
'1024',
str(size),
'-o',
tmp,
pdf
@ -48,7 +49,7 @@ def ql_cover(pdf):
shutil.rmtree(tmp)
return data
def page(pdf, page):
def page(pdf, page, size=1024):
tmp = tempfile.mkdtemp()
if sys.platform == 'win32':
pdf = get_short_path_name(pdf)
@ -57,7 +58,7 @@ def page(pdf, page):
pdf,
'-jpeg',
'-f', str(page), '-l', str(page),
'-scale-to', '1024', '-cropbox',
'-scale-to', str(size), '-cropbox',
os.path.join(tmp, 'page')
]
if sys.platform == 'win32':
@ -79,6 +80,47 @@ def page(pdf, page):
shutil.rmtree(tmp)
return data
def crop(pdf, page, left, top, right, bottom, size=2048):
    """Render one PDF page with pdftocairo and return a cropped JPEG.

    Args:
        pdf: path of the PDF file.
        page: page number; passed to pdftocairo as both ``-f`` and ``-l``.
        left, top, right, bottom: crop box applied to the rendered page
            image; each value is converted with ``int()``.
        size: value passed to pdftocairo ``-scale-to`` (default 2048).

    Returns:
        The cropped image as JPEG bytes, or None if pdftocairo produced no
        output file.

    Raises:
        ValueError: if a crop coordinate cannot be converted to an integer.
    """
    from io import BytesIO

    # Parse the box first so bad input fails before any rendering work
    # is done or a temporary directory is created.
    box = tuple(int(value) for value in (left, top, right, bottom))

    if sys.platform == 'win32':
        pdf = get_short_path_name(pdf)
    tmp = tempfile.mkdtemp()
    try:
        cmd = [
            'pdftocairo',
            pdf,
            '-jpeg',
            '-f', str(page), '-l', str(page),
            '-scale-to', str(size), '-cropbox',
            os.path.join(tmp, 'page')
        ]
        startupinfo = None
        if sys.platform == 'win32':
            # Keep a console window from popping up on Windows.
            startupinfo = subprocess.STARTUPINFO()
            startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
            startupinfo.wShowWindow = subprocess.SW_HIDE
        p = subprocess.Popen(cmd, close_fds=True, startupinfo=startupinfo)
        p.wait()

        images = glob(os.path.join(tmp, '*'))
        if not images:
            logger.debug('pdftocairo %s %s', pdf, ' '.join(cmd))
            return None

        # Encode into memory instead of overwriting the rendered file
        # while it is still open, then reading it back from disk.
        output = BytesIO()
        with Image.open(images[0]) as img:
            img.crop(box).save(output, format='JPEG')
        return output.getvalue()
    finally:
        # Always remove the scratch directory, including on errors.
        shutil.rmtree(tmp)
'''
def page(pdf, page):
image = tempfile.mkstemp('.jpg')[1]
@ -281,3 +323,4 @@ def extract_isbn(text):
isbns = find_isbns(text)
if isbns:
return isbns[0]

View file

@ -7,11 +7,13 @@ import signal
import time
from tornado.ioloop import IOLoop
from tornado.web import StaticFileHandler, Application
import tornado.web
from tornado.web import Application
from cache import Cache
from item.handlers import EpubHandler, ReaderHandler, FileHandler
from item.handlers import OMLHandler, UploadHandler
from item.handlers import CropHandler
from item.icons import IconHandler
import db
import node.server
@ -29,6 +31,12 @@ import logging
logger = logging.getLogger(__name__)
class StaticFileHandler(tornado.web.StaticFileHandler):
    """Static file handler that reports ``.mjs`` files as JavaScript."""

    def get_content_type(self):
        """Return the Content-Type for the requested path.

        Paths ending in ``.mjs`` (ignoring anything after a ``?``) are
        answered with 'application/javascript'; every other path uses the
        parent class's result.
        """
        request_path, _, _ = self.request.path.partition('?')
        if not request_path.endswith('.mjs'):
            return super().get_content_type()
        return 'application/javascript'
class MainHandler(OMLHandler):
def get(self, path):
@ -126,6 +134,7 @@ def run():
(r'/(.*?)/get/', FileHandler, {
'attachment': True
}),
(r'/(.*)/2048p(\d*),(\d*),(\d*),(\d*),(\d*).jpg', CropHandler),
(r'/(.*)/(cover|preview)(\d*).jpg', IconHandler),
]
handlers = common_handlers + [