extract detail from pdf

This commit is contained in:
j 2024-06-09 14:47:36 +01:00
parent fd34ba305c
commit 5bd561e64f
3 changed files with 75 additions and 5 deletions

View file

@ -93,6 +93,24 @@ class EpubHandler(OMLHandler):
self.set_header('Content-Type', content_type) self.set_header('Content-Type', content_type)
self.write(z.read(filename)) self.write(z.read(filename))
class CropHandler(OMLHandler):
    """Serve a cropped JPEG region of one rendered page of an item's PDF."""

    def get(self, id, page, left, top, right, bottom):
        """Write the requested crop as ``image/jpeg``.

        All arguments arrive as strings captured from the request URL:
        ``id`` identifies the item, ``page`` the page to render, and
        ``left``/``top``/``right``/``bottom`` the crop box that is handed
        unchanged to ``media.pdf.crop``.

        Responds with 404 when the item or its file path cannot be resolved
        or when no image data is produced, and with 400 when the crop box is
        rejected with a ``ValueError`` (for example an empty coordinate).
        """
        # Imported lazily, exactly as the handler did before.
        from media.pdf import crop

        # Only the database lookup needs the session; the potentially slow
        # page rendering below runs after it has been released.
        with db.session():
            item = Item.get(id)
            path = item.get_path() if item else None

        data = None
        if path:
            try:
                data = crop(path, page, left, top, right, bottom)
            except ValueError:
                # Non-numeric/empty coordinates or an invalid box are a
                # client error, not a server failure.
                self.set_status(400)
                return

        if not data:
            self.set_status(404)
            return

        self.set_header('Content-Type', 'image/jpeg')
        self.set_header('Content-Length', str(len(data)))
        self.write(data)
def serve_static(handler, path, mimetype, include_body=True, disposition=None): def serve_static(handler, path, mimetype, include_body=True, disposition=None):
handler.set_header('Content-Type', mimetype) handler.set_header('Content-Type', mimetype)
size = os.stat(path).st_size size = os.stat(path).st_size

View file

@ -10,6 +10,7 @@ from glob import glob
from datetime import datetime from datetime import datetime
from PyPDF2 import PdfFileReader from PyPDF2 import PdfFileReader
from PIL import Image
import ox import ox
import settings import settings
@ -24,13 +25,13 @@ def cover(pdf):
else: else:
return page(pdf, 1) return page(pdf, 1)
def ql_cover(pdf): def ql_cover(pdf, size=1024):
tmp = tempfile.mkdtemp() tmp = tempfile.mkdtemp()
cmd = [ cmd = [
'qlmanage', 'qlmanage',
'-t', '-t',
'-s', '-s',
'1024', str(size),
'-o', '-o',
tmp, tmp,
pdf pdf
@ -48,7 +49,7 @@ def ql_cover(pdf):
shutil.rmtree(tmp) shutil.rmtree(tmp)
return data return data
def page(pdf, page): def page(pdf, page, size=1024):
tmp = tempfile.mkdtemp() tmp = tempfile.mkdtemp()
if sys.platform == 'win32': if sys.platform == 'win32':
pdf = get_short_path_name(pdf) pdf = get_short_path_name(pdf)
@ -57,7 +58,7 @@ def page(pdf, page):
pdf, pdf,
'-jpeg', '-jpeg',
'-f', str(page), '-l', str(page), '-f', str(page), '-l', str(page),
'-scale-to', '1024', '-cropbox', '-scale-to', str(size), '-cropbox',
os.path.join(tmp, 'page') os.path.join(tmp, 'page')
] ]
if sys.platform == 'win32': if sys.platform == 'win32':
@ -79,6 +80,47 @@ def page(pdf, page):
shutil.rmtree(tmp) shutil.rmtree(tmp)
return data return data
def crop(pdf, page, left, top, right, bottom):
    """Render one page of ``pdf`` to JPEG and return a cropped region of it.

    The page is rendered with ``pdftocairo`` using ``-scale-to 2048`` and
    ``-cropbox`` into a temporary directory; the crop box is then applied to
    that rendered image, so ``left``, ``top``, ``right`` and ``bottom`` are
    pixel coordinates within the rendered page. Each may be an int or a
    string accepted by ``int()``.

    Returns the JPEG bytes of the cropped region, or ``None`` when
    ``pdftocairo`` produced no output file.

    Raises ``ValueError`` when a coordinate cannot be converted to an int.
    The temporary directory is removed in every case, including on error.
    """
    size = 2048
    tmp = tempfile.mkdtemp()
    try:
        if sys.platform == 'win32':
            pdf = get_short_path_name(pdf)
        cmd = [
            'pdftocairo',
            pdf,
            '-jpeg',
            '-f', str(page), '-l', str(page),
            '-scale-to', str(size), '-cropbox',
            os.path.join(tmp, 'page')
        ]
        if sys.platform == 'win32':
            # Keep a console window from flashing up on Windows.
            startupinfo = subprocess.STARTUPINFO()
            startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
            startupinfo.wShowWindow = subprocess.SW_HIDE
            process = subprocess.Popen(cmd, close_fds=True, startupinfo=startupinfo)
        else:
            process = subprocess.Popen(cmd, close_fds=True)
        process.wait()

        rendered = glob('%s/*' % tmp)
        if not rendered:
            logger.debug('pdftocairo %s %s', pdf, ' '.join(cmd))
            return None

        image = rendered[0]
        box = [int(value) for value in (left, top, right, bottom)]
        # Close the source image before overwriting its file so no handle
        # lingers on the path that is rewritten and later deleted.
        with Image.open(image) as source:
            region = source.crop(box)
        region.save(image)
        with open(image, 'rb') as fd:
            return fd.read()
    finally:
        # Always clean up, even when conversion or cropping raised.
        shutil.rmtree(tmp)
''' '''
def page(pdf, page): def page(pdf, page):
image = tempfile.mkstemp('.jpg')[1] image = tempfile.mkstemp('.jpg')[1]
@ -281,3 +323,4 @@ def extract_isbn(text):
isbns = find_isbns(text) isbns = find_isbns(text)
if isbns: if isbns:
return isbns[0] return isbns[0]

View file

@ -7,11 +7,13 @@ import signal
import time import time
from tornado.ioloop import IOLoop from tornado.ioloop import IOLoop
from tornado.web import StaticFileHandler, Application import tornado.web
from tornado.web import Application
from cache import Cache from cache import Cache
from item.handlers import EpubHandler, ReaderHandler, FileHandler from item.handlers import EpubHandler, ReaderHandler, FileHandler
from item.handlers import OMLHandler, UploadHandler from item.handlers import OMLHandler, UploadHandler
from item.handlers import CropHandler
from item.icons import IconHandler from item.icons import IconHandler
import db import db
import node.server import node.server
@ -29,6 +31,12 @@ import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class StaticFileHandler(tornado.web.StaticFileHandler):
    """Static file handler that reports ``.mjs`` files as JavaScript."""

    def get_content_type(self):
        """Return the Content-Type for the requested file.

        Request paths ending in ``.mjs`` (ignoring anything after a ``?``)
        get ``application/javascript``; every other path uses the parent
        class's answer.
        """
        requested = self.request.path.partition('?')[0]
        if not requested.endswith('.mjs'):
            return super().get_content_type()
        return 'application/javascript'
class MainHandler(OMLHandler): class MainHandler(OMLHandler):
def get(self, path): def get(self, path):
@ -126,6 +134,7 @@ def run():
(r'/(.*?)/get/', FileHandler, { (r'/(.*?)/get/', FileHandler, {
'attachment': True 'attachment': True
}), }),
(r'/(.*)/2048p(\d*),(\d*),(\d*),(\d*),(\d*).jpg', CropHandler),
(r'/(.*)/(cover|preview)(\d*).jpg', IconHandler), (r'/(.*)/(cover|preview)(\d*).jpg', IconHandler),
] ]
handlers = common_handlers + [ handlers = common_handlers + [