extract detail from pdf
This commit is contained in:
parent
fd34ba305c
commit
5bd561e64f
3 changed files with 75 additions and 5 deletions
|
@ -93,6 +93,24 @@ class EpubHandler(OMLHandler):
|
||||||
self.set_header('Content-Type', content_type)
|
self.set_header('Content-Type', content_type)
|
||||||
self.write(z.read(filename))
|
self.write(z.read(filename))
|
||||||
|
|
||||||
|
class CropHandler(OMLHandler):
    """Serve a JPEG crop of a single page of an item's PDF."""

    def get(self, id, page, left, top, right, bottom):
        """Write the cropped page image, or answer 404 if it cannot be made.

        id     -- item id, looked up with Item.get
        page   -- page number, passed through to media.pdf.crop
        left, top, right, bottom -- crop box, passed through to
                  media.pdf.crop

        All arguments arrive as strings captured from the request URL.
        """
        from media.pdf import crop

        # The URL pattern may capture empty strings for the numeric parts;
        # reject them here instead of failing later inside int().
        if '' in (page, left, top, right, bottom):
            self.set_status(404)
            return

        data = None
        with db.session():
            item = Item.get(id)
            # NOTE(review): assumes Item.get yields a falsy value for an
            # unknown id and get_path a falsy value when there is no file.
            path = item.get_path() if item else None
            if path:
                data = crop(path, page, left, top, right, bottom)

        if not data:
            self.set_status(404)
            return

        self.set_header('Content-Type', 'image/jpeg')
        self.set_header('Content-Length', str(len(data)))
        self.write(data)
|
||||||
|
|
||||||
|
|
||||||
def serve_static(handler, path, mimetype, include_body=True, disposition=None):
|
def serve_static(handler, path, mimetype, include_body=True, disposition=None):
|
||||||
handler.set_header('Content-Type', mimetype)
|
handler.set_header('Content-Type', mimetype)
|
||||||
size = os.stat(path).st_size
|
size = os.stat(path).st_size
|
||||||
|
|
|
@ -10,6 +10,7 @@ from glob import glob
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
from PyPDF2 import PdfFileReader
|
from PyPDF2 import PdfFileReader
|
||||||
|
from PIL import Image
|
||||||
import ox
|
import ox
|
||||||
|
|
||||||
import settings
|
import settings
|
||||||
|
@ -24,13 +25,13 @@ def cover(pdf):
|
||||||
else:
|
else:
|
||||||
return page(pdf, 1)
|
return page(pdf, 1)
|
||||||
|
|
||||||
def ql_cover(pdf):
|
def ql_cover(pdf, size=1024):
|
||||||
tmp = tempfile.mkdtemp()
|
tmp = tempfile.mkdtemp()
|
||||||
cmd = [
|
cmd = [
|
||||||
'qlmanage',
|
'qlmanage',
|
||||||
'-t',
|
'-t',
|
||||||
'-s',
|
'-s',
|
||||||
'1024',
|
str(size),
|
||||||
'-o',
|
'-o',
|
||||||
tmp,
|
tmp,
|
||||||
pdf
|
pdf
|
||||||
|
@ -48,7 +49,7 @@ def ql_cover(pdf):
|
||||||
shutil.rmtree(tmp)
|
shutil.rmtree(tmp)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def page(pdf, page):
|
def page(pdf, page, size=1024):
|
||||||
tmp = tempfile.mkdtemp()
|
tmp = tempfile.mkdtemp()
|
||||||
if sys.platform == 'win32':
|
if sys.platform == 'win32':
|
||||||
pdf = get_short_path_name(pdf)
|
pdf = get_short_path_name(pdf)
|
||||||
|
@ -57,7 +58,7 @@ def page(pdf, page):
|
||||||
pdf,
|
pdf,
|
||||||
'-jpeg',
|
'-jpeg',
|
||||||
'-f', str(page), '-l', str(page),
|
'-f', str(page), '-l', str(page),
|
||||||
'-scale-to', '1024', '-cropbox',
|
'-scale-to', str(size), '-cropbox',
|
||||||
os.path.join(tmp, 'page')
|
os.path.join(tmp, 'page')
|
||||||
]
|
]
|
||||||
if sys.platform == 'win32':
|
if sys.platform == 'win32':
|
||||||
|
@ -79,6 +80,47 @@ def page(pdf, page):
|
||||||
shutil.rmtree(tmp)
|
shutil.rmtree(tmp)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
def crop(pdf, page, left, top, right, bottom):
    """Render one page of a PDF with pdftocairo and return a JPEG crop of it.

    pdf    -- path to the PDF file
    page   -- page number handed to pdftocairo as both -f and -l
    left, top, right, bottom -- crop box in pixels of the rendered image;
              each value must be accepted by int()

    The page is rendered with ``-scale-to 2048 -cropbox`` into a temporary
    directory, which is removed again before returning.

    Returns the JPEG data as bytes, or None if pdftocairo produced no image.
    Raises ValueError if a crop coordinate is not an integer.
    """
    import io

    size = 2048
    # Parse the box first so bad coordinates fail before any rendering.
    box = tuple(int(value) for value in (left, top, right, bottom))

    tmp = tempfile.mkdtemp()
    try:
        if sys.platform == 'win32':
            pdf = get_short_path_name(pdf)
        cmd = [
            'pdftocairo',
            pdf,
            '-jpeg',
            '-f', str(page), '-l', str(page),
            '-scale-to', str(size), '-cropbox',
            os.path.join(tmp, 'page')
        ]
        if sys.platform == 'win32':
            # Keep a console window from popping up on Windows.
            startupinfo = subprocess.STARTUPINFO()
            startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
            startupinfo.wShowWindow = subprocess.SW_HIDE
            p = subprocess.Popen(cmd, close_fds=True, startupinfo=startupinfo)
        else:
            p = subprocess.Popen(cmd, close_fds=True)
        p.wait()

        rendered = glob('%s/*' % tmp)
        if not rendered:
            logger.debug('pdftocairo %s %s', pdf, ' '.join(cmd))
            return None

        # Close the rendered file before the directory is removed and
        # encode the crop in memory instead of rewriting the file.
        output = io.BytesIO()
        with Image.open(rendered[0]) as source:
            source.crop(box).save(output, format='JPEG')
        return output.getvalue()
    finally:
        # Always clean up, even if rendering or cropping raised.
        shutil.rmtree(tmp)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
'''
|
'''
|
||||||
def page(pdf, page):
|
def page(pdf, page):
|
||||||
image = tempfile.mkstemp('.jpg')[1]
|
image = tempfile.mkstemp('.jpg')[1]
|
||||||
|
@ -281,3 +323,4 @@ def extract_isbn(text):
|
||||||
isbns = find_isbns(text)
|
isbns = find_isbns(text)
|
||||||
if isbns:
|
if isbns:
|
||||||
return isbns[0]
|
return isbns[0]
|
||||||
|
|
||||||
|
|
|
@ -7,11 +7,13 @@ import signal
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from tornado.ioloop import IOLoop
|
from tornado.ioloop import IOLoop
|
||||||
from tornado.web import StaticFileHandler, Application
|
import tornado.web
|
||||||
|
from tornado.web import Application
|
||||||
|
|
||||||
from cache import Cache
|
from cache import Cache
|
||||||
from item.handlers import EpubHandler, ReaderHandler, FileHandler
|
from item.handlers import EpubHandler, ReaderHandler, FileHandler
|
||||||
from item.handlers import OMLHandler, UploadHandler
|
from item.handlers import OMLHandler, UploadHandler
|
||||||
|
from item.handlers import CropHandler
|
||||||
from item.icons import IconHandler
|
from item.icons import IconHandler
|
||||||
import db
|
import db
|
||||||
import node.server
|
import node.server
|
||||||
|
@ -29,6 +31,12 @@ import logging
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class StaticFileHandler(tornado.web.StaticFileHandler):
    """Static file handler that serves .mjs files as JavaScript."""

    def get_content_type(self):
        """Return the Content-Type for the requested file.

        Requests whose path (up to any '?') ends in '.mjs' get
        'application/javascript'; everything else is left to the
        parent class.
        """
        requested, _, _ = self.request.path.partition('?')
        is_module = requested.endswith('.mjs')
        return 'application/javascript' if is_module else super().get_content_type()
|
||||||
|
|
||||||
class MainHandler(OMLHandler):
|
class MainHandler(OMLHandler):
|
||||||
|
|
||||||
def get(self, path):
|
def get(self, path):
|
||||||
|
@ -126,6 +134,7 @@ def run():
|
||||||
(r'/(.*?)/get/', FileHandler, {
|
(r'/(.*?)/get/', FileHandler, {
|
||||||
'attachment': True
|
'attachment': True
|
||||||
}),
|
}),
|
||||||
|
(r'/(.*)/2048p(\d*),(\d*),(\d*),(\d*),(\d*).jpg', CropHandler),
|
||||||
(r'/(.*)/(cover|preview)(\d*).jpg', IconHandler),
|
(r'/(.*)/(cover|preview)(\d*).jpg', IconHandler),
|
||||||
]
|
]
|
||||||
handlers = common_handlers + [
|
handlers = common_handlers + [
|
||||||
|
|
Loading…
Reference in a new issue