extract detail from pdf

This commit is contained in:
j 2024-06-09 14:47:36 +01:00
commit 5bd561e64f
3 changed files with 75 additions and 5 deletions

View file

@ -10,6 +10,7 @@ from glob import glob
from datetime import datetime
from PyPDF2 import PdfFileReader
from PIL import Image
import ox
import settings
@ -24,13 +25,13 @@ def cover(pdf):
else:
return page(pdf, 1)
def ql_cover(pdf):
def ql_cover(pdf, size=1024):
tmp = tempfile.mkdtemp()
cmd = [
'qlmanage',
'-t',
'-s',
'1024',
str(size),
'-o',
tmp,
pdf
@ -48,7 +49,7 @@ def ql_cover(pdf):
shutil.rmtree(tmp)
return data
def page(pdf, page):
def page(pdf, page, size=1024):
tmp = tempfile.mkdtemp()
if sys.platform == 'win32':
pdf = get_short_path_name(pdf)
@ -57,7 +58,7 @@ def page(pdf, page):
pdf,
'-jpeg',
'-f', str(page), '-l', str(page),
'-scale-to', '1024', '-cropbox',
'-scale-to', str(size), '-cropbox',
os.path.join(tmp, 'page')
]
if sys.platform == 'win32':
@ -79,6 +80,47 @@ def page(pdf, page):
shutil.rmtree(tmp)
return data
def crop(pdf, page, left, top, right, bottom):
size = 2048
tmp = tempfile.mkdtemp()
if sys.platform == 'win32':
pdf = get_short_path_name(pdf)
cmd = [
'pdftocairo',
pdf,
'-jpeg',
'-f', str(page), '-l', str(page),
'-scale-to', str(size), '-cropbox',
os.path.join(tmp, 'page')
]
if sys.platform == 'win32':
startupinfo = subprocess.STARTUPINFO()
startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
startupinfo.wShowWindow = subprocess.SW_HIDE
p = subprocess.Popen(cmd, close_fds=True, startupinfo=startupinfo)
else:
p = subprocess.Popen(cmd, close_fds=True)
p.wait()
image = glob('%s/*' % tmp)
if image:
image = image[0]
crop = [int(p) for p in (left, top, right, bottom)]
print(crop)
img = Image.open(image).crop(crop)
img.save(image)
with open(image, 'rb') as fd:
data = fd.read()
else:
logger.debug('pdftocairo %s %s', pdf, ' '.join(cmd))
data = None
shutil.rmtree(tmp)
return data
'''
def page(pdf, page):
image = tempfile.mkstemp('.jpg')[1]
@ -281,3 +323,4 @@ def extract_isbn(text):
isbns = find_isbns(text)
if isbns:
return isbns[0]