extract detail from pdf
This commit is contained in:
parent
fd34ba305c
commit
5bd561e64f
3 changed files with 75 additions and 5 deletions
|
|
@ -10,6 +10,7 @@ from glob import glob
|
|||
from datetime import datetime
|
||||
|
||||
from PyPDF2 import PdfFileReader
|
||||
from PIL import Image
|
||||
import ox
|
||||
|
||||
import settings
|
||||
|
|
@ -24,13 +25,13 @@ def cover(pdf):
|
|||
else:
|
||||
return page(pdf, 1)
|
||||
|
||||
def ql_cover(pdf):
|
||||
def ql_cover(pdf, size=1024):
|
||||
tmp = tempfile.mkdtemp()
|
||||
cmd = [
|
||||
'qlmanage',
|
||||
'-t',
|
||||
'-s',
|
||||
'1024',
|
||||
str(size),
|
||||
'-o',
|
||||
tmp,
|
||||
pdf
|
||||
|
|
@ -48,7 +49,7 @@ def ql_cover(pdf):
|
|||
shutil.rmtree(tmp)
|
||||
return data
|
||||
|
||||
def page(pdf, page):
|
||||
def page(pdf, page, size=1024):
|
||||
tmp = tempfile.mkdtemp()
|
||||
if sys.platform == 'win32':
|
||||
pdf = get_short_path_name(pdf)
|
||||
|
|
@ -57,7 +58,7 @@ def page(pdf, page):
|
|||
pdf,
|
||||
'-jpeg',
|
||||
'-f', str(page), '-l', str(page),
|
||||
'-scale-to', '1024', '-cropbox',
|
||||
'-scale-to', str(size), '-cropbox',
|
||||
os.path.join(tmp, 'page')
|
||||
]
|
||||
if sys.platform == 'win32':
|
||||
|
|
@ -79,6 +80,47 @@ def page(pdf, page):
|
|||
shutil.rmtree(tmp)
|
||||
return data
|
||||
|
||||
def crop(pdf, page, left, top, right, bottom):
    """Render one PDF page with pdftocairo and return a cropped JPEG.

    The page is rendered with ``-scale-to 2048 -cropbox`` into a temporary
    directory, the first file found there is cropped with PIL, re-saved in
    place and read back.

    :param pdf: path to the PDF file.
    :param page: page number, passed to pdftocairo as both ``-f`` and ``-l``.
    :param left, top, right, bottom: crop box handed to ``Image.crop`` after
        conversion with ``int()``; i.e. pixel coordinates in the rendered
        image, not PDF units.
    :returns: the bytes of the cropped image file, or ``None`` when
        pdftocairo produced no output file.
    """
    size = 2048
    tmp = tempfile.mkdtemp()
    try:
        if sys.platform == 'win32':
            # NOTE(review): presumably works around path handling of the
            # external tool on Windows - confirm against get_short_path_name.
            pdf = get_short_path_name(pdf)
        cmd = [
            'pdftocairo',
            pdf,
            '-jpeg',
            '-f', str(page), '-l', str(page),
            '-scale-to', str(size), '-cropbox',
            os.path.join(tmp, 'page')
        ]
        if sys.platform == 'win32':
            # Keep the child process from opening a console window.
            startupinfo = subprocess.STARTUPINFO()
            startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
            startupinfo.wShowWindow = subprocess.SW_HIDE
            p = subprocess.Popen(cmd, close_fds=True, startupinfo=startupinfo)
        else:
            p = subprocess.Popen(cmd, close_fds=True)
        p.wait()

        images = glob('%s/*' % tmp)
        if not images:
            logger.debug('pdftocairo %s %s', pdf, ' '.join(cmd))
            return None

        image = images[0]
        # Named ``box`` so the local does not shadow this function.
        box = [int(value) for value in (left, top, right, bottom)]
        img = Image.open(image).crop(box)
        # Overwrite the rendered page so the output format follows the
        # file extension pdftocairo chose.
        img.save(image)
        with open(image, 'rb') as fd:
            return fd.read()
    finally:
        # Always remove the scratch directory, even when rendering or
        # cropping raises.
        shutil.rmtree(tmp)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
'''
|
||||
def page(pdf, page):
|
||||
image = tempfile.mkstemp('.jpg')[1]
|
||||
|
|
@ -281,3 +323,4 @@ def extract_isbn(text):
|
|||
isbns = find_isbns(text)
|
||||
if isbns:
|
||||
return isbns[0]
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue