update pypdf2

This commit is contained in:
j 2016-02-08 11:50:06 +05:30
parent b8b1fe89bd
commit 66205d529e
19 changed files with 626 additions and 315 deletions

View File

@ -1,32 +0,0 @@
Metadata-Version: 1.1
Name: PyPDF2
Version: 1.23
Summary: PDF toolkit
Home-page: http://mstamy2.github.com/PyPDF2
Author: Phaseit, Inc.
Author-email: PyPDF2@phaseit.net
License: UNKNOWN
Description:
A Pure-Python library built as a PDF toolkit. It is capable of:
- extracting document information (title, author, ...)
- splitting documents page by page
- merging documents page by page
- cropping pages
- merging multiple pages into a single page
- encrypting and decrypting PDF files
- and more!
By being Pure-Python, it should run on any Python platform without any
dependencies on external libraries. It can also work entirely on StringIO
objects rather than file streams, allowing for PDF manipulation in memory.
It is therefore a useful tool for websites that manage or manipulate PDFs.
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: BSD License
Classifier: Programming Language :: Python :: 2
Classifier: Programming Language :: Python :: 3
Classifier: Operating System :: OS Independent
Classifier: Topic :: Software Development :: Libraries :: Python Modules

View File

@ -1,15 +0,0 @@
CHANGELOG
MANIFEST.in
PyPDF2/__init__.py
PyPDF2/_version.py
PyPDF2/filters.py
PyPDF2/generic.py
PyPDF2/merger.py
PyPDF2/pagerange.py
PyPDF2/pdf.py
PyPDF2/utils.py
PyPDF2/xmp.py
PyPDF2.egg-info/PKG-INFO
PyPDF2.egg-info/SOURCES.txt
PyPDF2.egg-info/dependency_links.txt
PyPDF2.egg-info/top_level.txt

View File

@ -1,23 +0,0 @@
../PyPDF2/filters.py
../PyPDF2/generic.py
../PyPDF2/merger.py
../PyPDF2/pagerange.py
../PyPDF2/pdf.py
../PyPDF2/utils.py
../PyPDF2/xmp.py
../PyPDF2/_version.py
../PyPDF2/__init__.py
../PyPDF2/__pycache__/filters.cpython-34.pyc
../PyPDF2/__pycache__/generic.cpython-34.pyc
../PyPDF2/__pycache__/merger.cpython-34.pyc
../PyPDF2/__pycache__/pagerange.cpython-34.pyc
../PyPDF2/__pycache__/pdf.cpython-34.pyc
../PyPDF2/__pycache__/utils.cpython-34.pyc
../PyPDF2/__pycache__/xmp.cpython-34.pyc
../PyPDF2/__pycache__/_version.cpython-34.pyc
../PyPDF2/__pycache__/__init__.cpython-34.pyc
./
top_level.txt
dependency_links.txt
PKG-INFO
SOURCES.txt

View File

@ -0,0 +1,17 @@
A Pure-Python library built as a PDF toolkit. It is capable of:
- extracting document information (title, author, ...)
- splitting documents page by page
- merging documents page by page
- cropping pages
- merging multiple pages into a single page
- encrypting and decrypting PDF files
- and more!
By being Pure-Python, it should run on any Python platform without any
dependencies on external libraries. It can also work entirely on StringIO
objects rather than file streams, allowing for PDF manipulation in memory.
It is therefore a useful tool for websites that manage or manipulate PDFs.

View File

@ -0,0 +1,34 @@
Metadata-Version: 2.0
Name: PyPDF2
Version: 1.25.1
Summary: PDF toolkit
Home-page: http://mstamy2.github.com/PyPDF2
Author: Phaseit, Inc.
Author-email: PyPDF2@phaseit.net
License: UNKNOWN
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: BSD License
Classifier: Programming Language :: Python :: 2
Classifier: Programming Language :: Python :: 3
Classifier: Operating System :: OS Independent
Classifier: Topic :: Software Development :: Libraries :: Python Modules
A Pure-Python library built as a PDF toolkit. It is capable of:
- extracting document information (title, author, ...)
- splitting documents page by page
- merging documents page by page
- cropping pages
- merging multiple pages into a single page
- encrypting and decrypting PDF files
- and more!
By being Pure-Python, it should run on any Python platform without any
dependencies on external libraries. It can also work entirely on StringIO
objects rather than file streams, allowing for PDF manipulation in memory.
It is therefore a useful tool for websites that manage or manipulate PDFs.

View File

@ -0,0 +1,25 @@
PyPDF2/__init__.py,sha256=ugkP-3fEFZZ2-54PmYpjJ5CISEPD5W8TikZlloOJZ5M,210
PyPDF2/_version.py,sha256=ufPT1c1QzU2MdIAGUZ89UoQfl6t3IJdOjhMyLVhsDmQ,23
PyPDF2/filters.py,sha256=U4KQ7fJX129ePxoff-6-009e9kCWlj8_d2ipnm5QDG4,13167
PyPDF2/generic.py,sha256=bJ3e3PpqJCvTHrQ3IH3VEXMh1RWVqiCh9T1IcmkBuAo,45129
PyPDF2/merger.py,sha256=2Cz4QaB8R-Zm3V5P2rI-QYdqMZlN4geaAtNfrPbcTM4,21387
PyPDF2/pagerange.py,sha256=AEMerbVjzXE55sJ2EYZzBgH1Xt4NiUsHaiycoNaW8Ys,5534
PyPDF2/pdf.py,sha256=ceuZWSZIupSbzEzw6QrbNmN9D8PrdM6dh8zHSB9Rg2o,124907
PyPDF2/utils.py,sha256=-ZQky5qa4gsO0zprA8V_E5sTNRBSa_ungvxvxjdHr64,7833
PyPDF2/xmp.py,sha256=vdjDUAMCqb7-AhkuNaqCanviPHMpuJ-5adY8Kxe5jUc,13639
PyPDF2-1.25.1.dist-info/DESCRIPTION.rst,sha256=mCiWyCHYtsbQ22O_f2FbbD8CjW1GMfwvbn67J_THZ5M,600
PyPDF2-1.25.1.dist-info/METADATA,sha256=lGFpbQOrG5_oOYPi4GlzoQT4Lyj3eCvNEHIomSf4JsU,1174
PyPDF2-1.25.1.dist-info/RECORD,,
PyPDF2-1.25.1.dist-info/WHEEL,sha256=bfpjj1zBtYtglW1hWtnRCmhEcEV3TH8magB_ZQeGgSg,93
PyPDF2-1.25.1.dist-info/metadata.json,sha256=aVLfNzdnpxj8hyl12sDq-3IgfGH7t0g5gS2y6LPYtYE,692
PyPDF2-1.25.1.dist-info/top_level.txt,sha256=BERWrwqdvKXaVKhpnMbtO6b11qPA-mBt2r9a0VPF-Ow,7
/srv/openmedialibrary/platform/Shared/home/.local/lib/python3.5/site-packages/PyPDF2-1.25.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
PyPDF2/__pycache__/xmp.cpython-35.pyc,,
PyPDF2/__pycache__/utils.cpython-35.pyc,,
PyPDF2/__pycache__/pdf.cpython-35.pyc,,
PyPDF2/__pycache__/merger.cpython-35.pyc,,
PyPDF2/__pycache__/__init__.cpython-35.pyc,,
PyPDF2/__pycache__/generic.cpython-35.pyc,,
PyPDF2/__pycache__/filters.cpython-35.pyc,,
PyPDF2/__pycache__/pagerange.cpython-35.pyc,,
PyPDF2/__pycache__/_version.cpython-35.pyc,,

View File

@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: bdist_wheel (0.26.0)
Root-Is-Purelib: true
Tag: cp35-none-any

View File

@ -0,0 +1 @@
{"classifiers": ["Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "License :: OSI Approved :: BSD License", "Programming Language :: Python :: 2", "Programming Language :: Python :: 3", "Operating System :: OS Independent", "Topic :: Software Development :: Libraries :: Python Modules"], "extensions": {"python.details": {"contacts": [{"email": "PyPDF2@phaseit.net", "name": "Phaseit, Inc.", "role": "author"}], "document_names": {"description": "DESCRIPTION.rst"}, "project_urls": {"Home": "http://mstamy2.github.com/PyPDF2"}}}, "generator": "bdist_wheel (0.26.0)", "metadata_version": "2.0", "name": "PyPDF2", "summary": "PDF toolkit", "version": "1.25.1"}

View File

@ -1,2 +1 @@
__version__ = '1.23'
__version__ = '1.25.1'

View File

@ -40,28 +40,35 @@ if version_info < ( 3, 0 ):
from cStringIO import StringIO
else:
from io import StringIO
import struct
try:
import zlib
def decompress(data):
return zlib.decompress(data)
def compress(data):
return zlib.compress(data)
except ImportError:
# Unable to import zlib. Attempt to use the System.IO.Compression
# library from the .NET framework. (IronPython only)
import System
from System import IO, Collections, Array
def _string_to_bytearr(buf):
retval = Array.CreateInstance(System.Byte, len(buf))
for i in range(len(buf)):
retval[i] = ord(buf[i])
return retval
def _bytearr_to_string(bytes):
retval = ""
for i in range(bytes.Length):
retval += chr(bytes[i])
return retval
def _read_bytes(stream):
ms = IO.MemoryStream()
buf = Array.CreateInstance(System.Byte, 2048)
@ -74,6 +81,7 @@ except ImportError:
retval = ms.ToArray()
ms.Close()
return retval
def decompress(data):
bytes = _string_to_bytearr(data)
ms = IO.MemoryStream()
@ -84,6 +92,7 @@ except ImportError:
retval = _bytearr_to_string(bytes)
gz.Close()
return retval
def compress(data):
bytes = _string_to_bytearr(data)
ms = IO.MemoryStream()
@ -106,7 +115,7 @@ class FlateDecode(object):
predictor = decodeParms.get("/Predictor", 1)
except AttributeError:
pass # usually an array with a null object was read
# predictor 1 == no predictor
if predictor != 1:
columns = decodeParms["/Columns"]
@ -144,6 +153,7 @@ class FlateDecode(object):
return compress(data)
encode = staticmethod(encode)
class ASCIIHexDecode(object):
def decode(data, decodeParms=None):
retval = ""
@ -165,6 +175,7 @@ class ASCIIHexDecode(object):
return retval
decode = staticmethod(decode)
class LZWDecode(object):
"""Taken from:
http://www.java2s.com/Open-Source/Java-Document/PDF/PDF-Renderer/com/sun/pdfview/decode/LZWDecode.java.htm
@ -184,7 +195,6 @@ class LZWDecode(object):
def resetDict(self):
self.dictlen=258
self.bitspercode=9
def nextCode(self):
fillbits=self.bitspercode
@ -196,8 +206,8 @@ class LZWDecode(object):
bitsfromhere=8-self.bitpos
if bitsfromhere>fillbits:
bitsfromhere=fillbits
value |= (((nextbits >> (8-self.bitpos-bitsfromhere)) &
(0xff >> (8-bitsfromhere))) <<
value |= (((nextbits >> (8-self.bitpos-bitsfromhere)) &
(0xff >> (8-bitsfromhere))) <<
(fillbits-bitsfromhere))
fillbits -= bitsfromhere
self.bitpos += bitsfromhere
@ -235,70 +245,93 @@ class LZWDecode(object):
baos+=p
self.dict[self.dictlen] = p;
self.dictlen+=1
if (self.dictlen >= (1 << self.bitspercode) - 1 and
if (self.dictlen >= (1 << self.bitspercode) - 1 and
self.bitspercode < 12):
self.bitspercode+=1
return baos
@staticmethod
def decode(data,decodeParams=None):
return LZWDecode.decoder(data).decode()
class ASCII85Decode(object):
def decode(data, decodeParms=None):
retval = ""
group = []
x = 0
hitEod = False
# remove all whitespace from data
data = [y for y in data if not (y in ' \n\r\t')]
while not hitEod:
c = data[x]
if len(retval) == 0 and c == "<" and data[x+1] == "~":
x += 2
continue
#elif c.isspace():
# x += 1
# continue
elif c == 'z':
assert len(group) == 0
retval += '\x00\x00\x00\x00'
x += 1
continue
elif c == "~" and data[x+1] == ">":
if len(group) != 0:
# cannot have a final group of just 1 char
assert len(group) > 1
cnt = len(group) - 1
group += [ 85, 85, 85 ]
hitEod = cnt
if version_info < ( 3, 0 ):
retval = ""
group = []
x = 0
hitEod = False
# remove all whitespace from data
data = [y for y in data if not (y in ' \n\r\t')]
while not hitEod:
c = data[x]
if len(retval) == 0 and c == "<" and data[x+1] == "~":
x += 2
continue
#elif c.isspace():
# x += 1
# continue
elif c == 'z':
assert len(group) == 0
retval += '\x00\x00\x00\x00'
x += 1
continue
elif c == "~" and data[x+1] == ">":
if len(group) != 0:
# cannot have a final group of just 1 char
assert len(group) > 1
cnt = len(group) - 1
group += [ 85, 85, 85 ]
hitEod = cnt
else:
break
else:
c = ord(c) - 33
assert c >= 0 and c < 85
group += [ c ]
if len(group) >= 5:
b = group[0] * (85**4) + \
group[1] * (85**3) + \
group[2] * (85**2) + \
group[3] * 85 + \
group[4]
assert b < (2**32 - 1)
c4 = chr((b >> 0) % 256)
c3 = chr((b >> 8) % 256)
c2 = chr((b >> 16) % 256)
c1 = chr(b >> 24)
retval += (c1 + c2 + c3 + c4)
if hitEod:
retval = retval[:-4+hitEod]
group = []
x += 1
return retval
else:
if isinstance(data, str):
data = data.encode('ascii')
n = b = 0
out = bytearray()
for c in data:
if ord('!') <= c and c <= ord('u'):
n += 1
b = b*85+(c-33)
if n == 5:
out += struct.pack(b'>L',b)
n = b = 0
elif c == ord('z'):
assert n == 0
out += b'\0\0\0\0'
elif c == ord('~'):
if n:
for _ in range(5-n):
b = b*85+84
out += struct.pack(b'>L',b)[:n-1]
break
else:
c = ord(c) - 33
assert c >= 0 and c < 85
group += [ c ]
if len(group) >= 5:
b = group[0] * (85**4) + \
group[1] * (85**3) + \
group[2] * (85**2) + \
group[3] * 85 + \
group[4]
assert b < (2**32 - 1)
c4 = chr((b >> 0) % 256)
c3 = chr((b >> 8) % 256)
c2 = chr((b >> 16) % 256)
c1 = chr(b >> 24)
retval += (c1 + c2 + c3 + c4)
if hitEod:
retval = retval[:-4+hitEod]
group = []
x += 1
return retval
return bytes(out)
decode = staticmethod(decode)
def decodeStreamData(stream):
from .generic import NameObject
filters = stream.get("/Filter", ())
@ -306,22 +339,24 @@ def decodeStreamData(stream):
# we have a single filter instance
filters = (filters,)
data = stream._data
for filterType in filters:
if filterType == "/FlateDecode":
data = FlateDecode.decode(data, stream.get("/DecodeParms"))
elif filterType == "/ASCIIHexDecode":
data = ASCIIHexDecode.decode(data)
elif filterType == "/LZWDecode":
data = LZWDecode.decode(data, stream.get("/DecodeParms"))
elif filterType == "/ASCII85Decode":
data = ASCII85Decode.decode(data)
elif filterType == "/Crypt":
decodeParams = stream.get("/DecodeParams", {})
if "/Name" not in decodeParams and "/Type" not in decodeParams:
pass
# If there is no data to decode, we should not try to decode it.
if data:
for filterType in filters:
if filterType == "/FlateDecode" or filterType == "/Fl":
data = FlateDecode.decode(data, stream.get("/DecodeParms"))
elif filterType == "/ASCIIHexDecode" or filterType == "/AHx":
data = ASCIIHexDecode.decode(data)
elif filterType == "/LZWDecode" or filterType == "/LZW":
data = LZWDecode.decode(data, stream.get("/DecodeParms"))
elif filterType == "/ASCII85Decode" or filterType == "/A85":
data = ASCII85Decode.decode(data)
elif filterType == "/Crypt":
decodeParams = stream.get("/DecodeParams", {})
if "/Name" not in decodeParams and "/Type" not in decodeParams:
pass
else:
raise NotImplementedError("/Crypt filter with /Name or /Type not supported yet")
else:
raise NotImplementedError("/Crypt filter with /Name or /Type not supported yet")
else:
# unsupported filter
raise NotImplementedError("unsupported filter %s" % filterType)
# unsupported filter
raise NotImplementedError("unsupported filter %s" % filterType)
return data

View File

@ -43,11 +43,14 @@ from . import filters
from . import utils
import decimal
import codecs
import sys
#import debugging
ObjectPrefix = b_('/<[tf(n%')
NumberSigns = b_('+-')
IndirectPattern = re.compile(b_(r"(\d+)\s+(\d+)\s+R[^a-zA-Z]"))
def readObject(stream, pdf):
tok = stream.read(1)
stream.seek(-1, 1) # reset to start
@ -94,6 +97,7 @@ def readObject(stream, pdf):
else:
return NumberObject.readFromStream(stream)
class PdfObject(object):
def getObject(self):
"""Resolves indirect references."""
@ -225,6 +229,7 @@ class FloatObject(decimal.Decimal, PdfObject):
return decimal.Decimal.__new__(cls, utils.str_(value), context)
except:
return decimal.Decimal.__new__(cls, str(value))
def __repr__(self):
if self == self.to_integral():
return str(self.quantize(decimal.Decimal(1)))
@ -244,7 +249,11 @@ class NumberObject(int, PdfObject):
ByteDot = b_(".")
def __new__(cls, value):
return int.__new__(cls, value)
val = int(value)
try:
return int.__new__(cls, val)
except OverflowError:
return int.__new__(cls, 0)
def as_numeric(self):
return int(b_(repr(self)))
@ -253,16 +262,7 @@ class NumberObject(int, PdfObject):
stream.write(b_(repr(self)))
def readFromStream(stream):
num = b_("")
while True:
tok = stream.read(16)
m = NumberObject.NumberPattern.search(tok)
if m is not None:
stream.seek(m.start() - len(tok), 1)
num += tok[:m.start()]
break
num += tok
num = utils.readUntilRegex(stream, NumberObject.NumberPattern)
if num.find(NumberObject.ByteDot) != -1:
return FloatObject(num)
else:
@ -345,13 +345,18 @@ def readStringFromStream(stream):
tok = b_("\b")
elif tok == b_("f"):
tok = b_("\f")
elif tok == b_("c"):
tok = b_("\c")
elif tok == b_("("):
tok = b_("(")
elif tok == b_(")"):
tok = b_(")")
elif tok == b_("/"):
tok = b_("/")
elif tok == b_("\\"):
tok = b_("\\")
elif tok in (b_(" "), b_("/"), b_("%"), b_("<"), b_(">"), b_("["), b_("]")):
elif tok in (b_(" "), b_("/"), b_("%"), b_("<"), b_(">"), b_("["),
b_("]"), b_("#"), b_("_"), b_("&"), b_('$')):
# odd/unnecessary escape sequences we have encountered
tok = b_(tok)
elif tok.isdigit():
@ -378,7 +383,7 @@ def readStringFromStream(stream):
# line break was escaped:
tok = b_('')
else:
raise utils.PdfReadError("Unexpected escaped string")
raise utils.PdfReadError(r"Unexpected escaped string: %s" % tok)
txt += tok
return createStringObject(txt)
@ -456,7 +461,7 @@ class TextStringObject(utils.string_type, PdfObject):
class NameObject(str, PdfObject):
delimiterPattern = re.compile(b_("\s+|[()<>[\]{}/%]"))
delimiterPattern = re.compile(b_(r"\s+|[\(\)<>\[\]{}/%]"))
surfix = b_("/")
def writeToStream(self, stream, encryption_key):
@ -468,11 +473,12 @@ class NameObject(str, PdfObject):
name = stream.read(1)
if name != NameObject.surfix:
raise utils.PdfReadError("name read error")
name += utils.readUntilRegex(stream, NameObject.delimiterPattern)
name += utils.readUntilRegex(stream, NameObject.delimiterPattern,
ignore_eof=True)
if debug: print(name)
try:
return NameObject(name.decode('utf-8'))
except UnicodeDecodeError as e:
except (UnicodeEncodeError, UnicodeDecodeError) as e:
# Name objects should represent irregular characters
# with a '#' followed by the symbol's hex number
if not pdf.strict:
@ -630,6 +636,7 @@ class DictionaryObject(dict, PdfObject):
return retval
readFromStream = staticmethod(readFromStream)
class TreeObject(DictionaryObject):
def __init__(self):
DictionaryObject.__init__(self)
@ -726,7 +733,6 @@ class TreeObject(DictionaryObject):
found = True
break
prevRef = curRef
prev = cur
if NameObject('/Next') in cur:
@ -938,6 +944,7 @@ class RectangleObject(ArrayObject):
in (x,y) form.
"""
class Field(TreeObject):
"""
A class representing a field dictionary. This class is accessed through
@ -1009,6 +1016,7 @@ class Field(TreeObject):
See Section 8.5.2 of the PDF 1.7 reference.
"""
class Destination(TreeObject):
"""
A class representing a destination within a PDF file.
@ -1157,6 +1165,7 @@ def encode_pdfdocencoding(unicode_string):
"does not exist in translation table")
return retval
def decode_pdfdocencoding(byte_array):
retval = u_('')
for b in byte_array:
@ -1211,4 +1220,3 @@ for i in range(256):
continue
assert char not in _pdfDocEncoding_rev
_pdfDocEncoding_rev[char] = i

View File

@ -28,7 +28,7 @@
# POSSIBILITY OF SUCH DAMAGE.
from .generic import *
from .utils import string_type
from .utils import isString, str_
from .pdf import PdfFileReader, PdfFileWriter
from .pagerange import PageRange
from sys import version_info
@ -40,6 +40,7 @@ else:
from io import FileIO as file
StreamIO = BytesIO
class _MergedPage(object):
"""
_MergedPage is used internally by PdfFileMerger to collect necessary
@ -50,13 +51,14 @@ class _MergedPage(object):
self.pagedata = pagedata
self.out_pagedata = None
self.id = id
class PdfFileMerger(object):
"""
Initializes a PdfFileMerger object. PdfFileMerger merges multiple PDFs
into a single PDF. It can concatenate, slice, insert, or any combination
of the above.
See the functions :meth:`merge()<merge>` (or :meth:`append()<append>`)
and :meth:`write()<write>` for usage information.
@ -64,7 +66,7 @@ class PdfFileMerger(object):
problems and also causes some correctable problems to be fatal.
Defaults to ``True``.
"""
def __init__(self, strict=True):
self.inputs = []
self.pages = []
@ -73,7 +75,7 @@ class PdfFileMerger(object):
self.named_dests = []
self.id_count = 0
self.strict = strict
def merge(self, position, fileobj, bookmark=None, pages=None, import_bookmarks=True):
"""
Merges the pages from the given file into the output file at the
@ -85,29 +87,30 @@ class PdfFileMerger(object):
:param fileobj: A File Object or an object that supports the standard read
and seek methods similar to a File Object. Could also be a
string representing a path to a PDF file.
:param str bookmark: Optionally, you may specify a bookmark to be applied at
the beginning of the included file by supplying the text of the bookmark.
:param pages: can be a :ref:`Page Range <page-range>` or a ``(start, stop[, step])`` tuple
to merge only the specified range of pages from the source
document into the output document.
:param bool import_bookmarks: You may prevent the source document's bookmarks
from being imported by specifying this as ``False``.
"""
# This parameter is passed to self.inputs.append and means
# that the stream used was created in this method.
my_file = False
# If the fileobj parameter is a string, assume it is a path
# and create a file object at that location. If it is a file,
# copy the file's contents into a BytesIO (or StreamIO) stream object; if
# it is a PdfFileReader, copy that reader's stream into a
# copy the file's contents into a BytesIO (or StreamIO) stream object; if
# it is a PdfFileReader, copy that reader's stream into a
# BytesIO (or StreamIO) stream.
# If fileobj is none of the above types, it is not modified
if type(fileobj) == string_type:
decryption_key = None
if isString(fileobj):
fileobj = file(fileobj, 'rb')
my_file = True
elif isinstance(fileobj, file):
@ -116,17 +119,21 @@ class PdfFileMerger(object):
fileobj = StreamIO(filecontent)
my_file = True
elif isinstance(fileobj, PdfFileReader):
orig_tell = fileobj.stream.tell()
orig_tell = fileobj.stream.tell()
fileobj.stream.seek(0)
filecontent = StreamIO(fileobj.stream.read())
fileobj.stream.seek(orig_tell) # reset the stream to its original location
fileobj = filecontent
if hasattr(fileobj, '_decryption_key'):
decryption_key = fileobj._decryption_key
my_file = True
# Create a new PdfFileReader instance using the stream
# (either file or BytesIO or StringIO) created above
pdfr = PdfFileReader(fileobj, strict=self.strict)
if decryption_key is not None:
pdfr._decryption_key = decryption_key
# Find the range of pages to merge.
if pages == None:
pages = (0, pdfr.getNumPages())
@ -134,47 +141,45 @@ class PdfFileMerger(object):
pages = pages.indices(pdfr.getNumPages())
elif not isinstance(pages, tuple):
raise TypeError('"pages" must be a tuple of (start, stop[, step])')
srcpages = []
if bookmark:
bookmark = Bookmark(TextStringObject(bookmark), NumberObject(self.id_count), NameObject('/Fit'))
outline = []
if import_bookmarks:
outline = pdfr.getOutlines()
outline = self._trim_outline(pdfr, outline, pages)
if bookmark:
self.bookmarks += [bookmark, outline]
else:
self.bookmarks += outline
dests = pdfr.namedDestinations
dests = self._trim_dests(pdfr, dests, pages)
self.named_dests += dests
# Gather all the pages that are going to be merged
for i in range(*pages):
pg = pdfr.getPage(i)
id = self.id_count
self.id_count += 1
mp = _MergedPage(pg, pdfr, id)
srcpages.append(mp)
self._associate_dests_to_pages(srcpages)
self._associate_bookmarks_to_pages(srcpages)
# Slice to insert the pages at the specified position
self.pages[position:position] = srcpages
# Keep track of our input files so we can close them later
self.inputs.append((fileobj, pdfr, my_file))
def append(self, fileobj, bookmark=None, pages=None, import_bookmarks=True):
"""
Identical to the :meth:`merge()<merge>` method, but assumes you want to concatenate
@ -183,7 +188,7 @@ class PdfFileMerger(object):
:param fileobj: A File Object or an object that supports the standard read
and seek methods similar to a File Object. Could also be a
string representing a path to a PDF file.
:param str bookmark: Optionally, you may specify a bookmark to be applied at
the beginning of the included file by supplying the text of the bookmark.
@ -194,10 +199,9 @@ class PdfFileMerger(object):
:param bool import_bookmarks: You may prevent the source document's bookmarks
from being imported by specifying this as ``False``.
"""
self.merge(len(self.pages), fileobj, bookmark, pages, import_bookmarks)
def write(self, fileobj):
"""
Writes all data that has been merged to the given output file.
@ -206,11 +210,10 @@ class PdfFileMerger(object):
file-like object.
"""
my_file = False
if type(fileobj) in (str, str):
if isString(fileobj):
fileobj = file(fileobj, 'wb')
my_file = True
# Add pages to the PdfFileWriter
# The commented out line below was replaced with the two lines below it to allow PdfFileMerger to work with PyPdf 1.13
for page in self.pages:
@ -222,15 +225,13 @@ class PdfFileMerger(object):
# Once all pages are added, create bookmarks to point at those pages
self._write_dests()
self._write_bookmarks()
# Write the output to the file
# Write the output to the file
self.output.write(fileobj)
if my_file:
fileobj.close()
def close(self):
"""
Shuts all file descriptors (input and output) and clears all memory
@ -240,7 +241,7 @@ class PdfFileMerger(object):
for fo, pdfr, mine in self.inputs:
if mine:
fo.close()
self.inputs = []
self.output = None
@ -253,7 +254,7 @@ class PdfFileMerger(object):
Example: ``{u'/Title': u'My title'}``
"""
self.output.addMetadata(infos)
def setPageLayout(self, layout):
"""
Set the page layout
@ -289,7 +290,7 @@ class PdfFileMerger(object):
def _trim_dests(self, pdf, dests, pages):
"""
Removes any named destinations that are not a part of the specified
Removes any named destinations that are not a part of the specified
page set.
"""
new_dests = []
@ -298,14 +299,14 @@ class PdfFileMerger(object):
for j in range(*pages):
if pdf.getPage(j).getObject() == o['/Page'].getObject():
o[NameObject('/Page')] = o['/Page'].getObject()
assert str(k) == str(o['/Title'])
assert str_(k) == str_(o['/Title'])
new_dests.append(o)
break
return new_dests
def _trim_outline(self, pdf, outline, pages):
"""
Removes any outline/bookmark entries that are not a part of the
Removes any outline/bookmark entries that are not a part of the
specified page set.
"""
new_outline = []
@ -326,10 +327,10 @@ class PdfFileMerger(object):
prev_header_added = True
break
return new_outline
def _write_dests(self):
dests = self.named_dests
for v in dests:
pageno = None
pdf = None
@ -342,19 +343,18 @@ class PdfFileMerger(object):
break
if pageno != None:
self.output.addNamedDestinationObject(v)
def _write_bookmarks(self, bookmarks=None, parent=None):
if bookmarks == None:
bookmarks = self.bookmarks
last_added = None
for b in bookmarks:
if isinstance(b, list):
self._write_bookmarks(b, last_added)
continue
pageno = None
pdf = None
if '/Page' in b:
@ -410,31 +410,31 @@ class PdfFileMerger(object):
del b['/Left'], b['/Right'], b['/Bottom'], b['/Top']
b[NameObject('/A')] = DictionaryObject({NameObject('/S'): NameObject('/GoTo'), NameObject('/D'): ArrayObject(args)})
pageno = i
pdf = p.src
break
if pageno != None:
del b['/Page'], b['/Type']
last_added = self.output.addBookmarkDict(b, parent)
last_added = self.output.addBookmarkDict(b, parent)
def _associate_dests_to_pages(self, pages):
for nd in self.named_dests:
pageno = None
np = nd['/Page']
if isinstance(np, NumberObject):
continue
for p in pages:
if np.getObject() == p.pagedata.getObject():
pageno = p.id
if pageno != None:
nd[NameObject('/Page')] = NumberObject(pageno)
else:
raise ValueError("Unresolved named destination '%s'" % (nd['/Title'],))
def _associate_bookmarks_to_pages(self, pages, bookmarks=None):
if bookmarks == None:
bookmarks = self.bookmarks
@ -443,35 +443,35 @@ class PdfFileMerger(object):
if isinstance(b, list):
self._associate_bookmarks_to_pages(pages, b)
continue
pageno = None
bp = b['/Page']
if isinstance(bp, NumberObject):
continue
for p in pages:
if bp.getObject() == p.pagedata.getObject():
pageno = p.id
if pageno != None:
b[NameObject('/Page')] = NumberObject(pageno)
else:
raise ValueError("Unresolved bookmark '%s'" % (b['/Title'],))
def findBookmark(self, bookmark, root=None):
if root == None:
root = self.bookmarks
for i, b in enumerate(root):
if isinstance(b, list):
res = self.findBookmark(bookmark, b)
if res:
return [i] + res
elif b == bookmark or b['/Title'] == bookmark:
return [i]
return None
if root == None:
root = self.bookmarks
for i, b in enumerate(root):
if isinstance(b, list):
res = self.findBookmark(bookmark, b)
if res:
return [i] + res
elif b == bookmark or b['/Title'] == bookmark:
return [i]
return None
def addBookmark(self, title, pagenum, parent=None):
"""
@ -483,28 +483,27 @@ class PdfFileMerger(object):
bookmarks.
"""
if parent == None:
iloc = [len(self.bookmarks)-1]
iloc = [len(self.bookmarks)-1]
elif isinstance(parent, list):
iloc = parent
iloc = parent
else:
iloc = self.findBookmark(parent)
iloc = self.findBookmark(parent)
dest = Bookmark(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826))
if parent == None:
self.bookmarks.append(dest)
self.bookmarks.append(dest)
else:
bmparent = self.bookmarks
for i in iloc[:-1]:
bmparent = bmparent[i]
npos = iloc[-1]+1
if npos < len(bmparent) and isinstance(bmparent[npos], list):
bmparent[npos].append(dest)
else:
bmparent.insert(npos, [dest])
bmparent = self.bookmarks
for i in iloc[:-1]:
bmparent = bmparent[i]
npos = iloc[-1]+1
if npos < len(bmparent) and isinstance(bmparent[npos], list):
bmparent[npos].append(dest)
else:
bmparent.insert(npos, [dest])
return dest
def addNamedDestination(self, title, pagenum):
"""
Add a destination to the output.
@ -512,7 +511,7 @@ class PdfFileMerger(object):
:param str title: Title to use
:param int pagenum: Page number this destination points at.
"""
dest = Destination(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826))
self.named_dests.append(dest)
@ -523,12 +522,12 @@ class OutlinesObject(list):
self.tree = tree
self.pdf = pdf
self.parent = parent
def remove(self, index):
obj = self[index]
del self[index]
self.tree.removeChild(obj)
def add(self, title, pagenum):
pageRef = self.pdf.getObject(self.pdf._pages)['/Kids'][pagenum]
action = DictionaryObject()
@ -547,7 +546,7 @@ class OutlinesObject(list):
self.pdf._addObject(bookmark)
self.tree.addChild(bookmark)
def removeAll(self):
for child in [x for x in self.tree.children()]:
self.tree.removeChild(child)

View File

@ -8,7 +8,7 @@ see https://github.com/mstamy2/PyPDF2/blob/master/LICENSE
"""
import re
from .utils import Str
from .utils import isString
_INT_RE = r"(0|-?[1-9]\d*)" # A decimal int, don't allow "-0".
PAGE_RANGE_RE = "^({int}|({int}?(:{int}?(:{int}?)?)))$".format(int=_INT_RE)
@ -32,11 +32,11 @@ PAGE_RANGE_HELP = """Remember, page indices start with zero.
::-1 all pages in reverse order.
"""
class PageRange(object):
"""
"""
A slice-like representation of a range of page indices,
i.e. page numbers, only starting at zero.
i.e. page numbers, only starting at zero.
The syntax is like what you would put between brackets [ ].
The slice is one of the few Python types that can't be subclassed,
but this class converts to and from slices, and allows similar use.
@ -46,7 +46,7 @@ class PageRange(object):
o str() and repr() allow printing.
o indices(n) is like slice.indices(n).
"""
def __init__(self, arg):
"""
Initialize with either a slice -- giving the equivalent page range,
@ -67,8 +67,8 @@ class PageRange(object):
if isinstance(arg, PageRange):
self._slice = arg.to_slice()
return
m = isinstance(arg, Str) and re.match(PAGE_RANGE_RE, arg)
m = isString(arg) and re.match(PAGE_RANGE_RE, arg)
if not m:
raise ParseError(arg)
elif m.group(2):
@ -77,25 +77,25 @@ class PageRange(object):
stop = start + 1 if start != -1 else None
self._slice = slice(start, stop)
else:
self._slice = slice(*[int(g) if g else None
self._slice = slice(*[int(g) if g else None
for g in m.group(4, 6, 8)])
# Just formatting this when there is __doc__ for __init__
if __init__.__doc__:
__init__.__doc__ = __init__.__doc__.format(page_range_help=PAGE_RANGE_HELP)
@staticmethod
def valid(input):
""" True if input is a valid initializer for a PageRange. """
return isinstance(input, slice) or \
isinstance(input, PageRange) or \
(isinstance(input, Str)
(isString(input)
and bool(re.match(PAGE_RANGE_RE, input)))
def to_slice(self):
""" Return the slice equivalent of this page range. """
return self._slice
def __str__(self):
""" A string like "1:2:3". """
s = self._slice
@ -127,7 +127,7 @@ def parse_filename_page_ranges(args):
"""
Given a list of filenames and page ranges, return a list of
(filename, page_range) pairs.
First arg must be a filename; other args are filenames, page-range
First arg must be a filename; other args are filenames, page-range
expressions, slice objects, or PageRange objects.
A filename not followed by a page range indicates all pages of the file.
"""
@ -146,7 +146,7 @@ def parse_filename_page_ranges(args):
# New filename or end of list--do all of the previous file?
if pdf_filename and not did_page_range:
pairs.append( (pdf_filename, PAGE_RANGE_ALL) )
pdf_filename = arg
did_page_range = False
return pairs

View File

@ -63,7 +63,7 @@ import warnings
import codecs
from .generic import *
from .utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList
from .utils import Str, b_, u_, ord_, chr_, str_, string_type, formatWarning
from .utils import isString, b_, u_, ord_, chr_, str_, formatWarning
if version_info < ( 2, 4 ):
from sets import ImmutableSet as frozenset
@ -74,6 +74,7 @@ else:
from hashlib import md5
import uuid
class PdfFileWriter(object):
"""
This class supports writing PDF files out, given pages produced by another
@ -228,6 +229,157 @@ class PdfFileWriter(object):
NameObject("/OpenAction"): self._addObject(js)
})
def addAttachment(self, fname, fdata):
    """
    Embed a file inside the PDF.

    Three objects are built: the embedded file stream holding the data, a
    /Filespec dictionary pointing at that stream, and an /EmbeddedFiles
    name entry that is stored under /Names in the document root.

    :param str fname: The filename to display.
    :param str fdata: The data in the file.

    Reference:
    https://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/PDF32000_2008.pdf
    Section 7.11.3
    """
    # 1. The stream that carries the file's data.  Sample:
    #
    #     8 0 obj
    #     <<
    #      /Length 12
    #      /Type /EmbeddedFile
    #     >>
    #     stream
    #     Hello world!
    #     endstream
    #     endobj
    embedded_stream = DecodedStreamObject()
    embedded_stream.setData(fdata)
    embedded_stream.update({
        NameObject("/Type"): NameObject("/EmbeddedFile")
    })

    # 2. The /Filespec dictionary referring to the stream.  Sample:
    #
    #     7 0 obj
    #     <<
    #      /Type /Filespec
    #      /F (hello.txt)
    #      /EF << /F 8 0 R >>
    #     >>
    ef_dict = DictionaryObject()
    ef_dict.update({NameObject("/F"): embedded_stream})

    filespec = DictionaryObject()
    filespec.update({
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): createStringObject(fname),  # Perhaps also try TextStringObject
        NameObject("/EF"): ef_dict
    })

    # 3. The Catalog entry, which needs a reference to the Filespec.  Sample:
    #
    #     1 0 obj
    #     <<
    #      /Type /Catalog
    #      /Outlines 2 0 R
    #      /Pages 3 0 R
    #      /Names << /EmbeddedFiles << /Names [(hello.txt) 7 0 R] >> >>
    #     >>
    #     endobj
    name_tree = DictionaryObject()
    name_tree.update({
        NameObject("/Names"): ArrayObject([createStringObject(fname), filespec])
    })

    names_entry = DictionaryObject()
    names_entry.update({
        NameObject("/EmbeddedFiles"): name_tree
    })

    # NOTE(review): this sets /Names on the root to a freshly built
    # dictionary; presumably any /Names entry already present (including a
    # previously added attachment) is replaced -- confirm whether merging
    # is wanted.
    self._root_object.update({
        NameObject("/Names"): names_entry
    })
def appendPagesFromReader(self, reader, after_page_append=None):
    """
    Append every page of a reader to this writer, in order.

    :param reader: a PdfFileReader object whose pages are fetched with
        ``getPage`` and added to this writer with ``addPage``.
    :callback after_page_append (function): Optional. When callable, it is
        invoked after each page is appended. Callback signature:

        :param writer_pageref (PDF page reference): The page as returned by
            this writer's ``getPage`` for the position just appended.
    """
    pages_to_copy = reader.getNumPages()
    # Number of pages the writer held before copying; the n-th copied page
    # is looked up at this offset plus n.
    first_new_index = self.getNumPages()
    notify = callable(after_page_append)

    for offset in range(pages_to_copy):
        self.addPage(reader.getPage(offset))
        appended = self.getPage(first_new_index + offset)
        if notify:
            after_page_append(appended)
def updatePageFormFieldValues(self, page, fields):
    '''
    Update the form field values for a given page from a fields dictionary.

    Each annotation on the page whose field name (/T) equals a key of
    ``fields`` gets its value (/V) set to the corresponding text.
    A page without an /Annots entry is left untouched.

    :param page: Page reference from PDF writer where the annotations
        and field data will be updated.
    :param fields: a Python dictionary of field names (/T) and text
        values (/V)
    '''
    # A page with no annotations has no form fields to update; looking up
    # '/Annots' unconditionally would raise KeyError for such pages.
    if '/Annots' not in page:
        return

    for annot_ref in page['/Annots']:
        writer_annot = annot_ref.getObject()
        for field in fields:
            if writer_annot.get('/T') == field:
                writer_annot.update({
                    NameObject("/V"): TextStringObject(fields[field])
                })
def cloneReaderDocumentRoot(self, reader):
    '''
    Use the reader's document root as this writer's root object.

    :param reader: PdfFileReader whose trailer ``/Root`` entry is assigned
        to this writer's ``_root_object``.
    '''
    reader_root = reader.trailer['/Root']
    self._root_object = reader_root
def cloneDocumentFromReader(self, reader, after_page_append=None):
    '''
    Clone a document from a PDF file reader into this writer.

    The reader's document root is taken over first
    (``cloneReaderDocumentRoot``), then its pages are appended
    (``appendPagesFromReader``).

    :param reader: PDF file reader instance from which the clone
        should be created.
    :callback after_page_append (function): Optional callback passed
        through unchanged to ``appendPagesFromReader``. Callback signature:

        :param writer_pageref (PDF page reference): Reference to the page
            just appended to the document.
    '''
    # Root first, pages second.
    self.cloneReaderDocumentRoot(reader)
    self.appendPagesFromReader(reader, after_page_append)
def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True):
"""
Encrypt this PDF file with the PDF Standard encryption handler.
@ -516,7 +668,6 @@ class PdfFileWriter(object):
return bookmarkRef
def addBookmark(self, title, pagenum, parent=None, color=None, bold=False, italic=False, fit='/Fit', *args):
"""
Add a bookmark to this PDF file.
@ -553,7 +704,6 @@ class PdfFileWriter(object):
if parent == None:
parent = outlineRef
bookmark = TreeObject()
bookmark.update({
@ -759,7 +909,7 @@ class PdfFileWriter(object):
else:
borderArr = [NumberObject(0)] * 3
if isinstance(rect, Str):
if isString(rect):
rect = NameObject(rect)
elif isinstance(rect, RectangleObject):
pass
@ -871,6 +1021,7 @@ class PdfFileWriter(object):
"""Read and write property accessing the :meth:`getPageMode()<PdfFileWriter.getPageMode>`
and :meth:`setPageMode()<PdfFileWriter.setPageMode>` methods."""
class PdfFileReader(object):
"""
Initializes a PdfFileReader object. This operation can take some time, as
@ -904,9 +1055,10 @@ class PdfFileReader(object):
self.flattenedPages = None
self.resolvedObjects = {}
self.xrefIndex = 0
self._pageId2Num = None # map page IndirectRef number to Page Number
if hasattr(stream, 'mode') and 'b' not in stream.mode:
warnings.warn("PdfFileReader stream/file object is not in binary mode. It may not be read correctly.", utils.PdfReadWarning)
if type(stream) in (string_type, str):
if isString(stream):
fileobj = open(stream, 'rb')
stream = BytesIO(b_(fileobj.read()))
fileobj.close()
@ -973,6 +1125,7 @@ class PdfFileReader(object):
if self.isEncrypted:
try:
self._override_encryption = True
self.decrypt('')
return self.trailer["/Root"]["/Pages"]["/Count"]
except:
raise utils.PdfReadError("File has not been decrypted")
@ -1160,7 +1313,14 @@ class PdfFileReader(object):
# get the outline dictionary and named destinations
if "/Outlines" in catalog:
lines = catalog["/Outlines"]
try:
lines = catalog["/Outlines"]
except utils.PdfReadError:
# this occurs if the /Outlines object reference is incorrect
# for an example of such a file, see https://unglueit-files.s3.amazonaws.com/ebf/7552c42e9280b4476e59e77acc0bc812.pdf
# so continue to load the file without the Bookmarks
return outlines
if "/First" in lines:
node = lines["/First"]
self._namedDests = self.getNamedDestinations()
@ -1187,6 +1347,49 @@ class PdfFileReader(object):
return outlines
def _getPageNumberByIndirect(self, indirectRef):
    """
    Return the zero-based page number for a page reference, or -1.

    On first use, a table mapping each page's ``indirectRef.idnum`` to its
    position in ``self.pages`` is built and cached in ``self._pageId2Num``.

    :param indirectRef: either an int object id, or an object exposing
        an ``idnum`` attribute.
    :return: the page number, or -1 if the id is not in the table.
    :rtype: int
    """
    if self._pageId2Num is None:
        self._pageId2Num = dict(
            (pg.indirectRef.idnum, number)
            for number, pg in enumerate(self.pages)
        )

    idnum = indirectRef if isinstance(indirectRef, int) else indirectRef.idnum
    return self._pageId2Num.get(idnum, -1)
def getPageNumber(self, page):
    """
    Retrieve page number of a given PageObject

    :param PageObject page: The page to get page number. Should be
        an instance of :class:`PageObject<PyPDF2.pdf.PageObject>`
    :return: the page number or -1 if page not found
    :rtype: int
    """
    # The page is identified through its indirect reference.
    return self._getPageNumberByIndirect(page.indirectRef)
def getDestinationPageNumber(self, destination):
    """
    Retrieve page number of a given Destination object

    :param Destination destination: The destination to get page number.
        Should be an instance of
        :class:`Destination<PyPDF2.pdf.Destination>`
    :return: the page number or -1 if page not found
    :rtype: int
    """
    # The destination's ``page`` attribute is what gets looked up.
    return self._getPageNumberByIndirect(destination.page)
def _buildDestination(self, title, array):
page, typ = array[0:2]
array = array[2:]
@ -1210,7 +1413,7 @@ class PdfFileReader(object):
if dest:
if isinstance(dest, ArrayObject):
outline = self._buildDestination(title, dest)
elif isinstance(dest, Str) and dest in self._namedDests:
elif isString(dest) and dest in self._namedDests:
outline = self._namedDests[dest]
outline[NameObject("/Title")] = title
else:
@ -1310,6 +1513,8 @@ class PdfFileReader(object):
assert idx < objStm['/N']
streamData = BytesIO(b_(objStm.getData()))
for i in range(objStm['/N']):
readNonWhitespace(streamData)
streamData.seek(-1, 1)
objnum = NumberObject.readFromStream(streamData)
readNonWhitespace(streamData)
streamData.seek(-1, 1)
@ -1347,7 +1552,6 @@ class PdfFileReader(object):
if self.strict: raise utils.PdfReadError("This is a fatal error in strict mode.")
return NullObject()
def getObject(self, indirectReference):
debug = False
if debug: print(("looking at:", indirectReference.idnum, indirectReference.generation))
@ -1470,7 +1674,7 @@ class PdfFileReader(object):
startxref = int(line)
except ValueError:
# 'startxref' may be on the same line as the location
if not line.startswith("startxref"):
if not line.startswith(b_("startxref")):
raise utils.PdfReadError("startxref not found")
startxref = int(line[9:].strip())
warnings.warn("startxref on same line as offset")
@ -1580,6 +1784,7 @@ class PdfFileReader(object):
assert len(entrySizes) >= 3
if self.strict and len(entrySizes) > 3:
raise utils.PdfReadError("Too many entry sizes: %s" %entrySizes)
def getEntry(i):
# Reads the correct number of bytes for each entry. See the
# discussion of the W parameter in PDF spec table 17.
@ -1664,8 +1869,7 @@ class PdfFileReader(object):
if found:
continue
# no xref table found at specified location
assert False
break
raise utils.PdfReadError("Could not find xref table at specified location")
#if not zero-indexed, verify that the table is correct; change it if necessary
if self.xrefIndex and not self.strict:
loc = stream.tell()
@ -1683,7 +1887,6 @@ class PdfFileReader(object):
#if not, then either it's just plain wrong, or the non-zero-index is actually correct
stream.seek(loc, 0) #return to where it was
def _zeroXref(self, generation):
self.xref[generation] = dict( (k-self.xrefIndex, v) for (k, v) in list(self.xref[generation].items()) )
@ -1700,8 +1903,13 @@ class PdfFileReader(object):
if debug: print(">>readNextEndLine")
line = b_("")
while True:
# Prevent infinite loops in malformed PDFs
if stream.tell() == 0:
raise utils.PdfReadError("Could not read malformed PDF file")
x = stream.read(1)
if debug: print((" x:", x, "%x"%ord(x)))
if stream.tell() < 2:
raise utils.PdfReadError("EOL marker not found")
stream.seek(-2, 1)
if x == b_('\n') or x == b_('\r'): ## \n = LF; \r = CR
crlf = False
@ -1713,6 +1921,8 @@ class PdfFileReader(object):
if x == b_('\n') or x == b_('\r'): # account for CR+LF
stream.seek(-1, 1)
crlf = True
if stream.tell() < 2:
raise utils.PdfReadError("EOL marker not found")
stream.seek(-2, 1)
stream.seek(2 if crlf else 1, 1) #if using CR+LF, go back 2 bytes, else 1
break
@ -1827,14 +2037,17 @@ def getRectangle(self, name, defaults):
setRectangle(self, name, retval)
return retval
def setRectangle(self, name, value):
    """
    Store value under name, wrapping name in a NameObject when it is not
    one already.
    """
    key = name if isinstance(name, NameObject) else NameObject(name)
    self[key] = value
def deleteRectangle(self, name):
    """Delete the entry stored under name."""
    del self[name]
def createRectangleAccessor(name, fallback):
return \
property(
@ -1843,6 +2056,7 @@ def createRectangleAccessor(name, fallback):
lambda self: deleteRectangle(self, name)
)
class PageObject(DictionaryObject):
"""
This class represents a single page within a PDF file. Typically this
@ -2374,6 +2588,7 @@ class PageObject(DictionaryObject):
for i in operands[0]:
if isinstance(i, TextStringObject):
text += i
text += "\n"
return text
mediaBox = createRectangleAccessor("/MediaBox", ())
@ -2412,6 +2627,7 @@ class PageObject(DictionaryObject):
page's creator.
"""
class ContentStream(DecodedStreamObject):
def __init__(self, stream, pdf):
self.pdf = pdf
@ -2437,25 +2653,25 @@ class ContentStream(DecodedStreamObject):
if peek == b_('') or ord_(peek) == 0:
break
stream.seek(-1, 1)
if peek.isalpha() or peek == "'" or peek == '"':
if peek.isalpha() or peek == b_("'") or peek == b_('"'):
operator = utils.readUntilRegex(stream,
NameObject.delimiterPattern, True)
if operator == "BI":
if operator == b_("BI"):
# begin inline image - a completely different parsing
# mechanism is required, of course... thanks buddy...
assert operands == []
ii = self._readInlineImage(stream)
self.operations.append((ii, "INLINE IMAGE"))
self.operations.append((ii, b_("INLINE IMAGE")))
else:
self.operations.append((operands, operator))
operands = []
elif peek == '%':
elif peek == b_('%'):
# If we encounter a comment in the content stream, we have to
# handle it here. Typically, readObject will handle
# encountering a comment -- but readObject assumes that
# following the comment must be the object we're trying to
# read. In this case, it could be an operator instead.
while peek not in ('\r', '\n'):
while peek not in (b_('\r'), b_('\n')):
peek = stream.read(1)
else:
operands.append(readObject(stream, None))
@ -2467,7 +2683,7 @@ class ContentStream(DecodedStreamObject):
while True:
tok = readNonWhitespace(stream)
stream.seek(-1, 1)
if tok == "I":
if tok == b_("I"):
# "ID" - begin of image data
break
key = readObject(stream, self.pdf)
@ -2477,28 +2693,32 @@ class ContentStream(DecodedStreamObject):
settings[key] = value
# left at beginning of ID
tmp = stream.read(3)
assert tmp[:2] == "ID"
data = ""
assert tmp[:2] == b_("ID")
data = b_("")
while True:
# Read the inline image, while checking for EI (End Image) operator.
tok = stream.read(1)
if tok == "E":
if tok == b_("E"):
# Check for End Image
next1 = stream.read(1)
if next1 == "I":
next2 = readNonWhitespace(stream)
if next2 == 'Q':
tok2 = stream.read(1)
if tok2 == b_("I"):
# Sometimes that data will contain EI, so check for the Q operator.
tok3 = stream.read(1)
info = tok + tok2
while tok3 in utils.WHITESPACES:
info += tok3
tok3 = stream.read(1)
if tok3 == b_("Q"):
stream.seek(-1, 1)
break
else:
stream.seek(-2, 1)
data += tok
stream.seek(-1,1)
data += info
else:
stream.seek(-1, 1)
data += tok
else:
data += tok
x = readNonWhitespace(stream)
stream.seek(-1, 1)
return {"settings": settings, "data": data}
def _getData(self):
@ -2525,6 +2745,7 @@ class ContentStream(DecodedStreamObject):
_data = property(_getData, _setData)
class DocumentInformation(DictionaryObject):
"""
A class representing the basic document metadata provided in a PDF File.
@ -2588,6 +2809,7 @@ class DocumentInformation(DictionaryObject):
producer_raw = property(lambda self: self.get("/Producer"))
"""The "raw" version of producer; can return a ``ByteStringObject``."""
def convertToInt(d, size):
if size > 8:
raise utils.PdfReadError("invalid size in convertToInt")
@ -2600,6 +2822,7 @@ _encryption_padding = b_('\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56') + \
b_('\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c') + \
b_('\xa9\xfe\x64\x53\x69\x7a')
# Implementation of algorithm 3.2 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference.
def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True):
@ -2643,6 +2866,7 @@ def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr
# entry.
return md5_hash[:keylen]
# Implementation of algorithm 3.3 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference.
def _alg33(owner_pwd, user_pwd, rev, keylen):
@ -2670,6 +2894,7 @@ def _alg33(owner_pwd, user_pwd, rev, keylen):
# the /O entry in the encryption dictionary.
return val
# Steps 1-4 of algorithm 3.3
def _alg33_1(password, rev, keylen):
# 1. Pad or truncate the owner password string as described in step 1 of
@ -2692,6 +2917,7 @@ def _alg33_1(password, rev, keylen):
key = md5_hash[:keylen]
return key
# Implementation of algorithm 3.4 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference.
def _alg34(password, owner_entry, p_entry, id1_entry):
@ -2706,6 +2932,7 @@ def _alg34(password, owner_entry, p_entry, id1_entry):
# encryption dictionary.
return U, key
# Implementation of algorithm 3.5 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference.
def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt):

View File

@ -33,25 +33,35 @@ __author_email__ = "biziqe@mathieu.fenniak.net"
import sys
# "Str" maintains compatibility with Python 2.x.
# The next line is obfuscated like this so 2to3 won't change it.
try:
import __builtin__ as builtins
except ImportError: # Py3
import builtins
if sys.version_info[0] < 3:
string_type = unicode
bytes_type = str
int_types = (int, long)
else:
string_type = str
bytes_type = bytes
int_types = (int,)
xrange_fn = getattr(builtins, "xrange", range)
_basestring = getattr(builtins, "basestring", str)
Xrange = getattr(builtins, "xrange", range)
Str = getattr(builtins, "basestring", str)
bytes_type = type(bytes()) # Works the same in Python 2.X and 3.X
string_type = getattr(builtins, "unicode", str)
int_types = (int, long) if sys.version_info[0] < 3 else (int,)
# Make basic type tests more consistent
def isString(s):
    """Return True if s is an instance of the module's base string type (works on Python 2 and 3)."""
    result = isinstance(s, _basestring)
    return result
def isInt(n):
    """Return True if n is an instance of one of the types in int_types (works on Python 2 and 3)."""
    result = isinstance(n, int_types)
    return result
def isBytes(b):
    """Return True if b is an instance of bytes_type (works on Python 2 and 3)."""
    result = isinstance(b, bytes_type)
    return result
#custom implementation of warnings.formatwarning
@ -59,6 +69,7 @@ def formatWarning(message, category, filename, lineno, line=None):
file = filename.replace("/", "\\").rsplit("\\", 1)[1] # find the file name
return "%s: %s [%s:%s]\n" % (category.__name__, message, file, lineno)
def readUntilWhitespace(stream, maxchars=None):
"""
Reads non-whitespace characters and returns them.
@ -74,6 +85,7 @@ def readUntilWhitespace(stream, maxchars=None):
break
return txt
def readNonWhitespace(stream):
"""
Finds and reads the next non-whitespace character (ignores whitespace).
@ -83,6 +95,7 @@ def readNonWhitespace(stream):
tok = stream.read(1)
return tok
def skipOverWhitespace(stream):
"""
Similar to readNonWhitespace, but returns a Boolean if more than
@ -95,6 +108,7 @@ def skipOverWhitespace(stream):
cnt+=1
return (cnt > 1)
def skipOverComment(stream):
tok = stream.read(1)
stream.seek(-1, 1)
@ -102,6 +116,7 @@ def skipOverComment(stream):
while tok not in (b_('\n'), b_('\r')):
tok = stream.read(1)
def readUntilRegex(stream, regex, ignore_eof=False):
"""
Reads until the regular expression pattern matched (ignore the match)
@ -125,6 +140,7 @@ def readUntilRegex(stream, regex, ignore_eof=False):
name += tok
return name
class ConvertFunctionsToVirtualList(object):
def __init__(self, lengthFunction, getFunction):
self.lengthFunction = lengthFunction
@ -135,10 +151,10 @@ class ConvertFunctionsToVirtualList(object):
def __getitem__(self, index):
if isinstance(index, slice):
indices = Xrange(*index.indices(len(self)))
indices = xrange_fn(*index.indices(len(self)))
cls = type(self)
return cls(indices.__len__, lambda idx: self[indices[idx]])
if not isinstance(index, int_types):
if not isInt(index):
raise TypeError("sequence indices must be integers")
len_self = len(self)
if index < 0:
@ -148,6 +164,7 @@ class ConvertFunctionsToVirtualList(object):
raise IndexError("sequence index out of range")
return self.getFunction(index)
def RC4_encrypt(key, plaintext):
S = [i for i in range(256)]
j = 0
@ -164,12 +181,14 @@ def RC4_encrypt(key, plaintext):
retval += b_(chr(ord_(plaintext[x]) ^ t))
return retval
def matrixMultiply(a, b):
    """
    Return the matrix product of a and b as a list of row lists.

    Both arguments are sequences of rows. Every element is converted to
    float before multiplying, so the entries of the result are floats.
    """
    product = []
    for row in a:
        out_row = []
        # zip(*b) yields the columns of b.
        for col in zip(*b):
            out_row.append(sum([float(x) * float(y) for x, y in zip(row, col)]))
        product.append(out_row)
    return product
def markLocation(stream):
"""Creates text file showing current location in context."""
# Mainly for debugging
@ -182,18 +201,23 @@ def markLocation(stream):
outputDoc.close()
stream.seek(-RADIUS, 1)
class PyPdfError(Exception):
    """Base class of the error exceptions defined in this module."""


class PdfReadError(PyPdfError):
    """Error raised when a PDF cannot be read or parsed."""


class PageSizeNotDefinedError(PyPdfError):
    """PyPdfError subclass. NOTE(review): presumably raised when a page
    size is missing -- confirm against callers."""


class PdfReadWarning(UserWarning):
    """Warning category used for non-fatal problems while reading a PDF."""


class PdfStreamError(PdfReadError):
    """PdfReadError subclass. NOTE(review): presumably for stream-level
    read failures -- confirm against callers."""
@ -203,6 +227,7 @@ if sys.version_info[0] < 3:
return s
else:
B_CACHE = {}
def b_(s):
bc = B_CACHE
if s in bc:
@ -214,6 +239,8 @@ else:
if len(s) < 2:
bc[s] = r
return r
def u_(s):
if sys.version_info[0] < 3:
return unicode(s, 'unicode_escape')
@ -230,24 +257,28 @@ def str_(b):
else:
return b
def ord_(b):
    """
    Return the integer code of b.

    On Python 2, or whenever b is exactly a str, this is ord(b);
    otherwise b is returned unchanged.
    """
    needs_ord = sys.version_info[0] < 3 or type(b) == str
    return ord(b) if needs_ord else b
def chr_(c):
    """Return c unchanged on Python 2, and chr(c) on Python 3."""
    running_py2 = sys.version_info[0] < 3
    return c if running_py2 else chr(c)
def barray(b):
    """Return b unchanged on Python 2, and bytearray(b) on Python 3."""
    running_py2 = sys.version_info[0] < 3
    return b if running_py2 else bytearray(b)
def hexencode(b):
if sys.version_info[0] < 3:
return b.encode('hex')
@ -256,6 +287,7 @@ def hexencode(b):
coder = codecs.getencoder('hex_codec')
return coder(b)[0]
def hexStr(num):
    """Return hex(num) with every 'L' character removed (the suffix Python 2 adds for longs)."""
    text = hex(num)
    return text.replace('L', '')

View File

@ -50,6 +50,7 @@ iso8601 = re.compile("""
)?
""", re.VERBOSE)
class XmpInformation(PdfObject):
"""
An object that represents Adobe XMP metadata.
@ -355,5 +356,3 @@ class XmpInformation(PdfObject):
:return: a dictionary of key/value items for custom metadata properties.
:rtype: dict
"""