update pypdf2

This commit is contained in:
j 2016-02-08 11:50:06 +05:30
parent b8b1fe89bd
commit 66205d529e
19 changed files with 626 additions and 315 deletions

View file

@ -1,32 +0,0 @@
Metadata-Version: 1.1
Name: PyPDF2
Version: 1.23
Summary: PDF toolkit
Home-page: http://mstamy2.github.com/PyPDF2
Author: Phaseit, Inc.
Author-email: PyPDF2@phaseit.net
License: UNKNOWN
Description:
A Pure-Python library built as a PDF toolkit. It is capable of:
- extracting document information (title, author, ...)
- splitting documents page by page
- merging documents page by page
- cropping pages
- merging multiple pages into a single page
- encrypting and decrypting PDF files
- and more!
By being Pure-Python, it should run on any Python platform without any
dependencies on external libraries. It can also work entirely on StringIO
objects rather than file streams, allowing for PDF manipulation in memory.
It is therefore a useful tool for websites that manage or manipulate PDFs.
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: BSD License
Classifier: Programming Language :: Python :: 2
Classifier: Programming Language :: Python :: 3
Classifier: Operating System :: OS Independent
Classifier: Topic :: Software Development :: Libraries :: Python Modules

View file

@ -1,15 +0,0 @@
CHANGELOG
MANIFEST.in
PyPDF2/__init__.py
PyPDF2/_version.py
PyPDF2/filters.py
PyPDF2/generic.py
PyPDF2/merger.py
PyPDF2/pagerange.py
PyPDF2/pdf.py
PyPDF2/utils.py
PyPDF2/xmp.py
PyPDF2.egg-info/PKG-INFO
PyPDF2.egg-info/SOURCES.txt
PyPDF2.egg-info/dependency_links.txt
PyPDF2.egg-info/top_level.txt

View file

@ -1,23 +0,0 @@
../PyPDF2/filters.py
../PyPDF2/generic.py
../PyPDF2/merger.py
../PyPDF2/pagerange.py
../PyPDF2/pdf.py
../PyPDF2/utils.py
../PyPDF2/xmp.py
../PyPDF2/_version.py
../PyPDF2/__init__.py
../PyPDF2/__pycache__/filters.cpython-34.pyc
../PyPDF2/__pycache__/generic.cpython-34.pyc
../PyPDF2/__pycache__/merger.cpython-34.pyc
../PyPDF2/__pycache__/pagerange.cpython-34.pyc
../PyPDF2/__pycache__/pdf.cpython-34.pyc
../PyPDF2/__pycache__/utils.cpython-34.pyc
../PyPDF2/__pycache__/xmp.cpython-34.pyc
../PyPDF2/__pycache__/_version.cpython-34.pyc
../PyPDF2/__pycache__/__init__.cpython-34.pyc
./
top_level.txt
dependency_links.txt
PKG-INFO
SOURCES.txt

View file

@ -0,0 +1,17 @@
A Pure-Python library built as a PDF toolkit. It is capable of:
- extracting document information (title, author, ...)
- splitting documents page by page
- merging documents page by page
- cropping pages
- merging multiple pages into a single page
- encrypting and decrypting PDF files
- and more!
By being Pure-Python, it should run on any Python platform without any
dependencies on external libraries. It can also work entirely on StringIO
objects rather than file streams, allowing for PDF manipulation in memory.
It is therefore a useful tool for websites that manage or manipulate PDFs.

View file

@ -0,0 +1,34 @@
Metadata-Version: 2.0
Name: PyPDF2
Version: 1.25.1
Summary: PDF toolkit
Home-page: http://mstamy2.github.com/PyPDF2
Author: Phaseit, Inc.
Author-email: PyPDF2@phaseit.net
License: UNKNOWN
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: BSD License
Classifier: Programming Language :: Python :: 2
Classifier: Programming Language :: Python :: 3
Classifier: Operating System :: OS Independent
Classifier: Topic :: Software Development :: Libraries :: Python Modules
A Pure-Python library built as a PDF toolkit. It is capable of:
- extracting document information (title, author, ...)
- splitting documents page by page
- merging documents page by page
- cropping pages
- merging multiple pages into a single page
- encrypting and decrypting PDF files
- and more!
By being Pure-Python, it should run on any Python platform without any
dependencies on external libraries. It can also work entirely on StringIO
objects rather than file streams, allowing for PDF manipulation in memory.
It is therefore a useful tool for websites that manage or manipulate PDFs.

View file

@ -0,0 +1,25 @@
PyPDF2/__init__.py,sha256=ugkP-3fEFZZ2-54PmYpjJ5CISEPD5W8TikZlloOJZ5M,210
PyPDF2/_version.py,sha256=ufPT1c1QzU2MdIAGUZ89UoQfl6t3IJdOjhMyLVhsDmQ,23
PyPDF2/filters.py,sha256=U4KQ7fJX129ePxoff-6-009e9kCWlj8_d2ipnm5QDG4,13167
PyPDF2/generic.py,sha256=bJ3e3PpqJCvTHrQ3IH3VEXMh1RWVqiCh9T1IcmkBuAo,45129
PyPDF2/merger.py,sha256=2Cz4QaB8R-Zm3V5P2rI-QYdqMZlN4geaAtNfrPbcTM4,21387
PyPDF2/pagerange.py,sha256=AEMerbVjzXE55sJ2EYZzBgH1Xt4NiUsHaiycoNaW8Ys,5534
PyPDF2/pdf.py,sha256=ceuZWSZIupSbzEzw6QrbNmN9D8PrdM6dh8zHSB9Rg2o,124907
PyPDF2/utils.py,sha256=-ZQky5qa4gsO0zprA8V_E5sTNRBSa_ungvxvxjdHr64,7833
PyPDF2/xmp.py,sha256=vdjDUAMCqb7-AhkuNaqCanviPHMpuJ-5adY8Kxe5jUc,13639
PyPDF2-1.25.1.dist-info/DESCRIPTION.rst,sha256=mCiWyCHYtsbQ22O_f2FbbD8CjW1GMfwvbn67J_THZ5M,600
PyPDF2-1.25.1.dist-info/METADATA,sha256=lGFpbQOrG5_oOYPi4GlzoQT4Lyj3eCvNEHIomSf4JsU,1174
PyPDF2-1.25.1.dist-info/RECORD,,
PyPDF2-1.25.1.dist-info/WHEEL,sha256=bfpjj1zBtYtglW1hWtnRCmhEcEV3TH8magB_ZQeGgSg,93
PyPDF2-1.25.1.dist-info/metadata.json,sha256=aVLfNzdnpxj8hyl12sDq-3IgfGH7t0g5gS2y6LPYtYE,692
PyPDF2-1.25.1.dist-info/top_level.txt,sha256=BERWrwqdvKXaVKhpnMbtO6b11qPA-mBt2r9a0VPF-Ow,7
/srv/openmedialibrary/platform/Shared/home/.local/lib/python3.5/site-packages/PyPDF2-1.25.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
PyPDF2/__pycache__/xmp.cpython-35.pyc,,
PyPDF2/__pycache__/utils.cpython-35.pyc,,
PyPDF2/__pycache__/pdf.cpython-35.pyc,,
PyPDF2/__pycache__/merger.cpython-35.pyc,,
PyPDF2/__pycache__/__init__.cpython-35.pyc,,
PyPDF2/__pycache__/generic.cpython-35.pyc,,
PyPDF2/__pycache__/filters.cpython-35.pyc,,
PyPDF2/__pycache__/pagerange.cpython-35.pyc,,
PyPDF2/__pycache__/_version.cpython-35.pyc,,

View file

@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: bdist_wheel (0.26.0)
Root-Is-Purelib: true
Tag: cp35-none-any

View file

@ -0,0 +1 @@
{"classifiers": ["Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "License :: OSI Approved :: BSD License", "Programming Language :: Python :: 2", "Programming Language :: Python :: 3", "Operating System :: OS Independent", "Topic :: Software Development :: Libraries :: Python Modules"], "extensions": {"python.details": {"contacts": [{"email": "PyPDF2@phaseit.net", "name": "Phaseit, Inc.", "role": "author"}], "document_names": {"description": "DESCRIPTION.rst"}, "project_urls": {"Home": "http://mstamy2.github.com/PyPDF2"}}}, "generator": "bdist_wheel (0.26.0)", "metadata_version": "2.0", "name": "PyPDF2", "summary": "PDF toolkit", "version": "1.25.1"}

View file

@ -1,2 +1 @@
__version__ = '1.23' __version__ = '1.25.1'

View file

@ -40,28 +40,35 @@ if version_info < ( 3, 0 ):
from cStringIO import StringIO from cStringIO import StringIO
else: else:
from io import StringIO from io import StringIO
import struct
try: try:
import zlib import zlib
def decompress(data): def decompress(data):
return zlib.decompress(data) return zlib.decompress(data)
def compress(data): def compress(data):
return zlib.compress(data) return zlib.compress(data)
except ImportError: except ImportError:
# Unable to import zlib. Attempt to use the System.IO.Compression # Unable to import zlib. Attempt to use the System.IO.Compression
# library from the .NET framework. (IronPython only) # library from the .NET framework. (IronPython only)
import System import System
from System import IO, Collections, Array from System import IO, Collections, Array
def _string_to_bytearr(buf): def _string_to_bytearr(buf):
retval = Array.CreateInstance(System.Byte, len(buf)) retval = Array.CreateInstance(System.Byte, len(buf))
for i in range(len(buf)): for i in range(len(buf)):
retval[i] = ord(buf[i]) retval[i] = ord(buf[i])
return retval return retval
def _bytearr_to_string(bytes): def _bytearr_to_string(bytes):
retval = "" retval = ""
for i in range(bytes.Length): for i in range(bytes.Length):
retval += chr(bytes[i]) retval += chr(bytes[i])
return retval return retval
def _read_bytes(stream): def _read_bytes(stream):
ms = IO.MemoryStream() ms = IO.MemoryStream()
buf = Array.CreateInstance(System.Byte, 2048) buf = Array.CreateInstance(System.Byte, 2048)
@ -74,6 +81,7 @@ except ImportError:
retval = ms.ToArray() retval = ms.ToArray()
ms.Close() ms.Close()
return retval return retval
def decompress(data): def decompress(data):
bytes = _string_to_bytearr(data) bytes = _string_to_bytearr(data)
ms = IO.MemoryStream() ms = IO.MemoryStream()
@ -84,6 +92,7 @@ except ImportError:
retval = _bytearr_to_string(bytes) retval = _bytearr_to_string(bytes)
gz.Close() gz.Close()
return retval return retval
def compress(data): def compress(data):
bytes = _string_to_bytearr(data) bytes = _string_to_bytearr(data)
ms = IO.MemoryStream() ms = IO.MemoryStream()
@ -106,7 +115,7 @@ class FlateDecode(object):
predictor = decodeParms.get("/Predictor", 1) predictor = decodeParms.get("/Predictor", 1)
except AttributeError: except AttributeError:
pass # usually an array with a null object was read pass # usually an array with a null object was read
# predictor 1 == no predictor # predictor 1 == no predictor
if predictor != 1: if predictor != 1:
columns = decodeParms["/Columns"] columns = decodeParms["/Columns"]
@ -144,6 +153,7 @@ class FlateDecode(object):
return compress(data) return compress(data)
encode = staticmethod(encode) encode = staticmethod(encode)
class ASCIIHexDecode(object): class ASCIIHexDecode(object):
def decode(data, decodeParms=None): def decode(data, decodeParms=None):
retval = "" retval = ""
@ -165,6 +175,7 @@ class ASCIIHexDecode(object):
return retval return retval
decode = staticmethod(decode) decode = staticmethod(decode)
class LZWDecode(object): class LZWDecode(object):
"""Taken from: """Taken from:
http://www.java2s.com/Open-Source/Java-Document/PDF/PDF-Renderer/com/sun/pdfview/decode/LZWDecode.java.htm http://www.java2s.com/Open-Source/Java-Document/PDF/PDF-Renderer/com/sun/pdfview/decode/LZWDecode.java.htm
@ -184,7 +195,6 @@ class LZWDecode(object):
def resetDict(self): def resetDict(self):
self.dictlen=258 self.dictlen=258
self.bitspercode=9 self.bitspercode=9
def nextCode(self): def nextCode(self):
fillbits=self.bitspercode fillbits=self.bitspercode
@ -196,8 +206,8 @@ class LZWDecode(object):
bitsfromhere=8-self.bitpos bitsfromhere=8-self.bitpos
if bitsfromhere>fillbits: if bitsfromhere>fillbits:
bitsfromhere=fillbits bitsfromhere=fillbits
value |= (((nextbits >> (8-self.bitpos-bitsfromhere)) & value |= (((nextbits >> (8-self.bitpos-bitsfromhere)) &
(0xff >> (8-bitsfromhere))) << (0xff >> (8-bitsfromhere))) <<
(fillbits-bitsfromhere)) (fillbits-bitsfromhere))
fillbits -= bitsfromhere fillbits -= bitsfromhere
self.bitpos += bitsfromhere self.bitpos += bitsfromhere
@ -235,70 +245,93 @@ class LZWDecode(object):
baos+=p baos+=p
self.dict[self.dictlen] = p; self.dict[self.dictlen] = p;
self.dictlen+=1 self.dictlen+=1
if (self.dictlen >= (1 << self.bitspercode) - 1 and if (self.dictlen >= (1 << self.bitspercode) - 1 and
self.bitspercode < 12): self.bitspercode < 12):
self.bitspercode+=1 self.bitspercode+=1
return baos return baos
@staticmethod @staticmethod
def decode(data,decodeParams=None): def decode(data,decodeParams=None):
return LZWDecode.decoder(data).decode() return LZWDecode.decoder(data).decode()
class ASCII85Decode(object): class ASCII85Decode(object):
def decode(data, decodeParms=None): def decode(data, decodeParms=None):
retval = "" if version_info < ( 3, 0 ):
group = [] retval = ""
x = 0 group = []
hitEod = False x = 0
# remove all whitespace from data hitEod = False
data = [y for y in data if not (y in ' \n\r\t')] # remove all whitespace from data
while not hitEod: data = [y for y in data if not (y in ' \n\r\t')]
c = data[x] while not hitEod:
if len(retval) == 0 and c == "<" and data[x+1] == "~": c = data[x]
x += 2 if len(retval) == 0 and c == "<" and data[x+1] == "~":
continue x += 2
#elif c.isspace(): continue
# x += 1 #elif c.isspace():
# continue # x += 1
elif c == 'z': # continue
assert len(group) == 0 elif c == 'z':
retval += '\x00\x00\x00\x00' assert len(group) == 0
x += 1 retval += '\x00\x00\x00\x00'
continue x += 1
elif c == "~" and data[x+1] == ">": continue
if len(group) != 0: elif c == "~" and data[x+1] == ">":
# cannot have a final group of just 1 char if len(group) != 0:
assert len(group) > 1 # cannot have a final group of just 1 char
cnt = len(group) - 1 assert len(group) > 1
group += [ 85, 85, 85 ] cnt = len(group) - 1
hitEod = cnt group += [ 85, 85, 85 ]
hitEod = cnt
else:
break
else: else:
c = ord(c) - 33
assert c >= 0 and c < 85
group += [ c ]
if len(group) >= 5:
b = group[0] * (85**4) + \
group[1] * (85**3) + \
group[2] * (85**2) + \
group[3] * 85 + \
group[4]
assert b < (2**32 - 1)
c4 = chr((b >> 0) % 256)
c3 = chr((b >> 8) % 256)
c2 = chr((b >> 16) % 256)
c1 = chr(b >> 24)
retval += (c1 + c2 + c3 + c4)
if hitEod:
retval = retval[:-4+hitEod]
group = []
x += 1
return retval
else:
if isinstance(data, str):
data = data.encode('ascii')
n = b = 0
out = bytearray()
for c in data:
if ord('!') <= c and c <= ord('u'):
n += 1
b = b*85+(c-33)
if n == 5:
out += struct.pack(b'>L',b)
n = b = 0
elif c == ord('z'):
assert n == 0
out += b'\0\0\0\0'
elif c == ord('~'):
if n:
for _ in range(5-n):
b = b*85+84
out += struct.pack(b'>L',b)[:n-1]
break break
else: return bytes(out)
c = ord(c) - 33
assert c >= 0 and c < 85
group += [ c ]
if len(group) >= 5:
b = group[0] * (85**4) + \
group[1] * (85**3) + \
group[2] * (85**2) + \
group[3] * 85 + \
group[4]
assert b < (2**32 - 1)
c4 = chr((b >> 0) % 256)
c3 = chr((b >> 8) % 256)
c2 = chr((b >> 16) % 256)
c1 = chr(b >> 24)
retval += (c1 + c2 + c3 + c4)
if hitEod:
retval = retval[:-4+hitEod]
group = []
x += 1
return retval
decode = staticmethod(decode) decode = staticmethod(decode)
def decodeStreamData(stream): def decodeStreamData(stream):
from .generic import NameObject from .generic import NameObject
filters = stream.get("/Filter", ()) filters = stream.get("/Filter", ())
@ -306,22 +339,24 @@ def decodeStreamData(stream):
# we have a single filter instance # we have a single filter instance
filters = (filters,) filters = (filters,)
data = stream._data data = stream._data
for filterType in filters: # If there is not data to decode we should not try to decode the data.
if filterType == "/FlateDecode": if data:
data = FlateDecode.decode(data, stream.get("/DecodeParms")) for filterType in filters:
elif filterType == "/ASCIIHexDecode": if filterType == "/FlateDecode" or filterType == "/Fl":
data = ASCIIHexDecode.decode(data) data = FlateDecode.decode(data, stream.get("/DecodeParms"))
elif filterType == "/LZWDecode": elif filterType == "/ASCIIHexDecode" or filterType == "/AHx":
data = LZWDecode.decode(data, stream.get("/DecodeParms")) data = ASCIIHexDecode.decode(data)
elif filterType == "/ASCII85Decode": elif filterType == "/LZWDecode" or filterType == "/LZW":
data = ASCII85Decode.decode(data) data = LZWDecode.decode(data, stream.get("/DecodeParms"))
elif filterType == "/Crypt": elif filterType == "/ASCII85Decode" or filterType == "/A85":
decodeParams = stream.get("/DecodeParams", {}) data = ASCII85Decode.decode(data)
if "/Name" not in decodeParams and "/Type" not in decodeParams: elif filterType == "/Crypt":
pass decodeParams = stream.get("/DecodeParams", {})
if "/Name" not in decodeParams and "/Type" not in decodeParams:
pass
else:
raise NotImplementedError("/Crypt filter with /Name or /Type not supported yet")
else: else:
raise NotImplementedError("/Crypt filter with /Name or /Type not supported yet") # unsupported filter
else: raise NotImplementedError("unsupported filter %s" % filterType)
# unsupported filter
raise NotImplementedError("unsupported filter %s" % filterType)
return data return data

View file

@ -43,11 +43,14 @@ from . import filters
from . import utils from . import utils
import decimal import decimal
import codecs import codecs
import sys
#import debugging #import debugging
ObjectPrefix = b_('/<[tf(n%') ObjectPrefix = b_('/<[tf(n%')
NumberSigns = b_('+-') NumberSigns = b_('+-')
IndirectPattern = re.compile(b_(r"(\d+)\s+(\d+)\s+R[^a-zA-Z]")) IndirectPattern = re.compile(b_(r"(\d+)\s+(\d+)\s+R[^a-zA-Z]"))
def readObject(stream, pdf): def readObject(stream, pdf):
tok = stream.read(1) tok = stream.read(1)
stream.seek(-1, 1) # reset to start stream.seek(-1, 1) # reset to start
@ -94,6 +97,7 @@ def readObject(stream, pdf):
else: else:
return NumberObject.readFromStream(stream) return NumberObject.readFromStream(stream)
class PdfObject(object): class PdfObject(object):
def getObject(self): def getObject(self):
"""Resolves indirect references.""" """Resolves indirect references."""
@ -225,6 +229,7 @@ class FloatObject(decimal.Decimal, PdfObject):
return decimal.Decimal.__new__(cls, utils.str_(value), context) return decimal.Decimal.__new__(cls, utils.str_(value), context)
except: except:
return decimal.Decimal.__new__(cls, str(value)) return decimal.Decimal.__new__(cls, str(value))
def __repr__(self): def __repr__(self):
if self == self.to_integral(): if self == self.to_integral():
return str(self.quantize(decimal.Decimal(1))) return str(self.quantize(decimal.Decimal(1)))
@ -244,7 +249,11 @@ class NumberObject(int, PdfObject):
ByteDot = b_(".") ByteDot = b_(".")
def __new__(cls, value): def __new__(cls, value):
return int.__new__(cls, value) val = int(value)
try:
return int.__new__(cls, val)
except OverflowError:
return int.__new__(cls, 0)
def as_numeric(self): def as_numeric(self):
return int(b_(repr(self))) return int(b_(repr(self)))
@ -253,16 +262,7 @@ class NumberObject(int, PdfObject):
stream.write(b_(repr(self))) stream.write(b_(repr(self)))
def readFromStream(stream): def readFromStream(stream):
num = b_("") num = utils.readUntilRegex(stream, NumberObject.NumberPattern)
while True:
tok = stream.read(16)
m = NumberObject.NumberPattern.search(tok)
if m is not None:
stream.seek(m.start() - len(tok), 1)
num += tok[:m.start()]
break
num += tok
if num.find(NumberObject.ByteDot) != -1: if num.find(NumberObject.ByteDot) != -1:
return FloatObject(num) return FloatObject(num)
else: else:
@ -345,13 +345,18 @@ def readStringFromStream(stream):
tok = b_("\b") tok = b_("\b")
elif tok == b_("f"): elif tok == b_("f"):
tok = b_("\f") tok = b_("\f")
elif tok == b_("c"):
tok = b_("\c")
elif tok == b_("("): elif tok == b_("("):
tok = b_("(") tok = b_("(")
elif tok == b_(")"): elif tok == b_(")"):
tok = b_(")") tok = b_(")")
elif tok == b_("/"):
tok = b_("/")
elif tok == b_("\\"): elif tok == b_("\\"):
tok = b_("\\") tok = b_("\\")
elif tok in (b_(" "), b_("/"), b_("%"), b_("<"), b_(">"), b_("["), b_("]")): elif tok in (b_(" "), b_("/"), b_("%"), b_("<"), b_(">"), b_("["),
b_("]"), b_("#"), b_("_"), b_("&"), b_('$')):
# odd/unnessecary escape sequences we have encountered # odd/unnessecary escape sequences we have encountered
tok = b_(tok) tok = b_(tok)
elif tok.isdigit(): elif tok.isdigit():
@ -378,7 +383,7 @@ def readStringFromStream(stream):
# line break was escaped: # line break was escaped:
tok = b_('') tok = b_('')
else: else:
raise utils.PdfReadError("Unexpected escaped string") raise utils.PdfReadError(r"Unexpected escaped string: %s" % tok)
txt += tok txt += tok
return createStringObject(txt) return createStringObject(txt)
@ -456,7 +461,7 @@ class TextStringObject(utils.string_type, PdfObject):
class NameObject(str, PdfObject): class NameObject(str, PdfObject):
delimiterPattern = re.compile(b_("\s+|[()<>[\]{}/%]")) delimiterPattern = re.compile(b_(r"\s+|[\(\)<>\[\]{}/%]"))
surfix = b_("/") surfix = b_("/")
def writeToStream(self, stream, encryption_key): def writeToStream(self, stream, encryption_key):
@ -468,11 +473,12 @@ class NameObject(str, PdfObject):
name = stream.read(1) name = stream.read(1)
if name != NameObject.surfix: if name != NameObject.surfix:
raise utils.PdfReadError("name read error") raise utils.PdfReadError("name read error")
name += utils.readUntilRegex(stream, NameObject.delimiterPattern) name += utils.readUntilRegex(stream, NameObject.delimiterPattern,
ignore_eof=True)
if debug: print(name) if debug: print(name)
try: try:
return NameObject(name.decode('utf-8')) return NameObject(name.decode('utf-8'))
except UnicodeDecodeError as e: except (UnicodeEncodeError, UnicodeDecodeError) as e:
# Name objects should represent irregular characters # Name objects should represent irregular characters
# with a '#' followed by the symbol's hex number # with a '#' followed by the symbol's hex number
if not pdf.strict: if not pdf.strict:
@ -630,6 +636,7 @@ class DictionaryObject(dict, PdfObject):
return retval return retval
readFromStream = staticmethod(readFromStream) readFromStream = staticmethod(readFromStream)
class TreeObject(DictionaryObject): class TreeObject(DictionaryObject):
def __init__(self): def __init__(self):
DictionaryObject.__init__(self) DictionaryObject.__init__(self)
@ -726,7 +733,6 @@ class TreeObject(DictionaryObject):
found = True found = True
break break
prevRef = curRef prevRef = curRef
prev = cur prev = cur
if NameObject('/Next') in cur: if NameObject('/Next') in cur:
@ -938,6 +944,7 @@ class RectangleObject(ArrayObject):
in (x,y) form. in (x,y) form.
""" """
class Field(TreeObject): class Field(TreeObject):
""" """
A class representing a field dictionary. This class is accessed through A class representing a field dictionary. This class is accessed through
@ -1009,6 +1016,7 @@ class Field(TreeObject):
See Section 8.5.2 of the PDF 1.7 reference. See Section 8.5.2 of the PDF 1.7 reference.
""" """
class Destination(TreeObject): class Destination(TreeObject):
""" """
A class representing a destination within a PDF file. A class representing a destination within a PDF file.
@ -1157,6 +1165,7 @@ def encode_pdfdocencoding(unicode_string):
"does not exist in translation table") "does not exist in translation table")
return retval return retval
def decode_pdfdocencoding(byte_array): def decode_pdfdocencoding(byte_array):
retval = u_('') retval = u_('')
for b in byte_array: for b in byte_array:
@ -1211,4 +1220,3 @@ for i in range(256):
continue continue
assert char not in _pdfDocEncoding_rev assert char not in _pdfDocEncoding_rev
_pdfDocEncoding_rev[char] = i _pdfDocEncoding_rev[char] = i

View file

@ -28,7 +28,7 @@
# POSSIBILITY OF SUCH DAMAGE. # POSSIBILITY OF SUCH DAMAGE.
from .generic import * from .generic import *
from .utils import string_type from .utils import isString, str_
from .pdf import PdfFileReader, PdfFileWriter from .pdf import PdfFileReader, PdfFileWriter
from .pagerange import PageRange from .pagerange import PageRange
from sys import version_info from sys import version_info
@ -40,6 +40,7 @@ else:
from io import FileIO as file from io import FileIO as file
StreamIO = BytesIO StreamIO = BytesIO
class _MergedPage(object): class _MergedPage(object):
""" """
_MergedPage is used internally by PdfFileMerger to collect necessary _MergedPage is used internally by PdfFileMerger to collect necessary
@ -50,13 +51,14 @@ class _MergedPage(object):
self.pagedata = pagedata self.pagedata = pagedata
self.out_pagedata = None self.out_pagedata = None
self.id = id self.id = id
class PdfFileMerger(object): class PdfFileMerger(object):
""" """
Initializes a PdfFileMerger object. PdfFileMerger merges multiple PDFs Initializes a PdfFileMerger object. PdfFileMerger merges multiple PDFs
into a single PDF. It can concatenate, slice, insert, or any combination into a single PDF. It can concatenate, slice, insert, or any combination
of the above. of the above.
See the functions :meth:`merge()<merge>` (or :meth:`append()<append>`) See the functions :meth:`merge()<merge>` (or :meth:`append()<append>`)
and :meth:`write()<write>` for usage information. and :meth:`write()<write>` for usage information.
@ -64,7 +66,7 @@ class PdfFileMerger(object):
problems and also causes some correctable problems to be fatal. problems and also causes some correctable problems to be fatal.
Defaults to ``True``. Defaults to ``True``.
""" """
def __init__(self, strict=True): def __init__(self, strict=True):
self.inputs = [] self.inputs = []
self.pages = [] self.pages = []
@ -73,7 +75,7 @@ class PdfFileMerger(object):
self.named_dests = [] self.named_dests = []
self.id_count = 0 self.id_count = 0
self.strict = strict self.strict = strict
def merge(self, position, fileobj, bookmark=None, pages=None, import_bookmarks=True): def merge(self, position, fileobj, bookmark=None, pages=None, import_bookmarks=True):
""" """
Merges the pages from the given file into the output file at the Merges the pages from the given file into the output file at the
@ -85,29 +87,30 @@ class PdfFileMerger(object):
:param fileobj: A File Object or an object that supports the standard read :param fileobj: A File Object or an object that supports the standard read
and seek methods similar to a File Object. Could also be a and seek methods similar to a File Object. Could also be a
string representing a path to a PDF file. string representing a path to a PDF file.
:param str bookmark: Optionally, you may specify a bookmark to be applied at :param str bookmark: Optionally, you may specify a bookmark to be applied at
the beginning of the included file by supplying the text of the bookmark. the beginning of the included file by supplying the text of the bookmark.
:param pages: can be a :ref:`Page Range <page-range>` or a ``(start, stop[, step])`` tuple :param pages: can be a :ref:`Page Range <page-range>` or a ``(start, stop[, step])`` tuple
to merge only the specified range of pages from the source to merge only the specified range of pages from the source
document into the output document. document into the output document.
:param bool import_bookmarks: You may prevent the source document's bookmarks :param bool import_bookmarks: You may prevent the source document's bookmarks
from being imported by specifying this as ``False``. from being imported by specifying this as ``False``.
""" """
# This parameter is passed to self.inputs.append and means # This parameter is passed to self.inputs.append and means
# that the stream used was created in this method. # that the stream used was created in this method.
my_file = False my_file = False
# If the fileobj parameter is a string, assume it is a path # If the fileobj parameter is a string, assume it is a path
# and create a file object at that location. If it is a file, # and create a file object at that location. If it is a file,
# copy the file's contents into a BytesIO (or StreamIO) stream object; if # copy the file's contents into a BytesIO (or StreamIO) stream object; if
# it is a PdfFileReader, copy that reader's stream into a # it is a PdfFileReader, copy that reader's stream into a
# BytesIO (or StreamIO) stream. # BytesIO (or StreamIO) stream.
# If fileobj is none of the above types, it is not modified # If fileobj is none of the above types, it is not modified
if type(fileobj) == string_type: decryption_key = None
if isString(fileobj):
fileobj = file(fileobj, 'rb') fileobj = file(fileobj, 'rb')
my_file = True my_file = True
elif isinstance(fileobj, file): elif isinstance(fileobj, file):
@ -116,17 +119,21 @@ class PdfFileMerger(object):
fileobj = StreamIO(filecontent) fileobj = StreamIO(filecontent)
my_file = True my_file = True
elif isinstance(fileobj, PdfFileReader): elif isinstance(fileobj, PdfFileReader):
orig_tell = fileobj.stream.tell() orig_tell = fileobj.stream.tell()
fileobj.stream.seek(0) fileobj.stream.seek(0)
filecontent = StreamIO(fileobj.stream.read()) filecontent = StreamIO(fileobj.stream.read())
fileobj.stream.seek(orig_tell) # reset the stream to its original location fileobj.stream.seek(orig_tell) # reset the stream to its original location
fileobj = filecontent fileobj = filecontent
if hasattr(fileobj, '_decryption_key'):
decryption_key = fileobj._decryption_key
my_file = True my_file = True
# Create a new PdfFileReader instance using the stream # Create a new PdfFileReader instance using the stream
# (either file or BytesIO or StringIO) created above # (either file or BytesIO or StringIO) created above
pdfr = PdfFileReader(fileobj, strict=self.strict) pdfr = PdfFileReader(fileobj, strict=self.strict)
if decryption_key is not None:
pdfr._decryption_key = decryption_key
# Find the range of pages to merge. # Find the range of pages to merge.
if pages == None: if pages == None:
pages = (0, pdfr.getNumPages()) pages = (0, pdfr.getNumPages())
@ -134,47 +141,45 @@ class PdfFileMerger(object):
pages = pages.indices(pdfr.getNumPages()) pages = pages.indices(pdfr.getNumPages())
elif not isinstance(pages, tuple): elif not isinstance(pages, tuple):
raise TypeError('"pages" must be a tuple of (start, stop[, step])') raise TypeError('"pages" must be a tuple of (start, stop[, step])')
srcpages = [] srcpages = []
if bookmark: if bookmark:
bookmark = Bookmark(TextStringObject(bookmark), NumberObject(self.id_count), NameObject('/Fit')) bookmark = Bookmark(TextStringObject(bookmark), NumberObject(self.id_count), NameObject('/Fit'))
outline = [] outline = []
if import_bookmarks: if import_bookmarks:
outline = pdfr.getOutlines() outline = pdfr.getOutlines()
outline = self._trim_outline(pdfr, outline, pages) outline = self._trim_outline(pdfr, outline, pages)
if bookmark: if bookmark:
self.bookmarks += [bookmark, outline] self.bookmarks += [bookmark, outline]
else: else:
self.bookmarks += outline self.bookmarks += outline
dests = pdfr.namedDestinations dests = pdfr.namedDestinations
dests = self._trim_dests(pdfr, dests, pages) dests = self._trim_dests(pdfr, dests, pages)
self.named_dests += dests self.named_dests += dests
# Gather all the pages that are going to be merged # Gather all the pages that are going to be merged
for i in range(*pages): for i in range(*pages):
pg = pdfr.getPage(i) pg = pdfr.getPage(i)
id = self.id_count id = self.id_count
self.id_count += 1 self.id_count += 1
mp = _MergedPage(pg, pdfr, id) mp = _MergedPage(pg, pdfr, id)
srcpages.append(mp) srcpages.append(mp)
self._associate_dests_to_pages(srcpages) self._associate_dests_to_pages(srcpages)
self._associate_bookmarks_to_pages(srcpages) self._associate_bookmarks_to_pages(srcpages)
# Slice to insert the pages at the specified position # Slice to insert the pages at the specified position
self.pages[position:position] = srcpages self.pages[position:position] = srcpages
# Keep track of our input files so we can close them later # Keep track of our input files so we can close them later
self.inputs.append((fileobj, pdfr, my_file)) self.inputs.append((fileobj, pdfr, my_file))
def append(self, fileobj, bookmark=None, pages=None, import_bookmarks=True): def append(self, fileobj, bookmark=None, pages=None, import_bookmarks=True):
""" """
Identical to the :meth:`merge()<merge>` method, but assumes you want to concatenate Identical to the :meth:`merge()<merge>` method, but assumes you want to concatenate
@ -183,7 +188,7 @@ class PdfFileMerger(object):
:param fileobj: A File Object or an object that supports the standard read :param fileobj: A File Object or an object that supports the standard read
and seek methods similar to a File Object. Could also be a and seek methods similar to a File Object. Could also be a
string representing a path to a PDF file. string representing a path to a PDF file.
:param str bookmark: Optionally, you may specify a bookmark to be applied at :param str bookmark: Optionally, you may specify a bookmark to be applied at
the beginning of the included file by supplying the text of the bookmark. the beginning of the included file by supplying the text of the bookmark.
@ -194,10 +199,9 @@ class PdfFileMerger(object):
:param bool import_bookmarks: You may prevent the source document's bookmarks :param bool import_bookmarks: You may prevent the source document's bookmarks
from being imported by specifying this as ``False``. from being imported by specifying this as ``False``.
""" """
self.merge(len(self.pages), fileobj, bookmark, pages, import_bookmarks) self.merge(len(self.pages), fileobj, bookmark, pages, import_bookmarks)
def write(self, fileobj): def write(self, fileobj):
""" """
Writes all data that has been merged to the given output file. Writes all data that has been merged to the given output file.
@ -206,11 +210,10 @@ class PdfFileMerger(object):
file-like object. file-like object.
""" """
my_file = False my_file = False
if type(fileobj) in (str, str): if isString(fileobj):
fileobj = file(fileobj, 'wb') fileobj = file(fileobj, 'wb')
my_file = True my_file = True
# Add pages to the PdfFileWriter # Add pages to the PdfFileWriter
# The commented out line below was replaced with the two lines below it to allow PdfFileMerger to work with PyPdf 1.13 # The commented out line below was replaced with the two lines below it to allow PdfFileMerger to work with PyPdf 1.13
for page in self.pages: for page in self.pages:
@ -222,15 +225,13 @@ class PdfFileMerger(object):
# Once all pages are added, create bookmarks to point at those pages # Once all pages are added, create bookmarks to point at those pages
self._write_dests() self._write_dests()
self._write_bookmarks() self._write_bookmarks()
# Write the output to the file # Write the output to the file
self.output.write(fileobj) self.output.write(fileobj)
if my_file: if my_file:
fileobj.close() fileobj.close()
def close(self): def close(self):
""" """
Shuts all file descriptors (input and output) and clears all memory Shuts all file descriptors (input and output) and clears all memory
@ -240,7 +241,7 @@ class PdfFileMerger(object):
for fo, pdfr, mine in self.inputs: for fo, pdfr, mine in self.inputs:
if mine: if mine:
fo.close() fo.close()
self.inputs = [] self.inputs = []
self.output = None self.output = None
@ -253,7 +254,7 @@ class PdfFileMerger(object):
Example: ``{u'/Title': u'My title'}`` Example: ``{u'/Title': u'My title'}``
""" """
self.output.addMetadata(infos) self.output.addMetadata(infos)
def setPageLayout(self, layout): def setPageLayout(self, layout):
""" """
Set the page layout Set the page layout
@ -289,7 +290,7 @@ class PdfFileMerger(object):
def _trim_dests(self, pdf, dests, pages): def _trim_dests(self, pdf, dests, pages):
""" """
Removes any named destinations that are not a part of the specified Removes any named destinations that are not a part of the specified
page set. page set.
""" """
new_dests = [] new_dests = []
@ -298,14 +299,14 @@ class PdfFileMerger(object):
for j in range(*pages): for j in range(*pages):
if pdf.getPage(j).getObject() == o['/Page'].getObject(): if pdf.getPage(j).getObject() == o['/Page'].getObject():
o[NameObject('/Page')] = o['/Page'].getObject() o[NameObject('/Page')] = o['/Page'].getObject()
assert str(k) == str(o['/Title']) assert str_(k) == str_(o['/Title'])
new_dests.append(o) new_dests.append(o)
break break
return new_dests return new_dests
def _trim_outline(self, pdf, outline, pages): def _trim_outline(self, pdf, outline, pages):
""" """
Removes any outline/bookmark entries that are not a part of the Removes any outline/bookmark entries that are not a part of the
specified page set. specified page set.
""" """
new_outline = [] new_outline = []
@ -326,10 +327,10 @@ class PdfFileMerger(object):
prev_header_added = True prev_header_added = True
break break
return new_outline return new_outline
def _write_dests(self): def _write_dests(self):
dests = self.named_dests dests = self.named_dests
for v in dests: for v in dests:
pageno = None pageno = None
pdf = None pdf = None
@ -342,19 +343,18 @@ class PdfFileMerger(object):
break break
if pageno != None: if pageno != None:
self.output.addNamedDestinationObject(v) self.output.addNamedDestinationObject(v)
def _write_bookmarks(self, bookmarks=None, parent=None): def _write_bookmarks(self, bookmarks=None, parent=None):
if bookmarks == None: if bookmarks == None:
bookmarks = self.bookmarks bookmarks = self.bookmarks
last_added = None last_added = None
for b in bookmarks: for b in bookmarks:
if isinstance(b, list): if isinstance(b, list):
self._write_bookmarks(b, last_added) self._write_bookmarks(b, last_added)
continue continue
pageno = None pageno = None
pdf = None pdf = None
if '/Page' in b: if '/Page' in b:
@ -410,31 +410,31 @@ class PdfFileMerger(object):
del b['/Left'], b['/Right'], b['/Bottom'], b['/Top'] del b['/Left'], b['/Right'], b['/Bottom'], b['/Top']
b[NameObject('/A')] = DictionaryObject({NameObject('/S'): NameObject('/GoTo'), NameObject('/D'): ArrayObject(args)}) b[NameObject('/A')] = DictionaryObject({NameObject('/S'): NameObject('/GoTo'), NameObject('/D'): ArrayObject(args)})
pageno = i pageno = i
pdf = p.src pdf = p.src
break break
if pageno != None: if pageno != None:
del b['/Page'], b['/Type'] del b['/Page'], b['/Type']
last_added = self.output.addBookmarkDict(b, parent) last_added = self.output.addBookmarkDict(b, parent)
def _associate_dests_to_pages(self, pages):
    """
    Replace each named destination's ``/Page`` entry with a page number.

    Destinations whose ``/Page`` is already a ``NumberObject`` are left
    untouched.  For the others, ``/Page`` is compared against the
    ``pagedata`` of every entry in ``pages``; the ``id`` of the last
    matching entry is stored back as a ``NumberObject``.

    :param pages: iterable of page wrappers exposing ``pagedata`` and ``id``.
    :raises ValueError: if a destination matches none of ``pages``.
    """
    for dest in self.named_dests:
        target = dest['/Page']

        # Already resolved to a number earlier.
        if isinstance(target, NumberObject):
            continue

        resolved = None
        for candidate in pages:
            # No early exit: when several pages match, the last one wins.
            if target.getObject() == candidate.pagedata.getObject():
                resolved = candidate.id

        if resolved is None:
            raise ValueError("Unresolved named destination '%s'" % (dest['/Title'],))
        dest[NameObject('/Page')] = NumberObject(resolved)
def _associate_bookmarks_to_pages(self, pages, bookmarks=None): def _associate_bookmarks_to_pages(self, pages, bookmarks=None):
if bookmarks == None: if bookmarks == None:
bookmarks = self.bookmarks bookmarks = self.bookmarks
@ -443,35 +443,35 @@ class PdfFileMerger(object):
if isinstance(b, list): if isinstance(b, list):
self._associate_bookmarks_to_pages(pages, b) self._associate_bookmarks_to_pages(pages, b)
continue continue
pageno = None pageno = None
bp = b['/Page'] bp = b['/Page']
if isinstance(bp, NumberObject): if isinstance(bp, NumberObject):
continue continue
for p in pages: for p in pages:
if bp.getObject() == p.pagedata.getObject(): if bp.getObject() == p.pagedata.getObject():
pageno = p.id pageno = p.id
if pageno != None: if pageno != None:
b[NameObject('/Page')] = NumberObject(pageno) b[NameObject('/Page')] = NumberObject(pageno)
else: else:
raise ValueError("Unresolved bookmark '%s'" % (b['/Title'],)) raise ValueError("Unresolved bookmark '%s'" % (b['/Title'],))
def findBookmark(self, bookmark, root=None):
    """
    Locate a bookmark in the (possibly nested) bookmark list.

    :param bookmark: either the bookmark object itself or its ``/Title``.
    :param root: list to search; defaults to ``self.bookmarks``.
    :return: list of indices leading to the first match (one index per
        nesting level), or ``None`` when nothing matches.
    """
    entries = self.bookmarks if root is None else root

    for position, entry in enumerate(entries):
        if not isinstance(entry, list):
            # Leaf: match on the object itself or on its title.
            if entry == bookmark or entry['/Title'] == bookmark:
                return [position]
            continue

        # Nested list of child bookmarks: search it recursively.
        nested_path = self.findBookmark(bookmark, entry)
        if nested_path:
            return [position] + nested_path

    return None
def addBookmark(self, title, pagenum, parent=None): def addBookmark(self, title, pagenum, parent=None):
""" """
@ -483,28 +483,27 @@ class PdfFileMerger(object):
bookmarks. bookmarks.
""" """
if parent == None: if parent == None:
iloc = [len(self.bookmarks)-1] iloc = [len(self.bookmarks)-1]
elif isinstance(parent, list): elif isinstance(parent, list):
iloc = parent iloc = parent
else: else:
iloc = self.findBookmark(parent) iloc = self.findBookmark(parent)
dest = Bookmark(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826)) dest = Bookmark(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826))
if parent == None: if parent == None:
self.bookmarks.append(dest) self.bookmarks.append(dest)
else: else:
bmparent = self.bookmarks bmparent = self.bookmarks
for i in iloc[:-1]: for i in iloc[:-1]:
bmparent = bmparent[i] bmparent = bmparent[i]
npos = iloc[-1]+1 npos = iloc[-1]+1
if npos < len(bmparent) and isinstance(bmparent[npos], list): if npos < len(bmparent) and isinstance(bmparent[npos], list):
bmparent[npos].append(dest) bmparent[npos].append(dest)
else: else:
bmparent.insert(npos, [dest]) bmparent.insert(npos, [dest])
return dest return dest
def addNamedDestination(self, title, pagenum): def addNamedDestination(self, title, pagenum):
""" """
Add a destination to the output. Add a destination to the output.
@ -512,7 +511,7 @@ class PdfFileMerger(object):
:param str title: Title to use :param str title: Title to use
:param int pagenum: Page number this destination points at. :param int pagenum: Page number this destination points at.
""" """
dest = Destination(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826)) dest = Destination(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826))
self.named_dests.append(dest) self.named_dests.append(dest)
@ -523,12 +522,12 @@ class OutlinesObject(list):
self.tree = tree self.tree = tree
self.pdf = pdf self.pdf = pdf
self.parent = parent self.parent = parent
def remove(self, index):
    """
    Delete the entry at ``index`` from this list and detach it from the tree.

    :param index: position of the outline entry to drop.
    """
    removed = self[index]
    del self[index]
    # Keep the underlying outline tree in sync with the list.
    self.tree.removeChild(removed)
def add(self, title, pagenum): def add(self, title, pagenum):
pageRef = self.pdf.getObject(self.pdf._pages)['/Kids'][pagenum] pageRef = self.pdf.getObject(self.pdf._pages)['/Kids'][pagenum]
action = DictionaryObject() action = DictionaryObject()
@ -547,7 +546,7 @@ class OutlinesObject(list):
self.pdf._addObject(bookmark) self.pdf._addObject(bookmark)
self.tree.addChild(bookmark) self.tree.addChild(bookmark)
def removeAll(self):
    """Detach every child currently reported by ``self.tree.children()``."""
    # Snapshot first: removing children while iterating the live
    # sequence could skip entries.
    snapshot = list(self.tree.children())
    for child in snapshot:
        self.tree.removeChild(child)

View file

@ -8,7 +8,7 @@ see https://github.com/mstamy2/PyPDF2/blob/master/LICENSE
""" """
import re import re
from .utils import Str from .utils import isString
_INT_RE = r"(0|-?[1-9]\d*)" # A decimal int, don't allow "-0". _INT_RE = r"(0|-?[1-9]\d*)" # A decimal int, don't allow "-0".
PAGE_RANGE_RE = "^({int}|({int}?(:{int}?(:{int}?)?)))$".format(int=_INT_RE) PAGE_RANGE_RE = "^({int}|({int}?(:{int}?(:{int}?)?)))$".format(int=_INT_RE)
@ -32,11 +32,11 @@ PAGE_RANGE_HELP = """Remember, page indices start with zero.
::-1 all pages in reverse order. ::-1 all pages in reverse order.
""" """
class PageRange(object): class PageRange(object):
""" """
A slice-like representation of a range of page indices, A slice-like representation of a range of page indices,
i.e. page numbers, only starting at zero. i.e. page numbers, only starting at zero.
The syntax is like what you would put between brackets [ ]. The syntax is like what you would put between brackets [ ].
The slice is one of the few Python types that can't be subclassed, The slice is one of the few Python types that can't be subclassed,
but this class converts to and from slices, and allows similar use. but this class converts to and from slices, and allows similar use.
@ -46,7 +46,7 @@ class PageRange(object):
o str() and repr() allow printing. o str() and repr() allow printing.
o indices(n) is like slice.indices(n). o indices(n) is like slice.indices(n).
""" """
def __init__(self, arg): def __init__(self, arg):
""" """
Initialize with either a slice -- giving the equivalent page range, Initialize with either a slice -- giving the equivalent page range,
@ -67,8 +67,8 @@ class PageRange(object):
if isinstance(arg, PageRange): if isinstance(arg, PageRange):
self._slice = arg.to_slice() self._slice = arg.to_slice()
return return
m = isinstance(arg, Str) and re.match(PAGE_RANGE_RE, arg) m = isString(arg) and re.match(PAGE_RANGE_RE, arg)
if not m: if not m:
raise ParseError(arg) raise ParseError(arg)
elif m.group(2): elif m.group(2):
@ -77,25 +77,25 @@ class PageRange(object):
stop = start + 1 if start != -1 else None stop = start + 1 if start != -1 else None
self._slice = slice(start, stop) self._slice = slice(start, stop)
else: else:
self._slice = slice(*[int(g) if g else None self._slice = slice(*[int(g) if g else None
for g in m.group(4, 6, 8)]) for g in m.group(4, 6, 8)])
# Just formatting this when there is __doc__ for __init__ # Just formatting this when there is __doc__ for __init__
if __init__.__doc__: if __init__.__doc__:
__init__.__doc__ = __init__.__doc__.format(page_range_help=PAGE_RANGE_HELP) __init__.__doc__ = __init__.__doc__.format(page_range_help=PAGE_RANGE_HELP)
@staticmethod @staticmethod
def valid(input): def valid(input):
""" True if input is a valid initializer for a PageRange. """ """ True if input is a valid initializer for a PageRange. """
return isinstance(input, slice) or \ return isinstance(input, slice) or \
isinstance(input, PageRange) or \ isinstance(input, PageRange) or \
(isinstance(input, Str) (isString(input)
and bool(re.match(PAGE_RANGE_RE, input))) and bool(re.match(PAGE_RANGE_RE, input)))
def to_slice(self): def to_slice(self):
""" Return the slice equivalent of this page range. """ """ Return the slice equivalent of this page range. """
return self._slice return self._slice
def __str__(self): def __str__(self):
""" A string like "1:2:3". """ """ A string like "1:2:3". """
s = self._slice s = self._slice
@ -127,7 +127,7 @@ def parse_filename_page_ranges(args):
""" """
Given a list of filenames and page ranges, return a list of Given a list of filenames and page ranges, return a list of
(filename, page_range) pairs. (filename, page_range) pairs.
First arg must be a filename; other ags are filenames, page-range First arg must be a filename; other ags are filenames, page-range
expressions, slice objects, or PageRange objects. expressions, slice objects, or PageRange objects.
A filename not followed by a page range indicates all pages of the file. A filename not followed by a page range indicates all pages of the file.
""" """
@ -146,7 +146,7 @@ def parse_filename_page_ranges(args):
# New filename or end of list--do all of the previous file? # New filename or end of list--do all of the previous file?
if pdf_filename and not did_page_range: if pdf_filename and not did_page_range:
pairs.append( (pdf_filename, PAGE_RANGE_ALL) ) pairs.append( (pdf_filename, PAGE_RANGE_ALL) )
pdf_filename = arg pdf_filename = arg
did_page_range = False did_page_range = False
return pairs return pairs

View file

@ -63,7 +63,7 @@ import warnings
import codecs import codecs
from .generic import * from .generic import *
from .utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList from .utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList
from .utils import Str, b_, u_, ord_, chr_, str_, string_type, formatWarning from .utils import isString, b_, u_, ord_, chr_, str_, formatWarning
if version_info < ( 2, 4 ): if version_info < ( 2, 4 ):
from sets import ImmutableSet as frozenset from sets import ImmutableSet as frozenset
@ -74,6 +74,7 @@ else:
from hashlib import md5 from hashlib import md5
import uuid import uuid
class PdfFileWriter(object): class PdfFileWriter(object):
""" """
This class supports writing PDF files out, given pages produced by another This class supports writing PDF files out, given pages produced by another
@ -228,6 +229,157 @@ class PdfFileWriter(object):
NameObject("/OpenAction"): self._addObject(js) NameObject("/OpenAction"): self._addObject(js)
}) })
def addAttachment(self, fname, fdata):
    """
    Embed a file inside the PDF.

    Unlike a plain overwrite of the catalog's ``/Names`` entry, this keeps
    whatever the catalog already holds there: other name trees are left
    alone, and when an ``/EmbeddedFiles`` node with a flat ``/Names`` array
    already exists the new attachment is appended to it, so the method can
    be called several times.

    :param str fname: The filename to display.
    :param str fdata: The data in the file.

    Reference:
    https://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/PDF32000_2008.pdf
    Section 7.11.3
    """
    # We need 3 entries:
    # * The file's data
    # * The /Filespec entry
    # * The file's name, which goes in the Catalog

    # The entry for the file.  Sample:
    #   8 0 obj
    #   <<
    #    /Length 12
    #    /Type /EmbeddedFile
    #   >>
    #   stream
    #   Hello world!
    #   endstream
    #   endobj
    file_entry = DecodedStreamObject()
    file_entry.setData(fdata)
    file_entry.update({
        NameObject("/Type"): NameObject("/EmbeddedFile")
    })

    # The Filespec entry.  Sample:
    #   7 0 obj
    #   <<
    #    /Type /Filespec
    #    /F (hello.txt)
    #    /EF << /F 8 0 R >>
    #   >>
    efEntry = DictionaryObject()
    efEntry.update({NameObject("/F"): file_entry})

    filespec = DictionaryObject()
    filespec.update({
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): createStringObject(fname),  # Perhaps also try TextStringObject
        NameObject("/EF"): efEntry
    })

    # Then the entry for the root, which references the Filespec.  Sample:
    #   1 0 obj
    #   <<
    #    /Type /Catalog
    #    /Outlines 2 0 R
    #    /Pages 3 0 R
    #    /Names << /EmbeddedFiles << /Names [(hello.txt) 7 0 R] >> >>
    #   >>
    #   endobj
    new_pair = [createStringObject(fname), filespec]

    # Reuse the catalog's /Names dictionary when there is one, so entries
    # already stored in it are not discarded.
    if "/Names" in self._root_object:
        namesDictionary = self._root_object["/Names"]
    else:
        namesDictionary = DictionaryObject()
        self._root_object.update({
            NameObject("/Names"): namesDictionary
        })

    embeddedFilesNode = None
    if "/EmbeddedFiles" in namesDictionary:
        embeddedFilesNode = namesDictionary["/EmbeddedFiles"]

    if embeddedFilesNode is not None and "/Names" in embeddedFilesNode:
        # Flat name-tree leaf already present: add to it.
        # NOTE(review): the pair is appended, not inserted in sorted key
        # order -- confirm whether target viewers require sorted keys.
        embeddedFilesNode["/Names"].extend(new_pair)
    else:
        # No embedded files yet.  A node without a flat /Names array is
        # also replaced here, which is what the previous implementation
        # did unconditionally.
        embeddedFilesNode = DictionaryObject()
        embeddedFilesNode.update({
            NameObject("/Names"): ArrayObject(new_pair)
        })
        namesDictionary.update({
            NameObject("/EmbeddedFiles"): embeddedFilesNode
        })
def appendPagesFromReader(self, reader, after_page_append=None):
    """
    Append every page of ``reader`` to this writer, in order.

    :param reader: object providing ``getNumPages()`` and ``getPage(n)``
        (a PdfFileReader) whose pages are copied into this writer.
    :param after_page_append: optional callable invoked once per appended
        page with the writer's own page (as returned by
        ``self.getPage``).  Ignored when it is not callable.
    """
    page_total = reader.getNumPages()
    # Index in this writer at which the first copied page will land.
    first_new_index = self.getNumPages()
    notify = callable(after_page_append)

    offset = 0
    while offset < page_total:
        self.addPage(reader.getPage(offset))
        # Fetch the page back from the writer so the callback receives
        # the writer-side page.
        appended = self.getPage(first_new_index + offset)
        if notify:
            after_page_append(appended)
        offset += 1
def updatePageFormFieldValues(self, page, fields):
    '''
    Update the form field values for a given page from a fields dictionary.
    Copy field texts and values from fields to page.

    A page without an ``/Annots`` entry has no form fields on it and is
    left unchanged.

    :param page: Page reference from PDF writer where the annotations
        and field data will be updated.
    :param fields: a Python dictionary of field names (/T) and text
        values (/V)
    '''
    # Pages without annotations used to raise KeyError here.
    if '/Annots' not in page:
        return

    # Resolve the annotation array once instead of on every iteration.
    annotations = page['/Annots']
    for annot_ref in annotations:
        writer_annot = annot_ref.getObject()
        for field in fields:
            if writer_annot.get('/T') == field:
                writer_annot.update({
                    NameObject("/V"): TextStringObject(fields[field])
                })
def cloneReaderDocumentRoot(self, reader):
    '''
    Make the reader's document root this writer's root object.

    The object found at ``reader.trailer['/Root']`` is assigned as-is;
    no copy is made here.

    :param reader: PdfFileReader whose document root should be used.
    '''
    source_root = reader.trailer['/Root']
    self._root_object = source_root
def cloneDocumentFromReader(self, reader, after_page_append=None):
    '''
    Clone a document from a PDF file reader into this writer.

    Two steps, in this order: take over the reader's document root
    (``cloneReaderDocumentRoot``), then append all of its pages
    (``appendPagesFromReader``).

    :param reader: PDF file reader instance from which the clone
        should be created.
    :param after_page_append: optional callback forwarded unchanged to
        ``appendPagesFromReader``, which calls it with each page just
        appended to the document.
    '''
    # Root first, so the pages are appended after the root swap.
    self.cloneReaderDocumentRoot(reader)
    self.appendPagesFromReader(reader, after_page_append)
def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True): def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True):
""" """
Encrypt this PDF file with the PDF Standard encryption handler. Encrypt this PDF file with the PDF Standard encryption handler.
@ -516,7 +668,6 @@ class PdfFileWriter(object):
return bookmarkRef return bookmarkRef
def addBookmark(self, title, pagenum, parent=None, color=None, bold=False, italic=False, fit='/Fit', *args): def addBookmark(self, title, pagenum, parent=None, color=None, bold=False, italic=False, fit='/Fit', *args):
""" """
Add a bookmark to this PDF file. Add a bookmark to this PDF file.
@ -553,7 +704,6 @@ class PdfFileWriter(object):
if parent == None: if parent == None:
parent = outlineRef parent = outlineRef
bookmark = TreeObject() bookmark = TreeObject()
bookmark.update({ bookmark.update({
@ -759,7 +909,7 @@ class PdfFileWriter(object):
else: else:
borderArr = [NumberObject(0)] * 3 borderArr = [NumberObject(0)] * 3
if isinstance(rect, Str): if isString(rect):
rect = NameObject(rect) rect = NameObject(rect)
elif isinstance(rect, RectangleObject): elif isinstance(rect, RectangleObject):
pass pass
@ -871,6 +1021,7 @@ class PdfFileWriter(object):
"""Read and write property accessing the :meth:`getPageMode()<PdfFileWriter.getPageMode>` """Read and write property accessing the :meth:`getPageMode()<PdfFileWriter.getPageMode>`
and :meth:`setPageMode()<PdfFileWriter.setPageMode>` methods.""" and :meth:`setPageMode()<PdfFileWriter.setPageMode>` methods."""
class PdfFileReader(object): class PdfFileReader(object):
""" """
Initializes a PdfFileReader object. This operation can take some time, as Initializes a PdfFileReader object. This operation can take some time, as
@ -904,9 +1055,10 @@ class PdfFileReader(object):
self.flattenedPages = None self.flattenedPages = None
self.resolvedObjects = {} self.resolvedObjects = {}
self.xrefIndex = 0 self.xrefIndex = 0
self._pageId2Num = None # map page IndirectRef number to Page Number
if hasattr(stream, 'mode') and 'b' not in stream.mode: if hasattr(stream, 'mode') and 'b' not in stream.mode:
warnings.warn("PdfFileReader stream/file object is not in binary mode. It may not be read correctly.", utils.PdfReadWarning) warnings.warn("PdfFileReader stream/file object is not in binary mode. It may not be read correctly.", utils.PdfReadWarning)
if type(stream) in (string_type, str): if isString(stream):
fileobj = open(stream, 'rb') fileobj = open(stream, 'rb')
stream = BytesIO(b_(fileobj.read())) stream = BytesIO(b_(fileobj.read()))
fileobj.close() fileobj.close()
@ -973,6 +1125,7 @@ class PdfFileReader(object):
if self.isEncrypted: if self.isEncrypted:
try: try:
self._override_encryption = True self._override_encryption = True
self.decrypt('')
return self.trailer["/Root"]["/Pages"]["/Count"] return self.trailer["/Root"]["/Pages"]["/Count"]
except: except:
raise utils.PdfReadError("File has not been decrypted") raise utils.PdfReadError("File has not been decrypted")
@ -1160,7 +1313,14 @@ class PdfFileReader(object):
# get the outline dictionary and named destinations # get the outline dictionary and named destinations
if "/Outlines" in catalog: if "/Outlines" in catalog:
lines = catalog["/Outlines"] try:
lines = catalog["/Outlines"]
except utils.PdfReadError:
# this occurs if the /Outlines object reference is incorrect
# for an example of such a file, see https://unglueit-files.s3.amazonaws.com/ebf/7552c42e9280b4476e59e77acc0bc812.pdf
# so continue to load the file without the Bookmarks
return outlines
if "/First" in lines: if "/First" in lines:
node = lines["/First"] node = lines["/First"]
self._namedDests = self.getNamedDestinations() self._namedDests = self.getNamedDestinations()
@ -1187,6 +1347,49 @@ class PdfFileReader(object):
return outlines return outlines
def _getPageNumberByIndirect(self, indirectRef):
    """
    Map an indirect reference (or a bare object id) to a page number.

    The id-to-page-number table ``self._pageId2Num`` is built from
    ``self.pages`` on first use and reused afterwards.

    :param indirectRef: an ``int`` object id, or an object with an
        ``idnum`` attribute.
    :return: the zero-based page number, or -1 if the id is unknown.
    :rtype: int
    """
    if self._pageId2Num is None:
        # Lazily build the lookup table on the first call.
        self._pageId2Num = dict(
            (pg.indirectRef.idnum, number)
            for number, pg in enumerate(self.pages)
        )

    idnum = indirectRef if isinstance(indirectRef, int) else indirectRef.idnum
    return self._pageId2Num.get(idnum, -1)
def getPageNumber(self, page):
    """
    Retrieve page number of a given PageObject

    The lookup goes through the page's ``indirectRef``.

    :param PageObject page: The page to get page number. Should be
        an instance of :class:`PageObject<PyPDF2.pdf.PageObject>`
    :return: the page number or -1 if page not found
    :rtype: int
    """
    return self._getPageNumberByIndirect(page.indirectRef)
def getDestinationPageNumber(self, destination):
    """
    Retrieve page number of a given Destination object

    The lookup goes through the destination's ``page`` attribute.

    :param Destination destination: The destination to get page number.
         Should be an instance of
         :class:`Destination<PyPDF2.pdf.Destination>`
    :return: the page number or -1 if page not found
    :rtype: int
    """
    return self._getPageNumberByIndirect(destination.page)
def _buildDestination(self, title, array): def _buildDestination(self, title, array):
page, typ = array[0:2] page, typ = array[0:2]
array = array[2:] array = array[2:]
@ -1210,7 +1413,7 @@ class PdfFileReader(object):
if dest: if dest:
if isinstance(dest, ArrayObject): if isinstance(dest, ArrayObject):
outline = self._buildDestination(title, dest) outline = self._buildDestination(title, dest)
elif isinstance(dest, Str) and dest in self._namedDests: elif isString(dest) and dest in self._namedDests:
outline = self._namedDests[dest] outline = self._namedDests[dest]
outline[NameObject("/Title")] = title outline[NameObject("/Title")] = title
else: else:
@ -1310,6 +1513,8 @@ class PdfFileReader(object):
assert idx < objStm['/N'] assert idx < objStm['/N']
streamData = BytesIO(b_(objStm.getData())) streamData = BytesIO(b_(objStm.getData()))
for i in range(objStm['/N']): for i in range(objStm['/N']):
readNonWhitespace(streamData)
streamData.seek(-1, 1)
objnum = NumberObject.readFromStream(streamData) objnum = NumberObject.readFromStream(streamData)
readNonWhitespace(streamData) readNonWhitespace(streamData)
streamData.seek(-1, 1) streamData.seek(-1, 1)
@ -1347,7 +1552,6 @@ class PdfFileReader(object):
if self.strict: raise utils.PdfReadError("This is a fatal error in strict mode.") if self.strict: raise utils.PdfReadError("This is a fatal error in strict mode.")
return NullObject() return NullObject()
def getObject(self, indirectReference): def getObject(self, indirectReference):
debug = False debug = False
if debug: print(("looking at:", indirectReference.idnum, indirectReference.generation)) if debug: print(("looking at:", indirectReference.idnum, indirectReference.generation))
@ -1470,7 +1674,7 @@ class PdfFileReader(object):
startxref = int(line) startxref = int(line)
except ValueError: except ValueError:
# 'startxref' may be on the same line as the location # 'startxref' may be on the same line as the location
if not line.startswith("startxref"): if not line.startswith(b_("startxref")):
raise utils.PdfReadError("startxref not found") raise utils.PdfReadError("startxref not found")
startxref = int(line[9:].strip()) startxref = int(line[9:].strip())
warnings.warn("startxref on same line as offset") warnings.warn("startxref on same line as offset")
@ -1580,6 +1784,7 @@ class PdfFileReader(object):
assert len(entrySizes) >= 3 assert len(entrySizes) >= 3
if self.strict and len(entrySizes) > 3: if self.strict and len(entrySizes) > 3:
raise utils.PdfReadError("Too many entry sizes: %s" %entrySizes) raise utils.PdfReadError("Too many entry sizes: %s" %entrySizes)
def getEntry(i): def getEntry(i):
# Reads the correct number of bytes for each entry. See the # Reads the correct number of bytes for each entry. See the
# discussion of the W parameter in PDF spec table 17. # discussion of the W parameter in PDF spec table 17.
@ -1664,8 +1869,7 @@ class PdfFileReader(object):
if found: if found:
continue continue
# no xref table found at specified location # no xref table found at specified location
assert False raise utils.PdfReadError("Could not find xref table at specified location")
break
#if not zero-indexed, verify that the table is correct; change it if necessary #if not zero-indexed, verify that the table is correct; change it if necessary
if self.xrefIndex and not self.strict: if self.xrefIndex and not self.strict:
loc = stream.tell() loc = stream.tell()
@ -1683,7 +1887,6 @@ class PdfFileReader(object):
#if not, then either it's just plain wrong, or the non-zero-index is actually correct #if not, then either it's just plain wrong, or the non-zero-index is actually correct
stream.seek(loc, 0) #return to where it was stream.seek(loc, 0) #return to where it was
def _zeroXref(self, generation): def _zeroXref(self, generation):
self.xref[generation] = dict( (k-self.xrefIndex, v) for (k, v) in list(self.xref[generation].items()) ) self.xref[generation] = dict( (k-self.xrefIndex, v) for (k, v) in list(self.xref[generation].items()) )
@ -1700,8 +1903,13 @@ class PdfFileReader(object):
if debug: print(">>readNextEndLine") if debug: print(">>readNextEndLine")
line = b_("") line = b_("")
while True: while True:
# Prevent infinite loops in malformed PDFs
if stream.tell() == 0:
raise utils.PdfReadError("Could not read malformed PDF file")
x = stream.read(1) x = stream.read(1)
if debug: print((" x:", x, "%x"%ord(x))) if debug: print((" x:", x, "%x"%ord(x)))
if stream.tell() < 2:
raise utils.PdfReadError("EOL marker not found")
stream.seek(-2, 1) stream.seek(-2, 1)
if x == b_('\n') or x == b_('\r'): ## \n = LF; \r = CR if x == b_('\n') or x == b_('\r'): ## \n = LF; \r = CR
crlf = False crlf = False
@ -1713,6 +1921,8 @@ class PdfFileReader(object):
if x == b_('\n') or x == b_('\r'): # account for CR+LF if x == b_('\n') or x == b_('\r'): # account for CR+LF
stream.seek(-1, 1) stream.seek(-1, 1)
crlf = True crlf = True
if stream.tell() < 2:
raise utils.PdfReadError("EOL marker not found")
stream.seek(-2, 1) stream.seek(-2, 1)
stream.seek(2 if crlf else 1, 1) #if using CR+LF, go back 2 bytes, else 1 stream.seek(2 if crlf else 1, 1) #if using CR+LF, go back 2 bytes, else 1
break break
@ -1827,14 +2037,17 @@ def getRectangle(self, name, defaults):
setRectangle(self, name, retval) setRectangle(self, name, retval)
return retval return retval
def setRectangle(self, name, value):
    """
    Store ``value`` in ``self`` under ``name``.

    :param name: key to store under; wrapped in ``NameObject`` unless it
        already is one.
    :param value: the object to store.
    """
    key = name if isinstance(name, NameObject) else NameObject(name)
    self[key] = value
def deleteRectangle(self, name):
    """
    Remove the entry stored under ``name`` from ``self``.

    :param name: key of the entry to delete; a missing key raises
        whatever ``self``'s item deletion raises.
    """
    del self[name]
def createRectangleAccessor(name, fallback): def createRectangleAccessor(name, fallback):
return \ return \
property( property(
@ -1843,6 +2056,7 @@ def createRectangleAccessor(name, fallback):
lambda self: deleteRectangle(self, name) lambda self: deleteRectangle(self, name)
) )
class PageObject(DictionaryObject): class PageObject(DictionaryObject):
""" """
This class represents a single page within a PDF file. Typically this This class represents a single page within a PDF file. Typically this
@ -2374,6 +2588,7 @@ class PageObject(DictionaryObject):
for i in operands[0]: for i in operands[0]:
if isinstance(i, TextStringObject): if isinstance(i, TextStringObject):
text += i text += i
text += "\n"
return text return text
mediaBox = createRectangleAccessor("/MediaBox", ()) mediaBox = createRectangleAccessor("/MediaBox", ())
@ -2412,6 +2627,7 @@ class PageObject(DictionaryObject):
page's creator. page's creator.
""" """
class ContentStream(DecodedStreamObject): class ContentStream(DecodedStreamObject):
def __init__(self, stream, pdf): def __init__(self, stream, pdf):
self.pdf = pdf self.pdf = pdf
@ -2437,25 +2653,25 @@ class ContentStream(DecodedStreamObject):
if peek == b_('') or ord_(peek) == 0: if peek == b_('') or ord_(peek) == 0:
break break
stream.seek(-1, 1) stream.seek(-1, 1)
if peek.isalpha() or peek == "'" or peek == '"': if peek.isalpha() or peek == b_("'") or peek == b_('"'):
operator = utils.readUntilRegex(stream, operator = utils.readUntilRegex(stream,
NameObject.delimiterPattern, True) NameObject.delimiterPattern, True)
if operator == "BI": if operator == b_("BI"):
# begin inline image - a completely different parsing # begin inline image - a completely different parsing
# mechanism is required, of course... thanks buddy... # mechanism is required, of course... thanks buddy...
assert operands == [] assert operands == []
ii = self._readInlineImage(stream) ii = self._readInlineImage(stream)
self.operations.append((ii, "INLINE IMAGE")) self.operations.append((ii, b_("INLINE IMAGE")))
else: else:
self.operations.append((operands, operator)) self.operations.append((operands, operator))
operands = [] operands = []
elif peek == '%': elif peek == b_('%'):
# If we encounter a comment in the content stream, we have to # If we encounter a comment in the content stream, we have to
# handle it here. Typically, readObject will handle # handle it here. Typically, readObject will handle
# encountering a comment -- but readObject assumes that # encountering a comment -- but readObject assumes that
# following the comment must be the object we're trying to # following the comment must be the object we're trying to
# read. In this case, it could be an operator instead. # read. In this case, it could be an operator instead.
while peek not in ('\r', '\n'): while peek not in (b_('\r'), b_('\n')):
peek = stream.read(1) peek = stream.read(1)
else: else:
operands.append(readObject(stream, None)) operands.append(readObject(stream, None))
@ -2467,7 +2683,7 @@ class ContentStream(DecodedStreamObject):
while True: while True:
tok = readNonWhitespace(stream) tok = readNonWhitespace(stream)
stream.seek(-1, 1) stream.seek(-1, 1)
if tok == "I": if tok == b_("I"):
# "ID" - begin of image data # "ID" - begin of image data
break break
key = readObject(stream, self.pdf) key = readObject(stream, self.pdf)
@ -2477,28 +2693,32 @@ class ContentStream(DecodedStreamObject):
settings[key] = value settings[key] = value
# left at beginning of ID # left at beginning of ID
tmp = stream.read(3) tmp = stream.read(3)
assert tmp[:2] == "ID" assert tmp[:2] == b_("ID")
data = "" data = b_("")
while True: while True:
# Read the inline image, while checking for EI (End Image) operator.
tok = stream.read(1) tok = stream.read(1)
if tok == "E": if tok == b_("E"):
# Check for End Image # Check for End Image
next1 = stream.read(1) tok2 = stream.read(1)
if next1 == "I": if tok2 == b_("I"):
next2 = readNonWhitespace(stream) # Sometimes that data will contain EI, so check for the Q operator.
if next2 == 'Q': tok3 = stream.read(1)
info = tok + tok2
while tok3 in utils.WHITESPACES:
info += tok3
tok3 = stream.read(1)
if tok3 == b_("Q"):
stream.seek(-1, 1) stream.seek(-1, 1)
break break
else: else:
stream.seek(-2, 1) stream.seek(-1,1)
data += tok data += info
else: else:
stream.seek(-1, 1) stream.seek(-1, 1)
data += tok data += tok
else: else:
data += tok data += tok
x = readNonWhitespace(stream)
stream.seek(-1, 1)
return {"settings": settings, "data": data} return {"settings": settings, "data": data}
def _getData(self): def _getData(self):
@ -2525,6 +2745,7 @@ class ContentStream(DecodedStreamObject):
_data = property(_getData, _setData) _data = property(_getData, _setData)
class DocumentInformation(DictionaryObject): class DocumentInformation(DictionaryObject):
""" """
A class representing the basic document metadata provided in a PDF File. A class representing the basic document metadata provided in a PDF File.
@ -2588,6 +2809,7 @@ class DocumentInformation(DictionaryObject):
producer_raw = property(lambda self: self.get("/Producer")) producer_raw = property(lambda self: self.get("/Producer"))
"""The "raw" version of producer; can return a ``ByteStringObject``.""" """The "raw" version of producer; can return a ``ByteStringObject``."""
def convertToInt(d, size): def convertToInt(d, size):
if size > 8: if size > 8:
raise utils.PdfReadError("invalid size in convertToInt") raise utils.PdfReadError("invalid size in convertToInt")
@ -2600,6 +2822,7 @@ _encryption_padding = b_('\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56') + \
b_('\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c') + \ b_('\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c') + \
b_('\xa9\xfe\x64\x53\x69\x7a') b_('\xa9\xfe\x64\x53\x69\x7a')
# Implementation of algorithm 3.2 of the PDF standard security handler, # Implementation of algorithm 3.2 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference. # section 3.5.2 of the PDF 1.6 reference.
def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True): def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True):
@ -2643,6 +2866,7 @@ def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr
# entry. # entry.
return md5_hash[:keylen] return md5_hash[:keylen]
# Implementation of algorithm 3.3 of the PDF standard security handler, # Implementation of algorithm 3.3 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference. # section 3.5.2 of the PDF 1.6 reference.
def _alg33(owner_pwd, user_pwd, rev, keylen): def _alg33(owner_pwd, user_pwd, rev, keylen):
@ -2670,6 +2894,7 @@ def _alg33(owner_pwd, user_pwd, rev, keylen):
# the /O entry in the encryption dictionary. # the /O entry in the encryption dictionary.
return val return val
# Steps 1-4 of algorithm 3.3 # Steps 1-4 of algorithm 3.3
def _alg33_1(password, rev, keylen): def _alg33_1(password, rev, keylen):
# 1. Pad or truncate the owner password string as described in step 1 of # 1. Pad or truncate the owner password string as described in step 1 of
@ -2692,6 +2917,7 @@ def _alg33_1(password, rev, keylen):
key = md5_hash[:keylen] key = md5_hash[:keylen]
return key return key
# Implementation of algorithm 3.4 of the PDF standard security handler, # Implementation of algorithm 3.4 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference. # section 3.5.2 of the PDF 1.6 reference.
def _alg34(password, owner_entry, p_entry, id1_entry): def _alg34(password, owner_entry, p_entry, id1_entry):
@ -2706,6 +2932,7 @@ def _alg34(password, owner_entry, p_entry, id1_entry):
# encryption dictionary. # encryption dictionary.
return U, key return U, key
# Implementation of algorithm 3.4 of the PDF standard security handler, # Implementation of algorithm 3.4 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference. # section 3.5.2 of the PDF 1.6 reference.
def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt): def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt):

View file

@ -33,25 +33,35 @@ __author_email__ = "biziqe@mathieu.fenniak.net"
import sys import sys
# "Str" maintains compatibility with Python 2.x.
# The next line is obfuscated like this so 2to3 won't change it.
try: try:
import __builtin__ as builtins import __builtin__ as builtins
except ImportError: # Py3 except ImportError: # Py3
import builtins import builtins
if sys.version_info[0] < 3: xrange_fn = getattr(builtins, "xrange", range)
string_type = unicode _basestring = getattr(builtins, "basestring", str)
bytes_type = str
int_types = (int, long)
else:
string_type = str
bytes_type = bytes
int_types = (int,)
Xrange = getattr(builtins, "xrange", range) bytes_type = type(bytes()) # Works the same in Python 2.X and 3.X
Str = getattr(builtins, "basestring", str) string_type = getattr(builtins, "unicode", str)
int_types = (int, long) if sys.version_info[0] < 3 else (int,)
# Make basic type tests more consistent
def isString(s):
"""Test if arg is a string. Compatible with Python 2 and 3."""
return isinstance(s, _basestring)
def isInt(n):
"""Test if arg is an int. Compatible with Python 2 and 3."""
return isinstance(n, int_types)
def isBytes(b):
"""Test if arg is a bytes instance. Compatible with Python 2 and 3."""
return isinstance(b, bytes_type)
#custom implementation of warnings.formatwarning #custom implementation of warnings.formatwarning
@ -59,6 +69,7 @@ def formatWarning(message, category, filename, lineno, line=None):
file = filename.replace("/", "\\").rsplit("\\", 1)[1] # find the file name file = filename.replace("/", "\\").rsplit("\\", 1)[1] # find the file name
return "%s: %s [%s:%s]\n" % (category.__name__, message, file, lineno) return "%s: %s [%s:%s]\n" % (category.__name__, message, file, lineno)
def readUntilWhitespace(stream, maxchars=None): def readUntilWhitespace(stream, maxchars=None):
""" """
Reads non-whitespace characters and returns them. Reads non-whitespace characters and returns them.
@ -74,6 +85,7 @@ def readUntilWhitespace(stream, maxchars=None):
break break
return txt return txt
def readNonWhitespace(stream): def readNonWhitespace(stream):
""" """
Finds and reads the next non-whitespace character (ignores whitespace). Finds and reads the next non-whitespace character (ignores whitespace).
@ -83,6 +95,7 @@ def readNonWhitespace(stream):
tok = stream.read(1) tok = stream.read(1)
return tok return tok
def skipOverWhitespace(stream): def skipOverWhitespace(stream):
""" """
Similar to readNonWhitespace, but returns a Boolean if more than Similar to readNonWhitespace, but returns a Boolean if more than
@ -95,6 +108,7 @@ def skipOverWhitespace(stream):
cnt+=1 cnt+=1
return (cnt > 1) return (cnt > 1)
def skipOverComment(stream): def skipOverComment(stream):
tok = stream.read(1) tok = stream.read(1)
stream.seek(-1, 1) stream.seek(-1, 1)
@ -102,6 +116,7 @@ def skipOverComment(stream):
while tok not in (b_('\n'), b_('\r')): while tok not in (b_('\n'), b_('\r')):
tok = stream.read(1) tok = stream.read(1)
def readUntilRegex(stream, regex, ignore_eof=False): def readUntilRegex(stream, regex, ignore_eof=False):
""" """
Reads until the regular expression pattern matched (ignore the match) Reads until the regular expression pattern matched (ignore the match)
@ -125,6 +140,7 @@ def readUntilRegex(stream, regex, ignore_eof=False):
name += tok name += tok
return name return name
class ConvertFunctionsToVirtualList(object): class ConvertFunctionsToVirtualList(object):
def __init__(self, lengthFunction, getFunction): def __init__(self, lengthFunction, getFunction):
self.lengthFunction = lengthFunction self.lengthFunction = lengthFunction
@ -135,10 +151,10 @@ class ConvertFunctionsToVirtualList(object):
def __getitem__(self, index): def __getitem__(self, index):
if isinstance(index, slice): if isinstance(index, slice):
indices = Xrange(*index.indices(len(self))) indices = xrange_fn(*index.indices(len(self)))
cls = type(self) cls = type(self)
return cls(indices.__len__, lambda idx: self[indices[idx]]) return cls(indices.__len__, lambda idx: self[indices[idx]])
if not isinstance(index, int_types): if not isInt(index):
raise TypeError("sequence indices must be integers") raise TypeError("sequence indices must be integers")
len_self = len(self) len_self = len(self)
if index < 0: if index < 0:
@ -148,6 +164,7 @@ class ConvertFunctionsToVirtualList(object):
raise IndexError("sequence index out of range") raise IndexError("sequence index out of range")
return self.getFunction(index) return self.getFunction(index)
def RC4_encrypt(key, plaintext): def RC4_encrypt(key, plaintext):
S = [i for i in range(256)] S = [i for i in range(256)]
j = 0 j = 0
@ -164,12 +181,14 @@ def RC4_encrypt(key, plaintext):
retval += b_(chr(ord_(plaintext[x]) ^ t)) retval += b_(chr(ord_(plaintext[x]) ^ t))
return retval return retval
def matrixMultiply(a, b): def matrixMultiply(a, b):
return [[sum([float(i)*float(j) return [[sum([float(i)*float(j)
for i, j in zip(row, col)] for i, j in zip(row, col)]
) for col in zip(*b)] ) for col in zip(*b)]
for row in a] for row in a]
def markLocation(stream): def markLocation(stream):
"""Creates text file showing current location in context.""" """Creates text file showing current location in context."""
# Mainly for debugging # Mainly for debugging
@ -182,18 +201,23 @@ def markLocation(stream):
outputDoc.close() outputDoc.close()
stream.seek(-RADIUS, 1) stream.seek(-RADIUS, 1)
class PyPdfError(Exception): class PyPdfError(Exception):
pass pass
class PdfReadError(PyPdfError): class PdfReadError(PyPdfError):
pass pass
class PageSizeNotDefinedError(PyPdfError): class PageSizeNotDefinedError(PyPdfError):
pass pass
class PdfReadWarning(UserWarning): class PdfReadWarning(UserWarning):
pass pass
class PdfStreamError(PdfReadError): class PdfStreamError(PdfReadError):
pass pass
@ -203,6 +227,7 @@ if sys.version_info[0] < 3:
return s return s
else: else:
B_CACHE = {} B_CACHE = {}
def b_(s): def b_(s):
bc = B_CACHE bc = B_CACHE
if s in bc: if s in bc:
@ -214,6 +239,8 @@ else:
if len(s) < 2: if len(s) < 2:
bc[s] = r bc[s] = r
return r return r
def u_(s): def u_(s):
if sys.version_info[0] < 3: if sys.version_info[0] < 3:
return unicode(s, 'unicode_escape') return unicode(s, 'unicode_escape')
@ -230,24 +257,28 @@ def str_(b):
else: else:
return b return b
def ord_(b): def ord_(b):
if sys.version_info[0] < 3 or type(b) == str: if sys.version_info[0] < 3 or type(b) == str:
return ord(b) return ord(b)
else: else:
return b return b
def chr_(c): def chr_(c):
if sys.version_info[0] < 3: if sys.version_info[0] < 3:
return c return c
else: else:
return chr(c) return chr(c)
def barray(b): def barray(b):
if sys.version_info[0] < 3: if sys.version_info[0] < 3:
return b return b
else: else:
return bytearray(b) return bytearray(b)
def hexencode(b): def hexencode(b):
if sys.version_info[0] < 3: if sys.version_info[0] < 3:
return b.encode('hex') return b.encode('hex')
@ -256,6 +287,7 @@ def hexencode(b):
coder = codecs.getencoder('hex_codec') coder = codecs.getencoder('hex_codec')
return coder(b)[0] return coder(b)[0]
def hexStr(num): def hexStr(num):
return hex(num).replace('L', '') return hex(num).replace('L', '')

View file

@ -50,6 +50,7 @@ iso8601 = re.compile("""
)? )?
""", re.VERBOSE) """, re.VERBOSE)
class XmpInformation(PdfObject): class XmpInformation(PdfObject):
""" """
An object that represents Adobe XMP metadata. An object that represents Adobe XMP metadata.
@ -355,5 +356,3 @@ class XmpInformation(PdfObject):
:return: a dictionary of key/value items for custom metadata properties. :return: a dictionary of key/value items for custom metadata properties.
:rtype: dict :rtype: dict
""" """