# vim: sw=4:expandtab:foldmethod=marker
#
# Copyright (c) 2006, Mathieu Fenniak
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.


"""
Implementation of generic PDF objects (dictionary, number, string, and so on)
"""
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"

import re
from .utils import readNonWhitespace, RC4_encrypt, skipOverComment
from .utils import b_, u_, chr_, ord_
from .utils import PdfStreamError
import warnings
from . import filters
from . import utils
import decimal
import codecs
import sys
#import debugging

# First byte of each object kind, indexed as used by readObject():
# 0 name, 1 hex string / dictionary, 2 array, 3-4 boolean, 5 string,
# 6 null, 7 comment.
ObjectPrefix = b_('/<[tf(n%')
NumberSigns = b_('+-')
IndirectPattern = re.compile(b_(r"(\d+)\s+(\d+)\s+R[^a-zA-Z]"))


def readObject(stream, pdf):
    """
    Read one PDF object from ``stream`` and return it.

    The kind of object is chosen from the first byte (see ``ObjectPrefix``);
    anything not listed there is parsed as a number or, when the next bytes
    match ``IndirectPattern``, as an indirect reference.  Comments are skipped
    and the object that follows them is returned.

    Raises PdfStreamError if a comment runs to the end of the stream.
    """
    tok = stream.read(1)
    stream.seek(-1, 1)  # reset to start
    idx = ObjectPrefix.find(tok)
    if idx == 0:
        # name object
        return NameObject.readFromStream(stream, pdf)
    elif idx == 1:
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        stream.seek(-2, 1)  # reset to start
        if peek == b_('<<'):
            return DictionaryObject.readFromStream(stream, pdf)
        else:
            return readHexStringFromStream(stream)
    elif idx == 2:
        # array object
        return ArrayObject.readFromStream(stream, pdf)
    elif idx == 3 or idx == 4:
        # boolean object
        return BooleanObject.readFromStream(stream)
    elif idx == 5:
        # string object
        return readStringFromStream(stream)
    elif idx == 6:
        # null object
        return NullObject.readFromStream(stream)
    elif idx == 7:
        # comment
        while tok not in (b_('\r'), b_('\n')):
            tok = stream.read(1)
            if not tok:
                # Without this check an unterminated comment spins forever,
                # because read() keeps returning an empty byte string.
                raise PdfStreamError("Stream has ended unexpectedly")
        tok = readNonWhitespace(stream)
        stream.seek(-1, 1)
        return readObject(stream, pdf)
    else:
        # number object OR indirect reference
        if tok in NumberSigns:
            # number
            return NumberObject.readFromStream(stream)
        peek = stream.read(20)
        stream.seek(-len(peek), 1)  # reset to start
        if IndirectPattern.match(peek) is not None:
            return IndirectObject.readFromStream(stream, pdf)
        else:
            return NumberObject.readFromStream(stream)


class PdfObject(object):
    """Base class of every PDF object type defined in this module."""

    def getObject(self):
        """Resolves indirect references."""
        return self


class NullObject(PdfObject):
    """The PDF ``null`` object."""

    def writeToStream(self, stream, encryption_key):
        """Write the literal ``null``; ``encryption_key`` is not used."""
        stream.write(b_("null"))

    @staticmethod
    def readFromStream(stream):
        """Consume four bytes and return a NullObject; raise PdfReadError
        if they are not ``null``."""
        nulltxt = stream.read(4)
        if nulltxt != b_("null"):
            raise utils.PdfReadError("Could not read Null object")
        return NullObject()


class BooleanObject(PdfObject):
    """A PDF boolean; the Python value is kept in ``self.value``."""

    def __init__(self, value):
        self.value = value

    def writeToStream(self, stream, encryption_key):
        """Write ``true`` or ``false`` depending on ``self.value``."""
        if self.value:
            stream.write(b_("true"))
        else:
            stream.write(b_("false"))

    @staticmethod
    def readFromStream(stream):
        """Read ``true`` or ``false``; raise PdfReadError on anything else."""
        word = stream.read(4)
        if word == b_("true"):
            return BooleanObject(True)
        elif word == b_("fals"):
            # consume the trailing 'e'
            stream.read(1)
            return BooleanObject(False)
        else:
            raise utils.PdfReadError('Could not read Boolean object')


class ArrayObject(list, PdfObject):
    """A PDF array, stored as a Python list of PDF objects."""

    def writeToStream(self, stream, encryption_key):
        """Write ``[ item item ... ]``, delegating to each item."""
        stream.write(b_("["))
        for data in self:
            stream.write(b_(" "))
            data.writeToStream(stream, encryption_key)
        stream.write(b_(" ]"))

    @staticmethod
    def readFromStream(stream, pdf):
        """
        Read an array positioned at its opening ``[``.

        Raises PdfReadError if the stream does not start with ``[`` and
        PdfStreamError if the stream ends before the closing ``]``.
        """
        arr = ArrayObject()
        tmp = stream.read(1)
        if tmp != b_("["):
            raise utils.PdfReadError("Could not read array")
        while True:
            # skip leading whitespace
            tok = stream.read(1)
            while tok.isspace():
                tok = stream.read(1)
            if not tok:
                # At EOF the seek(-1) below would step back onto the last
                # byte and re-parse it forever.
                raise PdfStreamError("Stream has ended unexpectedly")
            # check for array ending
            if tok == b_("]"):
                break
            stream.seek(-1, 1)
            # read and append obj
            arr.append(readObject(stream, pdf))
        return arr


class IndirectObject(PdfObject):
    """A reference ``<idnum> <generation> R`` to an object owned by ``pdf``."""

    def __init__(self, idnum, generation, pdf):
        self.idnum = idnum
        self.generation = generation
        self.pdf = pdf

    def getObject(self):
        """Ask ``self.pdf`` for the referenced object and resolve it fully."""
        return self.pdf.getObject(self).getObject()

    def __repr__(self):
        return "IndirectObject(%r, %r)" % (self.idnum, self.generation)

    def __eq__(self, other):
        return (
            isinstance(other, IndirectObject) and
            self.idnum == other.idnum and
            self.generation == other.generation and
            self.pdf is other.pdf
            )

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # Defining __eq__ alone makes the class unhashable on Python 3.
        # Hash the same fields __eq__ compares (pdf is compared by identity).
        return hash((self.idnum, self.generation, id(self.pdf)))

    def writeToStream(self, stream, encryption_key):
        """Write ``<idnum> <generation> R``."""
        stream.write(b_("%s %s R" % (self.idnum, self.generation)))

    @staticmethod
    def readFromStream(stream, pdf):
        """
        Read ``<idnum> <generation> R`` and return an IndirectObject.

        Raises PdfStreamError on premature end of stream and PdfReadError
        if the ``R`` keyword is missing.
        """
        idnum = b_("")
        while True:
            tok = stream.read(1)
            if not tok:
                # stream has truncated prematurely
                raise PdfStreamError("Stream has ended unexpectedly")
            if tok.isspace():
                break
            idnum += tok
        generation = b_("")
        while True:
            tok = stream.read(1)
            if not tok:
                # stream has truncated prematurely
                raise PdfStreamError("Stream has ended unexpectedly")
            if tok.isspace():
                if not generation:
                    # still skipping the whitespace before the number
                    continue
                break
            generation += tok
        r = readNonWhitespace(stream)
        if r != b_("R"):
            raise utils.PdfReadError("Error reading indirect object reference at byte %s" % utils.hexStr(stream.tell()))
        return IndirectObject(int(idnum), int(generation), pdf)


class FloatObject(decimal.Decimal, PdfObject):
    """A PDF real number, stored as a ``decimal.Decimal``."""

    def __new__(cls, value="0", context=None):
        try:
            return decimal.Decimal.__new__(cls, utils.str_(value), context)
        except Exception:
            # Fall back to the plain str() form of the value.  Narrowed from
            # a bare except so KeyboardInterrupt/SystemExit are not swallowed.
            return decimal.Decimal.__new__(cls, str(value))

    def __repr__(self):
        if self == self.to_integral():
            return str(self.quantize(decimal.Decimal(1)))
        else:
            # XXX: this adds useless extraneous zeros.
            return "%.5f" % self

    def as_numeric(self):
        """Return the value as a Python float, via its repr()."""
        return float(b_(repr(self)))

    def writeToStream(self, stream, encryption_key):
        """Write repr(self); ``encryption_key`` is not used."""
        stream.write(b_(repr(self)))


class NumberObject(int, PdfObject):
    """A PDF integer."""

    NumberPattern = re.compile(b_('[^+-.0-9]'))
    ByteDot = b_(".")

    def __new__(cls, value):
        val = int(value)
        try:
            return int.__new__(cls, val)
        except OverflowError:
            return int.__new__(cls, 0)

    def as_numeric(self):
        """Return the value as a plain Python int, via its repr()."""
        return int(b_(repr(self)))

    def writeToStream(self, stream, encryption_key):
        """Write repr(self); ``encryption_key`` is not used."""
        stream.write(b_(repr(self)))

    @staticmethod
    def readFromStream(stream):
        """Read up to the first byte matching ``NumberPattern``; return a
        FloatObject if the text contains a dot, else a NumberObject."""
        num = utils.readUntilRegex(stream, NumberObject.NumberPattern)
        if num.find(NumberObject.ByteDot) != -1:
            return FloatObject(num)
        else:
            return NumberObject(num)


##
# Given a string (either a "str" or "unicode"), create a ByteStringObject or a
# TextStringObject to represent the string.
def createStringObject(string):
    if isinstance(string, utils.string_type):
        return TextStringObject(string)
    elif isinstance(string, utils.bytes_type):
        try:
            if string.startswith(codecs.BOM_UTF16_BE):
                retval = TextStringObject(string.decode("utf-16"))
                retval.autodetect_utf16 = True
                return retval
            else:
                # This is probably a big performance hit here, but we need to
                # convert string objects into the text/unicode-aware version if
                # possible... and the only way to check if that's possible is
                # to try.  Some strings are strings, some are just byte arrays.
                retval = TextStringObject(decode_pdfdocencoding(string))
                retval.autodetect_pdfdocencoding = True
                return retval
        except UnicodeDecodeError:
            return ByteStringObject(string)
    else:
        raise TypeError("createStringObject should have str or unicode arg")


def readHexStringFromStream(stream):
    """
    Read a hexadecimal string positioned at its opening ``<``.

    Whitespace between digits is skipped; a trailing odd digit is padded
    with ``0``.  Raises PdfStreamError if the stream ends before ``>``.
    """
    stream.read(1)
    txt = ""
    x = b_("")
    while True:
        tok = readNonWhitespace(stream)
        if not tok:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if tok == b_(">"):
            break
        x += tok
        if len(x) == 2:
            txt += chr(int(x, base=16))
            x = b_("")
    if len(x) == 1:
        x += b_("0")
    if len(x) == 2:
        txt += chr(int(x, base=16))
    return createStringObject(b_(txt))


def readStringFromStream(stream):
    """
    Read a literal string positioned at its opening ``(``.

    Handles nested parentheses and backslash escapes, then passes the raw
    bytes to createStringObject().  Raises PdfStreamError if the stream ends
    before the closing ``)`` and PdfReadError on an unknown escape.
    """
    tok = stream.read(1)
    parens = 1
    txt = b_("")
    while True:
        tok = stream.read(1)
        if not tok:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if tok == b_("("):
            parens += 1
        elif tok == b_(")"):
            parens -= 1
            if parens == 0:
                break
        elif tok == b_("\\"):
            tok = stream.read(1)
            if tok == b_("n"):
                tok = b_("\n")
            elif tok == b_("r"):
                tok = b_("\r")
            elif tok == b_("t"):
                tok = b_("\t")
            elif tok == b_("b"):
                tok = b_("\b")
            elif tok == b_("f"):
                tok = b_("\f")
            elif tok == b_("c"):
                # Same two characters as before (backslash + 'c'), now
                # spelled without an invalid escape sequence.
                tok = b_("\\c")
            elif tok == b_("("):
                tok = b_("(")
            elif tok == b_(")"):
                tok = b_(")")
            elif tok == b_("/"):
                tok = b_("/")
            elif tok == b_("\\"):
                tok = b_("\\")
            elif tok in (b_(" "), b_("/"), b_("%"), b_("<"), b_(">"), b_("["),
                         b_("]"), b_("#"), b_("_"), b_("&"), b_('$')):
                # odd/unnecessary escape sequences we have encountered
                tok = b_(tok)
            elif tok.isdigit():
                # "The number ddd may consist of one, two, or three
                # octal digits; high-order overflow shall be ignored.
                # Three octal digits shall be used, with leading zeros
                # as needed, if the next character of the string is also
                # a digit." (PDF reference 7.3.4.2, p 16)
                for i in range(2):
                    ntok = stream.read(1)
                    if ntok.isdigit():
                        tok += ntok
                    else:
                        # The byte belongs to the string, not the escape:
                        # put it back so the main loop sees it.  Nothing to
                        # put back at EOF.
                        if ntok:
                            stream.seek(-1, 1)
                        break
                # Mask to one byte: high-order overflow is ignored.
                tok = b_(chr(int(tok, base=8) & 0xFF))
            elif tok in b_("\n\r"):
                # This case is hit when a backslash followed by a line
                # break occurs.  If it's a multi-char EOL, consume the
                # second character:
                tok = stream.read(1)
                if not tok in b_("\n\r"):
                    stream.seek(-1, 1)
                # Then don't add anything to the actual string, since this
                # line break was escaped:
                tok = b_('')
            else:
                raise utils.PdfReadError(r"Unexpected escaped string: %s" % tok)
        txt += tok
    return createStringObject(txt)


##
# Represents a string object where the text encoding could not be determined.
# This occurs quite often, as the PDF spec doesn't provide an alternate way to
# represent strings -- for example, the encryption data stored in files (like
# /O) is clearly not text, but is still stored in a "String" object.
class ByteStringObject(utils.bytes_type, PdfObject):

    ##
    # For compatibility with TextStringObject.original_bytes.  This method
    # returns self.
    original_bytes = property(lambda self: self)

    def writeToStream(self, stream, encryption_key):
        bytearr = self
        if encryption_key:
            bytearr = RC4_encrypt(encryption_key, bytearr)
        stream.write(b_("<"))
        stream.write(utils.hexencode(bytearr))
        stream.write(b_(">"))


##
# Represents a string object that has been decoded into a real unicode string.
# If read from a PDF document, this string appeared to match the
# PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to
# occur.
class TextStringObject(utils.string_type, PdfObject):
    autodetect_pdfdocencoding = False
    autodetect_utf16 = False

    ##
    # It is occasionally possible that a text string object gets created where
    # a byte string object was expected due to the autodetection mechanism --
    # if that occurs, this "original_bytes" property can be used to
    # back-calculate what the original encoded bytes were.
    original_bytes = property(lambda self: self.get_original_bytes())

    def get_original_bytes(self):
        # We're a text string object, but the library is trying to get our raw
        # bytes.  This can happen if we auto-detected this string as text, but
        # we were wrong.  It's pretty common.  Return the original bytes that
        # would have been used to create this object, based upon the autodetect
        # method.
        if self.autodetect_utf16:
            return codecs.BOM_UTF16_BE + self.encode("utf-16be")
        elif self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        else:
            raise Exception("no information about original bytes")

    def writeToStream(self, stream, encryption_key):
        # Try to write the string out as a PDFDocEncoding encoded string.  It's
        # nicer to look at in the PDF file.  Sadly, we take a performance hit
        # here for trying...
        try:
            bytearr = encode_pdfdocencoding(self)
        except UnicodeEncodeError:
            bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be")
        if encryption_key:
            bytearr = RC4_encrypt(encryption_key, bytearr)
            obj = ByteStringObject(bytearr)
            obj.writeToStream(stream, None)
        else:
            stream.write(b_("("))
            for c in bytearr:
                if not chr_(c).isalnum() and c != b_(' '):
                    stream.write(b_("\\%03o" % ord_(c)))
                else:
                    stream.write(b_(chr_(c)))
            stream.write(b_(")"))


class NameObject(str, PdfObject):
    """A PDF name such as ``/Type``; the leading slash is part of the value."""

    delimiterPattern = re.compile(b_(r"\s+|[\(\)<>\[\]{}/%]"))
    surfix = b_("/")

    def writeToStream(self, stream, encryption_key):
        """Write the name as-is; ``encryption_key`` is not used."""
        stream.write(b_(self))

    @staticmethod
    def readFromStream(stream, pdf):
        """
        Read a name positioned at its leading ``/``.

        Raises PdfReadError if the first byte is not ``/``, or if the name
        is not valid UTF-8 and ``pdf.strict`` is set; in non-strict mode a
        PdfReadWarning is issued and the undecoded name is used.
        """
        name = stream.read(1)
        if name != NameObject.surfix:
            raise utils.PdfReadError("name read error")
        name += utils.readUntilRegex(stream, NameObject.delimiterPattern,
            ignore_eof=True)
        try:
            return NameObject(name.decode('utf-8'))
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Name objects should represent irregular characters
            # with a '#' followed by the symbol's hex number
            if not pdf.strict:
                warnings.warn("Illegal character in Name Object", utils.PdfReadWarning)
                return NameObject(name)
            else:
                raise utils.PdfReadError("Illegal character in Name Object")


class DictionaryObject(dict, PdfObject):
    """
    A PDF dictionary.

    Keys and values must be PdfObject instances.  Item access through
    ``[]`` resolves indirect references; ``raw_get`` returns the stored
    value untouched.
    """

    def raw_get(self, key):
        """Return the stored value for ``key`` without resolving it."""
        return dict.__getitem__(self, key)

    def __setitem__(self, key, value):
        if not isinstance(key, PdfObject):
            raise ValueError("key must be PdfObject")
        if not isinstance(value, PdfObject):
            raise ValueError("value must be PdfObject")
        return dict.__setitem__(self, key, value)

    def setdefault(self, key, value=None):
        if not isinstance(key, PdfObject):
            raise ValueError("key must be PdfObject")
        if not isinstance(value, PdfObject):
            raise ValueError("value must be PdfObject")
        return dict.setdefault(self, key, value)

    def __getitem__(self, key):
        return dict.__getitem__(self, key).getObject()

    ##
    # Retrieves XMP (Extensible Metadata Platform) data relevant to
    # this object, if available.
    #
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    # @return Returns a {@link #xmp.XmpInformation XmlInformation} instance
    # that can be used to access XMP metadata from the document.  Can also
    # return None if no metadata was found on the document root.
    def getXmpMetadata(self):
        metadata = self.get("/Metadata", None)
        if metadata is None:
            return None
        metadata = metadata.getObject()
        from . import xmp
        if not isinstance(metadata, xmp.XmpInformation):
            metadata = xmp.XmpInformation(metadata)
            # cache the wrapped object for later calls
            self[NameObject("/Metadata")] = metadata
        return metadata

    ##
    # Read-only property that accesses the {@link
    # #DictionaryObject.getXmpData getXmpData} function.
    #
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None)

    def writeToStream(self, stream, encryption_key):
        """Write ``<<``, one ``key value`` pair per line, then ``>>``."""
        stream.write(b_("<<\n"))
        for key, value in list(self.items()):
            key.writeToStream(stream, encryption_key)
            stream.write(b_(" "))
            value.writeToStream(stream, encryption_key)
            stream.write(b_("\n"))
        stream.write(b_(">>"))

    @staticmethod
    def readFromStream(stream, pdf):
        """
        Read a dictionary positioned at its opening ``<<``.

        If the dictionary is followed by the ``stream`` keyword, the stream
        data is read as well and a StreamObject is returned; otherwise a
        DictionaryObject is returned.

        A key defined more than once keeps its first value; this raises
        PdfReadError when ``pdf.strict`` is set and issues a PdfReadWarning
        otherwise.  Raises PdfStreamError on premature end of input and
        PdfReadError on a malformed dictionary or stream.
        """
        tmp = stream.read(2)
        if tmp != b_("<<"):
            raise utils.PdfReadError("Dictionary read error at byte %s: stream must begin with '<<'" % utils.hexStr(stream.tell()))
        data = {}
        while True:
            tok = readNonWhitespace(stream)
            if tok == b_('\x00'):
                continue
            elif tok == b_('%'):
                stream.seek(-1, 1)
                skipOverComment(stream)
                continue
            if not tok:
                # stream has truncated prematurely
                raise PdfStreamError("Stream has ended unexpectedly")

            if tok == b_(">"):
                # consume the second '>' of the closing '>>'
                stream.read(1)
                break
            stream.seek(-1, 1)
            key = readObject(stream, pdf)
            tok = readNonWhitespace(stream)
            stream.seek(-1, 1)
            value = readObject(stream, pdf)
            # Membership test, not truthiness: a first value such as 0 or an
            # empty array must still count as "already defined".
            if key not in data:
                data[key] = value
            elif pdf.strict:
                # multiple definitions of key not permitted
                raise utils.PdfReadError("Multiple definitions in dictionary at byte %s for key %s" \
                                           % (utils.hexStr(stream.tell()), key))
            else:
                warnings.warn("Multiple definitions in dictionary at byte %s for key %s" \
                                           % (utils.hexStr(stream.tell()), key), utils.PdfReadWarning)

        pos = stream.tell()
        s = readNonWhitespace(stream)
        if s == b_('s') and stream.read(5) == b_('tream'):
            eol = stream.read(1)
            # odd PDF file output has spaces after 'stream' keyword but before EOL.
            # patch provided by Danial Sandler
            while eol == b_(' '):
                eol = stream.read(1)
            # Explicit raises instead of assert: these validate file content
            # and must survive running under -O.
            if eol not in (b_("\n"), b_("\r")):
                raise utils.PdfReadError("'stream' keyword must be followed by an end-of-line marker at byte %s" % utils.hexStr(stream.tell()))
            if eol == b_("\r"):
                # read \n after
                if stream.read(1) != b_('\n'):
                    stream.seek(-1, 1)
            # this is a stream object, not a dictionary
            if "/Length" not in data:
                raise utils.PdfReadError("Stream dictionary has no /Length entry at byte %s" % utils.hexStr(stream.tell()))
            length = data["/Length"]
            if isinstance(length, IndirectObject):
                # resolving the reference may move the stream position
                t = stream.tell()
                length = pdf.getObject(length)
                stream.seek(t, 0)
            data["__streamdata__"] = stream.read(length)
            e = readNonWhitespace(stream)
            ndstream = stream.read(8)
            if (e + ndstream) != b_("endstream"):
                # (sigh) - the odd PDF file has a length that is too long, so
                # we need to read backwards to find the "endstream" ending.
                # ReportLab (unknown version) generates files with this bug,
                # and Python users into PDF files tend to be our audience.
                # we need to do this to correct the streamdata and chop off
                # an extra character.
                pos = stream.tell()
                stream.seek(-10, 1)
                end = stream.read(9)
                if end == b_("endstream"):
                    # we found it by looking back one character further.
                    data["__streamdata__"] = data["__streamdata__"][:-1]
                else:
                    stream.seek(pos, 0)
                    raise utils.PdfReadError("Unable to find 'endstream' marker after stream at byte %s." % utils.hexStr(stream.tell()))
        else:
            # not a stream: rewind to just after the dictionary
            stream.seek(pos, 0)
        if "__streamdata__" in data:
            return StreamObject.initializeFromDictionary(data)
        else:
            retval = DictionaryObject()
            retval.update(data)
            return retval
class TreeObject(DictionaryObject):
    """
    A dictionary that heads a linked list of child dictionaries.

    The node itself carries ``/First``, ``/Last`` and ``/Count``; each child
    carries ``/Parent`` and, where applicable, ``/Prev`` and ``/Next``.
    Iterating a TreeObject yields its children, not its keys.
    """

    def __init__(self):
        DictionaryObject.__init__(self)

    def hasChildren(self):
        """Return True if this node has a ``/First`` entry."""
        return '/First' in self

    def __iter__(self):
        return self.children()

    def children(self):
        """Yield each child, following ``/Next`` from ``/First`` to ``/Last``."""
        if not self.hasChildren():
            # A plain return ends the generator; raising StopIteration here
            # is turned into RuntimeError by PEP 479.
            return

        child = self['/First']
        while True:
            yield child
            if child == self['/Last']:
                return
            child = child['/Next']

    def addChild(self, child, pdf):
        """
        Append ``child`` as the new last child of this node.

        ``pdf`` supplies ``getReference()``, used to obtain indirect
        references for the child, the previous last child and this node.
        """
        childObj = child.getObject()
        child = pdf.getReference(childObj)
        assert isinstance(child, IndirectObject)

        if '/First' not in self:
            self[NameObject('/First')] = child
            self[NameObject('/Count')] = NumberObject(0)
            prev = None
        else:
            prev = self['/Last']

        self[NameObject('/Last')] = child
        self[NameObject('/Count')] = NumberObject(self[NameObject('/Count')] + 1)

        # Identity test: a previous sibling that happens to be an empty
        # dictionary must still be linked.
        if prev is not None:
            prevRef = pdf.getReference(prev)
            assert isinstance(prevRef, IndirectObject)
            childObj[NameObject('/Prev')] = prevRef
            prev[NameObject('/Next')] = child

        parentRef = pdf.getReference(self)
        assert isinstance(parentRef, IndirectObject)
        childObj[NameObject('/Parent')] = parentRef

    def removeChild(self, child):
        """
        Unlink ``child`` from this node and strip its tree keys.

        Raises ValueError if ``child`` has no ``/Parent``, if its parent is
        not this node, or if it cannot be found among the children.
        """
        childObj = child.getObject()

        if NameObject('/Parent') not in childObj:
            raise ValueError("Removed child does not appear to be a tree item")
        elif childObj[NameObject('/Parent')] != self:
            raise ValueError("Removed child is not a member of this tree")

        found = False
        prevRef = None
        prev = None
        curRef = self[NameObject('/First')]
        cur = curRef.getObject()
        lastRef = self[NameObject('/Last')]
        last = lastRef.getObject()
        while cur is not None:
            if cur == childObj:
                # /Count is rewrapped in NumberObject on every decrement:
                # NumberObject - 1 is a plain int, which __setitem__ rejects.
                if prev is None:
                    if NameObject('/Next') in cur:
                        # Removing first tree node
                        nextRef = cur[NameObject('/Next')]
                        nextObj = nextRef.getObject()
                        del nextObj[NameObject('/Prev')]
                        self[NameObject('/First')] = nextRef
                        self[NameObject('/Count')] = NumberObject(self[NameObject('/Count')] - 1)
                    else:
                        # Removing only tree node
                        assert self[NameObject('/Count')] == 1
                        del self[NameObject('/Count')]
                        del self[NameObject('/First')]
                        if NameObject('/Last') in self:
                            del self[NameObject('/Last')]
                else:
                    if NameObject('/Next') in cur:
                        # Removing middle tree node
                        nextRef = cur[NameObject('/Next')]
                        nextObj = nextRef.getObject()
                        nextObj[NameObject('/Prev')] = prevRef
                        prev[NameObject('/Next')] = nextRef
                        self[NameObject('/Count')] = NumberObject(self[NameObject('/Count')] - 1)
                    else:
                        # Removing last tree node
                        assert cur == last
                        del prev[NameObject('/Next')]
                        self[NameObject('/Last')] = prevRef
                        self[NameObject('/Count')] = NumberObject(self[NameObject('/Count')] - 1)
                found = True
                break

            prevRef = curRef
            prev = cur
            if NameObject('/Next') in cur:
                curRef = cur[NameObject('/Next')]
                cur = curRef.getObject()
            else:
                curRef = None
                cur = None

        if not found:
            raise ValueError("Removal couldn't find item in tree")

        del childObj[NameObject('/Parent')]
        if NameObject('/Next') in childObj:
            del childObj[NameObject('/Next')]
        if NameObject('/Prev') in childObj:
            del childObj[NameObject('/Prev')]

    def emptyTree(self):
        """Detach every child and remove ``/Count``, ``/First`` and ``/Last``."""
        # Snapshot the children first: the loop deletes the /Next links that
        # children() needs in order to advance.
        for child in list(self):
            childObj = child.getObject()
            del childObj[NameObject('/Parent')]
            if NameObject('/Next') in childObj:
                del childObj[NameObject('/Next')]
            if NameObject('/Prev') in childObj:
                del childObj[NameObject('/Prev')]

        if NameObject('/Count') in self:
            del self[NameObject('/Count')]
        if NameObject('/First') in self:
            del self[NameObject('/First')]
        if NameObject('/Last') in self:
            del self[NameObject('/Last')]
class StreamObject(DictionaryObject):
    """
    A PDF stream: a dictionary plus a byte payload held in ``_data``.

    ``/Length`` is not stored in the dictionary; it is added only while the
    object is being written.
    """

    def __init__(self):
        self._data = None
        self.decodedSelf = None

    def writeToStream(self, stream, encryption_key):
        """
        Write the dictionary, then the payload between ``stream`` and
        ``endstream``.  The payload is RC4-encrypted when ``encryption_key``
        is truthy.
        """
        self[NameObject("/Length")] = NumberObject(len(self._data))
        try:
            DictionaryObject.writeToStream(self, stream, encryption_key)
        finally:
            # /Length is only a write-time entry; remove it even when
            # writing the dictionary fails.
            del self["/Length"]
        stream.write(b_("\nstream\n"))
        data = self._data
        if encryption_key:
            data = RC4_encrypt(encryption_key, data)
        stream.write(data)
        stream.write(b_("\nendstream"))

    @staticmethod
    def initializeFromDictionary(data):
        """
        Build a stream object from a parsed dictionary.

        ``data`` must contain ``"__streamdata__"`` and ``"/Length"``; both
        are removed from ``data``.  Returns an EncodedStreamObject when
        ``/Filter`` is present, otherwise a DecodedStreamObject.
        """
        if "/Filter" in data:
            retval = EncodedStreamObject()
        else:
            retval = DecodedStreamObject()
        retval._data = data["__streamdata__"]
        del data["__streamdata__"]
        del data["/Length"]
        retval.update(data)
        return retval

    def flateEncode(self):
        """
        Return a new EncodedStreamObject holding this stream's ``_data``
        compressed with FlateDecode.

        The result's ``/Filter`` is ``/FlateDecode`` followed by any filters
        already on this stream.  This stream is not modified.
        """
        if "/Filter" in self:
            f = self["/Filter"]
            newf = ArrayObject()
            newf.append(NameObject("/FlateDecode"))
            if isinstance(f, ArrayObject):
                # Copy instead of inserting in place: the array belongs to
                # this stream, whose data is not flate-encoded.
                newf.extend(f)
            else:
                newf.append(f)
            f = newf
        else:
            f = NameObject("/FlateDecode")
        retval = EncodedStreamObject()
        retval[NameObject("/Filter")] = f
        retval._data = filters.FlateDecode.encode(self._data)
        return retval
class DecodedStreamObject(StreamObject):
    """Stream object whose ``_data`` is returned and replaced as-is."""

    def getData(self):
        """Return the payload currently held in ``_data``."""
        payload = self._data
        return payload

    def setData(self, data):
        """Replace the payload held in ``_data`` with ``data``."""
        self._data = data
class EncodedStreamObject(StreamObject):
    """
    Stream object whose ``_data`` is still filter-encoded.

    The decoded form is built on the first getData() call and cached in
    ``decodedSelf``.
    """

    def __init__(self):
        # Reuse the base initialiser so ``_data`` exists as well as
        # ``decodedSelf``.
        StreamObject.__init__(self)

    def getData(self):
        """Return the decoded payload, decoding it on first use."""
        # Identity test: the cached object is a dictionary and is falsy when
        # it has no entries, which would defeat the cache.
        if self.decodedSelf is not None:
            # cached version of decoded object
            return self.decodedSelf.getData()

        # create decoded object
        decoded = DecodedStreamObject()
        decoded._data = filters.decodeStreamData(self)
        for key, value in list(self.items()):
            # entries that describe the encoding do not apply to decoded data
            if key not in ("/Length", "/Filter", "/DecodeParms"):
                decoded[key] = value
        self.decodedSelf = decoded
        return decoded._data

    def setData(self, data):
        """Not supported; always raises PdfReadError."""
        raise utils.PdfReadError("Creating EncodedStreamObject is not currently supported")
class RectangleObject(ArrayObject):
"""
This class is used to represent *page boxes* in PyPDF2. These boxes include:
* :attr:`artBox