801 lines
28 KiB
Python
801 lines
28 KiB
Python
|
# vim: sw=4:expandtab:foldmethod=marker
|
||
|
#
|
||
|
# Copyright (c) 2006, Mathieu Fenniak
|
||
|
# All rights reserved.
|
||
|
#
|
||
|
# Redistribution and use in source and binary forms, with or without
|
||
|
# modification, are permitted provided that the following conditions are
|
||
|
# met:
|
||
|
#
|
||
|
# * Redistributions of source code must retain the above copyright notice,
|
||
|
# this list of conditions and the following disclaimer.
|
||
|
# * Redistributions in binary form must reproduce the above copyright notice,
|
||
|
# this list of conditions and the following disclaimer in the documentation
|
||
|
# and/or other materials provided with the distribution.
|
||
|
# * The name of the author may not be used to endorse or promote products
|
||
|
# derived from this software without specific prior written permission.
|
||
|
#
|
||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||
|
# POSSIBILITY OF SUCH DAMAGE.
|
||
|
|
||
|
|
||
|
"""
|
||
|
Implementation of generic PDF objects (dictionary, number, string, and so on)
|
||
|
"""
|
||
|
__author__ = "Mathieu Fenniak"
|
||
|
__author_email__ = "biziqe@mathieu.fenniak.net"
|
||
|
|
||
|
import re
|
||
|
from utils import readNonWhitespace, RC4_encrypt
|
||
|
import filters
|
||
|
import utils
|
||
|
import decimal
|
||
|
import codecs
|
||
|
|
||
|
def readObject(stream, pdf):
|
||
|
tok = stream.read(1)
|
||
|
stream.seek(-1, 1) # reset to start
|
||
|
if tok == 't' or tok == 'f':
|
||
|
# boolean object
|
||
|
return BooleanObject.readFromStream(stream)
|
||
|
elif tok == '(':
|
||
|
# string object
|
||
|
return readStringFromStream(stream)
|
||
|
elif tok == '/':
|
||
|
# name object
|
||
|
return NameObject.readFromStream(stream)
|
||
|
elif tok == '[':
|
||
|
# array object
|
||
|
return ArrayObject.readFromStream(stream, pdf)
|
||
|
elif tok == 'n':
|
||
|
# null object
|
||
|
return NullObject.readFromStream(stream)
|
||
|
elif tok == '<':
|
||
|
# hexadecimal string OR dictionary
|
||
|
peek = stream.read(2)
|
||
|
stream.seek(-2, 1) # reset to start
|
||
|
if peek == '<<':
|
||
|
return DictionaryObject.readFromStream(stream, pdf)
|
||
|
else:
|
||
|
return readHexStringFromStream(stream)
|
||
|
elif tok == '%':
|
||
|
# comment
|
||
|
while tok not in ('\r', '\n'):
|
||
|
tok = stream.read(1)
|
||
|
tok = readNonWhitespace(stream)
|
||
|
stream.seek(-1, 1)
|
||
|
return readObject(stream, pdf)
|
||
|
else:
|
||
|
# number object OR indirect reference
|
||
|
if tok == '+' or tok == '-':
|
||
|
# number
|
||
|
return NumberObject.readFromStream(stream)
|
||
|
peek = stream.read(20)
|
||
|
stream.seek(-len(peek), 1) # reset to start
|
||
|
if re.match(r"(\d+)\s(\d+)\sR[^a-zA-Z]", peek) != None:
|
||
|
return IndirectObject.readFromStream(stream, pdf)
|
||
|
else:
|
||
|
return NumberObject.readFromStream(stream)
|
||
|
|
||
|
class PdfObject(object):
|
||
|
def getObject(self):
|
||
|
"""Resolves indirect references."""
|
||
|
return self
|
||
|
|
||
|
|
||
|
class NullObject(PdfObject):
|
||
|
def writeToStream(self, stream, encryption_key):
|
||
|
stream.write("null")
|
||
|
|
||
|
def readFromStream(stream):
|
||
|
nulltxt = stream.read(4)
|
||
|
if nulltxt != "null":
|
||
|
raise utils.PdfReadError, "error reading null object"
|
||
|
return NullObject()
|
||
|
readFromStream = staticmethod(readFromStream)
|
||
|
|
||
|
|
||
|
class BooleanObject(PdfObject):
|
||
|
def __init__(self, value):
|
||
|
self.value = value
|
||
|
|
||
|
def writeToStream(self, stream, encryption_key):
|
||
|
if self.value:
|
||
|
stream.write("true")
|
||
|
else:
|
||
|
stream.write("false")
|
||
|
|
||
|
def readFromStream(stream):
|
||
|
word = stream.read(4)
|
||
|
if word == "true":
|
||
|
return BooleanObject(True)
|
||
|
elif word == "fals":
|
||
|
stream.read(1)
|
||
|
return BooleanObject(False)
|
||
|
assert False
|
||
|
readFromStream = staticmethod(readFromStream)
|
||
|
|
||
|
|
||
|
class ArrayObject(list, PdfObject):
|
||
|
def writeToStream(self, stream, encryption_key):
|
||
|
stream.write("[")
|
||
|
for data in self:
|
||
|
stream.write(" ")
|
||
|
data.writeToStream(stream, encryption_key)
|
||
|
stream.write(" ]")
|
||
|
|
||
|
def readFromStream(stream, pdf):
|
||
|
arr = ArrayObject()
|
||
|
tmp = stream.read(1)
|
||
|
if tmp != "[":
|
||
|
raise utils.PdfReadError, "error reading array"
|
||
|
while True:
|
||
|
# skip leading whitespace
|
||
|
tok = stream.read(1)
|
||
|
while tok.isspace():
|
||
|
tok = stream.read(1)
|
||
|
stream.seek(-1, 1)
|
||
|
# check for array ending
|
||
|
peekahead = stream.read(1)
|
||
|
if peekahead == "]":
|
||
|
break
|
||
|
stream.seek(-1, 1)
|
||
|
# read and append obj
|
||
|
arr.append(readObject(stream, pdf))
|
||
|
return arr
|
||
|
readFromStream = staticmethod(readFromStream)
|
||
|
|
||
|
|
||
|
class IndirectObject(PdfObject):
|
||
|
def __init__(self, idnum, generation, pdf):
|
||
|
self.idnum = idnum
|
||
|
self.generation = generation
|
||
|
self.pdf = pdf
|
||
|
|
||
|
def getObject(self):
|
||
|
return self.pdf.getObject(self).getObject()
|
||
|
|
||
|
def __repr__(self):
|
||
|
return "IndirectObject(%r, %r)" % (self.idnum, self.generation)
|
||
|
|
||
|
def __eq__(self, other):
|
||
|
return (
|
||
|
other != None and
|
||
|
isinstance(other, IndirectObject) and
|
||
|
self.idnum == other.idnum and
|
||
|
self.generation == other.generation and
|
||
|
self.pdf is other.pdf
|
||
|
)
|
||
|
|
||
|
def __ne__(self, other):
|
||
|
return not self.__eq__(other)
|
||
|
|
||
|
def writeToStream(self, stream, encryption_key):
|
||
|
stream.write("%s %s R" % (self.idnum, self.generation))
|
||
|
|
||
|
def readFromStream(stream, pdf):
|
||
|
idnum = ""
|
||
|
while True:
|
||
|
tok = stream.read(1)
|
||
|
if tok.isspace():
|
||
|
break
|
||
|
idnum += tok
|
||
|
generation = ""
|
||
|
while True:
|
||
|
tok = stream.read(1)
|
||
|
if tok.isspace():
|
||
|
break
|
||
|
generation += tok
|
||
|
r = stream.read(1)
|
||
|
if r != "R":
|
||
|
raise utils.PdfReadError("error reading indirect object reference")
|
||
|
return IndirectObject(int(idnum), int(generation), pdf)
|
||
|
readFromStream = staticmethod(readFromStream)
|
||
|
|
||
|
|
||
|
class FloatObject(decimal.Decimal, PdfObject):
|
||
|
def __new__(cls, value="0", context=None):
|
||
|
return decimal.Decimal.__new__(cls, str(value), context)
|
||
|
def __repr__(self):
|
||
|
if self == self.to_integral():
|
||
|
return str(self.quantize(decimal.Decimal(1)))
|
||
|
else:
|
||
|
# XXX: this adds useless extraneous zeros.
|
||
|
return "%.5f" % self
|
||
|
def writeToStream(self, stream, encryption_key):
|
||
|
stream.write(repr(self))
|
||
|
|
||
|
|
||
|
class NumberObject(int, PdfObject):
|
||
|
def __init__(self, value):
|
||
|
int.__init__(value)
|
||
|
|
||
|
def writeToStream(self, stream, encryption_key):
|
||
|
stream.write(repr(self))
|
||
|
|
||
|
def readFromStream(stream):
|
||
|
name = ""
|
||
|
while True:
|
||
|
tok = stream.read(1)
|
||
|
if tok != '+' and tok != '-' and tok != '.' and not tok.isdigit():
|
||
|
stream.seek(-1, 1)
|
||
|
break
|
||
|
name += tok
|
||
|
if name.find(".") != -1:
|
||
|
return FloatObject(name)
|
||
|
else:
|
||
|
return NumberObject(name)
|
||
|
readFromStream = staticmethod(readFromStream)
|
||
|
|
||
|
|
||
|
##
|
||
|
# Given a string (either a "str" or "unicode"), create a ByteStringObject or a
|
||
|
# TextStringObject to represent the string.
|
||
|
def createStringObject(string):
|
||
|
if isinstance(string, unicode):
|
||
|
return TextStringObject(string)
|
||
|
elif isinstance(string, str):
|
||
|
if string.startswith(codecs.BOM_UTF16_BE):
|
||
|
retval = TextStringObject(string.decode("utf-16"))
|
||
|
retval.autodetect_utf16 = True
|
||
|
return retval
|
||
|
else:
|
||
|
# This is probably a big performance hit here, but we need to
|
||
|
# convert string objects into the text/unicode-aware version if
|
||
|
# possible... and the only way to check if that's possible is
|
||
|
# to try. Some strings are strings, some are just byte arrays.
|
||
|
try:
|
||
|
retval = TextStringObject(decode_pdfdocencoding(string))
|
||
|
retval.autodetect_pdfdocencoding = True
|
||
|
return retval
|
||
|
except UnicodeDecodeError:
|
||
|
return ByteStringObject(string)
|
||
|
else:
|
||
|
raise TypeError("createStringObject should have str or unicode arg")
|
||
|
|
||
|
|
||
|
def readHexStringFromStream(stream):
|
||
|
stream.read(1)
|
||
|
txt = ""
|
||
|
x = ""
|
||
|
while True:
|
||
|
tok = readNonWhitespace(stream)
|
||
|
if tok == ">":
|
||
|
break
|
||
|
x += tok
|
||
|
if len(x) == 2:
|
||
|
txt += chr(int(x, base=16))
|
||
|
x = ""
|
||
|
if len(x) == 1:
|
||
|
x += "0"
|
||
|
if len(x) == 2:
|
||
|
txt += chr(int(x, base=16))
|
||
|
return createStringObject(txt)
|
||
|
|
||
|
|
||
|
def readStringFromStream(stream):
|
||
|
tok = stream.read(1)
|
||
|
parens = 1
|
||
|
txt = ""
|
||
|
while True:
|
||
|
tok = stream.read(1)
|
||
|
if tok == "(":
|
||
|
parens += 1
|
||
|
elif tok == ")":
|
||
|
parens -= 1
|
||
|
if parens == 0:
|
||
|
break
|
||
|
elif tok == "\\":
|
||
|
tok = stream.read(1)
|
||
|
if tok == "n":
|
||
|
tok = "\n"
|
||
|
elif tok == "r":
|
||
|
tok = "\r"
|
||
|
elif tok == "t":
|
||
|
tok = "\t"
|
||
|
elif tok == "b":
|
||
|
tok = "\b"
|
||
|
elif tok == "f":
|
||
|
tok = "\f"
|
||
|
elif tok == "(":
|
||
|
tok = "("
|
||
|
elif tok == ")":
|
||
|
tok = ")"
|
||
|
elif tok == "\\":
|
||
|
tok = "\\"
|
||
|
elif tok.isdigit():
|
||
|
# "The number ddd may consist of one, two, or three
|
||
|
# octal digits; high-order overflow shall be ignored.
|
||
|
# Three octal digits shall be used, with leading zeros
|
||
|
# as needed, if the next character of the string is also
|
||
|
# a digit." (PDF reference 7.3.4.2, p 16)
|
||
|
for i in range(2):
|
||
|
ntok = stream.read(1)
|
||
|
if ntok.isdigit():
|
||
|
tok += ntok
|
||
|
else:
|
||
|
break
|
||
|
tok = chr(int(tok, base=8))
|
||
|
elif tok in "\n\r":
|
||
|
# This case is hit when a backslash followed by a line
|
||
|
# break occurs. If it's a multi-char EOL, consume the
|
||
|
# second character:
|
||
|
tok = stream.read(1)
|
||
|
if not tok in "\n\r":
|
||
|
stream.seek(-1, 1)
|
||
|
# Then don't add anything to the actual string, since this
|
||
|
# line break was escaped:
|
||
|
tok = ''
|
||
|
else:
|
||
|
raise utils.PdfReadError("Unexpected escaped string")
|
||
|
txt += tok
|
||
|
return createStringObject(txt)
|
||
|
|
||
|
|
||
|
##
|
||
|
# Represents a string object where the text encoding could not be determined.
|
||
|
# This occurs quite often, as the PDF spec doesn't provide an alternate way to
|
||
|
# represent strings -- for example, the encryption data stored in files (like
|
||
|
# /O) is clearly not text, but is still stored in a "String" object.
|
||
|
class ByteStringObject(str, PdfObject):
|
||
|
|
||
|
##
|
||
|
# For compatibility with TextStringObject.original_bytes. This method
|
||
|
# returns self.
|
||
|
original_bytes = property(lambda self: self)
|
||
|
|
||
|
def writeToStream(self, stream, encryption_key):
|
||
|
bytearr = self
|
||
|
if encryption_key:
|
||
|
bytearr = RC4_encrypt(encryption_key, bytearr)
|
||
|
stream.write("<")
|
||
|
stream.write(bytearr.encode("hex"))
|
||
|
stream.write(">")
|
||
|
|
||
|
|
||
|
##
|
||
|
# Represents a string object that has been decoded into a real unicode string.
|
||
|
# If read from a PDF document, this string appeared to match the
|
||
|
# PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to
|
||
|
# occur.
|
||
|
class TextStringObject(unicode, PdfObject):
|
||
|
autodetect_pdfdocencoding = False
|
||
|
autodetect_utf16 = False
|
||
|
|
||
|
##
|
||
|
# It is occasionally possible that a text string object gets created where
|
||
|
# a byte string object was expected due to the autodetection mechanism --
|
||
|
# if that occurs, this "original_bytes" property can be used to
|
||
|
# back-calculate what the original encoded bytes were.
|
||
|
original_bytes = property(lambda self: self.get_original_bytes())
|
||
|
|
||
|
def get_original_bytes(self):
|
||
|
# We're a text string object, but the library is trying to get our raw
|
||
|
# bytes. This can happen if we auto-detected this string as text, but
|
||
|
# we were wrong. It's pretty common. Return the original bytes that
|
||
|
# would have been used to create this object, based upon the autodetect
|
||
|
# method.
|
||
|
if self.autodetect_utf16:
|
||
|
return codecs.BOM_UTF16_BE + self.encode("utf-16be")
|
||
|
elif self.autodetect_pdfdocencoding:
|
||
|
return encode_pdfdocencoding(self)
|
||
|
else:
|
||
|
raise Exception("no information about original bytes")
|
||
|
|
||
|
def writeToStream(self, stream, encryption_key):
|
||
|
# Try to write the string out as a PDFDocEncoding encoded string. It's
|
||
|
# nicer to look at in the PDF file. Sadly, we take a performance hit
|
||
|
# here for trying...
|
||
|
try:
|
||
|
bytearr = encode_pdfdocencoding(self)
|
||
|
except UnicodeEncodeError:
|
||
|
bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be")
|
||
|
if encryption_key:
|
||
|
bytearr = RC4_encrypt(encryption_key, bytearr)
|
||
|
obj = ByteStringObject(bytearr)
|
||
|
obj.writeToStream(stream, None)
|
||
|
else:
|
||
|
stream.write("(")
|
||
|
for c in bytearr:
|
||
|
if not c.isalnum() and c != ' ':
|
||
|
stream.write("\\%03o" % ord(c))
|
||
|
else:
|
||
|
stream.write(c)
|
||
|
stream.write(")")
|
||
|
|
||
|
|
||
|
class NameObject(str, PdfObject):
|
||
|
delimiterCharacters = "(", ")", "<", ">", "[", "]", "{", "}", "/", "%"
|
||
|
|
||
|
def __init__(self, data):
|
||
|
str.__init__(data)
|
||
|
|
||
|
def writeToStream(self, stream, encryption_key):
|
||
|
stream.write(self)
|
||
|
|
||
|
def readFromStream(stream):
|
||
|
name = stream.read(1)
|
||
|
if name != "/":
|
||
|
raise utils.PdfReadError, "name read error"
|
||
|
while True:
|
||
|
tok = stream.read(1)
|
||
|
if tok.isspace() or tok in NameObject.delimiterCharacters:
|
||
|
stream.seek(-1, 1)
|
||
|
break
|
||
|
name += tok
|
||
|
return NameObject(name)
|
||
|
readFromStream = staticmethod(readFromStream)
|
||
|
|
||
|
|
||
|
class DictionaryObject(dict, PdfObject):
|
||
|
|
||
|
def __init__(self, *args, **kwargs):
|
||
|
if len(args) == 0:
|
||
|
self.update(kwargs)
|
||
|
elif len(args) == 1:
|
||
|
arr = args[0]
|
||
|
# If we're passed a list/tuple, make a dict out of it
|
||
|
if not hasattr(arr, "iteritems"):
|
||
|
newarr = {}
|
||
|
for k, v in arr:
|
||
|
newarr[k] = v
|
||
|
arr = newarr
|
||
|
self.update(arr)
|
||
|
else:
|
||
|
raise TypeError("dict expected at most 1 argument, got 3")
|
||
|
|
||
|
def update(self, arr):
|
||
|
# note, a ValueError halfway through copying values
|
||
|
# will leave half the values in this dict.
|
||
|
for k, v in arr.iteritems():
|
||
|
self.__setitem__(k, v)
|
||
|
|
||
|
def raw_get(self, key):
|
||
|
return dict.__getitem__(self, key)
|
||
|
|
||
|
def __setitem__(self, key, value):
|
||
|
if not isinstance(key, PdfObject):
|
||
|
raise ValueError("key must be PdfObject")
|
||
|
if not isinstance(value, PdfObject):
|
||
|
raise ValueError("value must be PdfObject")
|
||
|
return dict.__setitem__(self, key, value)
|
||
|
|
||
|
def setdefault(self, key, value=None):
|
||
|
if not isinstance(key, PdfObject):
|
||
|
raise ValueError("key must be PdfObject")
|
||
|
if not isinstance(value, PdfObject):
|
||
|
raise ValueError("value must be PdfObject")
|
||
|
return dict.setdefault(self, key, value)
|
||
|
|
||
|
def __getitem__(self, key):
|
||
|
return dict.__getitem__(self, key).getObject()
|
||
|
|
||
|
##
|
||
|
# Retrieves XMP (Extensible Metadata Platform) data relevant to the
|
||
|
# this object, if available.
|
||
|
# <p>
|
||
|
# Stability: Added in v1.12, will exist for all future v1.x releases.
|
||
|
# @return Returns a {@link #xmp.XmpInformation XmlInformation} instance
|
||
|
# that can be used to access XMP metadata from the document. Can also
|
||
|
# return None if no metadata was found on the document root.
|
||
|
def getXmpMetadata(self):
|
||
|
metadata = self.get("/Metadata", None)
|
||
|
if metadata == None:
|
||
|
return None
|
||
|
metadata = metadata.getObject()
|
||
|
import xmp
|
||
|
if not isinstance(metadata, xmp.XmpInformation):
|
||
|
metadata = xmp.XmpInformation(metadata)
|
||
|
self[NameObject("/Metadata")] = metadata
|
||
|
return metadata
|
||
|
|
||
|
##
|
||
|
# Read-only property that accesses the {@link
|
||
|
# #DictionaryObject.getXmpData getXmpData} function.
|
||
|
# <p>
|
||
|
# Stability: Added in v1.12, will exist for all future v1.x releases.
|
||
|
xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None)
|
||
|
|
||
|
def writeToStream(self, stream, encryption_key):
|
||
|
stream.write("<<\n")
|
||
|
for key, value in self.items():
|
||
|
key.writeToStream(stream, encryption_key)
|
||
|
stream.write(" ")
|
||
|
value.writeToStream(stream, encryption_key)
|
||
|
stream.write("\n")
|
||
|
stream.write(">>")
|
||
|
|
||
|
def readFromStream(stream, pdf):
|
||
|
tmp = stream.read(2)
|
||
|
if tmp != "<<":
|
||
|
raise utils.PdfReadError, "dictionary read error"
|
||
|
data = {}
|
||
|
while True:
|
||
|
tok = readNonWhitespace(stream)
|
||
|
if tok == ">":
|
||
|
stream.read(1)
|
||
|
break
|
||
|
stream.seek(-1, 1)
|
||
|
key = readObject(stream, pdf)
|
||
|
tok = readNonWhitespace(stream)
|
||
|
stream.seek(-1, 1)
|
||
|
value = readObject(stream, pdf)
|
||
|
if data.has_key(key):
|
||
|
# multiple definitions of key not permitted
|
||
|
raise utils.PdfReadError, "multiple definitions in dictionary"
|
||
|
data[key] = value
|
||
|
pos = stream.tell()
|
||
|
s = readNonWhitespace(stream)
|
||
|
if s == 's' and stream.read(5) == 'tream':
|
||
|
eol = stream.read(1)
|
||
|
# odd PDF file output has spaces after 'stream' keyword but before EOL.
|
||
|
# patch provided by Danial Sandler
|
||
|
while eol == ' ':
|
||
|
eol = stream.read(1)
|
||
|
assert eol in ("\n", "\r")
|
||
|
if eol == "\r":
|
||
|
# read \n after
|
||
|
stream.read(1)
|
||
|
# this is a stream object, not a dictionary
|
||
|
assert data.has_key("/Length")
|
||
|
length = data["/Length"]
|
||
|
if isinstance(length, IndirectObject):
|
||
|
t = stream.tell()
|
||
|
length = pdf.getObject(length)
|
||
|
stream.seek(t, 0)
|
||
|
data["__streamdata__"] = stream.read(length)
|
||
|
e = readNonWhitespace(stream)
|
||
|
ndstream = stream.read(8)
|
||
|
if (e + ndstream) != "endstream":
|
||
|
# (sigh) - the odd PDF file has a length that is too long, so
|
||
|
# we need to read backwards to find the "endstream" ending.
|
||
|
# ReportLab (unknown version) generates files with this bug,
|
||
|
# and Python users into PDF files tend to be our audience.
|
||
|
# we need to do this to correct the streamdata and chop off
|
||
|
# an extra character.
|
||
|
pos = stream.tell()
|
||
|
stream.seek(-10, 1)
|
||
|
end = stream.read(9)
|
||
|
if end == "endstream":
|
||
|
# we found it by looking back one character further.
|
||
|
data["__streamdata__"] = data["__streamdata__"][:-1]
|
||
|
else:
|
||
|
stream.seek(pos, 0)
|
||
|
raise utils.PdfReadError, "Unable to find 'endstream' marker after stream."
|
||
|
else:
|
||
|
stream.seek(pos, 0)
|
||
|
if data.has_key("__streamdata__"):
|
||
|
return StreamObject.initializeFromDictionary(data)
|
||
|
else:
|
||
|
retval = DictionaryObject()
|
||
|
retval.update(data)
|
||
|
return retval
|
||
|
readFromStream = staticmethod(readFromStream)
|
||
|
|
||
|
|
||
|
class StreamObject(DictionaryObject):
|
||
|
def __init__(self):
|
||
|
self._data = None
|
||
|
self.decodedSelf = None
|
||
|
|
||
|
def writeToStream(self, stream, encryption_key):
|
||
|
self[NameObject("/Length")] = NumberObject(len(self._data))
|
||
|
DictionaryObject.writeToStream(self, stream, encryption_key)
|
||
|
del self["/Length"]
|
||
|
stream.write("\nstream\n")
|
||
|
data = self._data
|
||
|
if encryption_key:
|
||
|
data = RC4_encrypt(encryption_key, data)
|
||
|
stream.write(data)
|
||
|
stream.write("\nendstream")
|
||
|
|
||
|
def initializeFromDictionary(data):
|
||
|
if data.has_key("/Filter"):
|
||
|
retval = EncodedStreamObject()
|
||
|
else:
|
||
|
retval = DecodedStreamObject()
|
||
|
retval._data = data["__streamdata__"]
|
||
|
del data["__streamdata__"]
|
||
|
del data["/Length"]
|
||
|
retval.update(data)
|
||
|
return retval
|
||
|
initializeFromDictionary = staticmethod(initializeFromDictionary)
|
||
|
|
||
|
def flateEncode(self):
|
||
|
if self.has_key("/Filter"):
|
||
|
f = self["/Filter"]
|
||
|
if isinstance(f, ArrayObject):
|
||
|
f.insert(0, NameObject("/FlateDecode"))
|
||
|
else:
|
||
|
newf = ArrayObject()
|
||
|
newf.append(NameObject("/FlateDecode"))
|
||
|
newf.append(f)
|
||
|
f = newf
|
||
|
else:
|
||
|
f = NameObject("/FlateDecode")
|
||
|
retval = EncodedStreamObject()
|
||
|
retval[NameObject("/Filter")] = f
|
||
|
retval._data = filters.FlateDecode.encode(self._data)
|
||
|
return retval
|
||
|
|
||
|
|
||
|
class DecodedStreamObject(StreamObject):
|
||
|
def getData(self):
|
||
|
return self._data
|
||
|
|
||
|
def setData(self, data):
|
||
|
self._data = data
|
||
|
|
||
|
|
||
|
class EncodedStreamObject(StreamObject):
|
||
|
def __init__(self):
|
||
|
self.decodedSelf = None
|
||
|
|
||
|
def getData(self):
|
||
|
if self.decodedSelf:
|
||
|
# cached version of decoded object
|
||
|
return self.decodedSelf.getData()
|
||
|
else:
|
||
|
# create decoded object
|
||
|
decoded = DecodedStreamObject()
|
||
|
decoded._data = filters.decodeStreamData(self)
|
||
|
for key, value in self.items():
|
||
|
if not key in ("/Length", "/Filter", "/DecodeParms"):
|
||
|
decoded[key] = value
|
||
|
self.decodedSelf = decoded
|
||
|
return decoded._data
|
||
|
|
||
|
def setData(self, data):
|
||
|
raise utils.PdfReadError, "Creating EncodedStreamObject is not currently supported"
|
||
|
|
||
|
|
||
|
class RectangleObject(ArrayObject):
|
||
|
def __init__(self, arr):
|
||
|
# must have four points
|
||
|
assert len(arr) == 4
|
||
|
# automatically convert arr[x] into NumberObject(arr[x]) if necessary
|
||
|
ArrayObject.__init__(self, [self.ensureIsNumber(x) for x in arr])
|
||
|
|
||
|
def ensureIsNumber(self, value):
|
||
|
if not isinstance(value, (NumberObject, FloatObject)):
|
||
|
value = FloatObject(value)
|
||
|
return value
|
||
|
|
||
|
def __repr__(self):
|
||
|
return "RectangleObject(%s)" % repr(list(self))
|
||
|
|
||
|
def getLowerLeft_x(self):
|
||
|
return self[0]
|
||
|
|
||
|
def getLowerLeft_y(self):
|
||
|
return self[1]
|
||
|
|
||
|
def getUpperRight_x(self):
|
||
|
return self[2]
|
||
|
|
||
|
def getUpperRight_y(self):
|
||
|
return self[3]
|
||
|
|
||
|
def getUpperLeft_x(self):
|
||
|
return self.getLowerLeft_x()
|
||
|
|
||
|
def getUpperLeft_y(self):
|
||
|
return self.getUpperRight_y()
|
||
|
|
||
|
def getLowerRight_x(self):
|
||
|
return self.getUpperRight_x()
|
||
|
|
||
|
def getLowerRight_y(self):
|
||
|
return self.getLowerLeft_y()
|
||
|
|
||
|
def getLowerLeft(self):
|
||
|
return self.getLowerLeft_x(), self.getLowerLeft_y()
|
||
|
|
||
|
def getLowerRight(self):
|
||
|
return self.getLowerRight_x(), self.getLowerRight_y()
|
||
|
|
||
|
def getUpperLeft(self):
|
||
|
return self.getUpperLeft_x(), self.getUpperLeft_y()
|
||
|
|
||
|
def getUpperRight(self):
|
||
|
return self.getUpperRight_x(), self.getUpperRight_y()
|
||
|
|
||
|
def setLowerLeft(self, value):
|
||
|
self[0], self[1] = [self.ensureIsNumber(x) for x in value]
|
||
|
|
||
|
def setLowerRight(self, value):
|
||
|
self[2], self[1] = [self.ensureIsNumber(x) for x in value]
|
||
|
|
||
|
def setUpperLeft(self, value):
|
||
|
self[0], self[3] = [self.ensureIsNumber(x) for x in value]
|
||
|
|
||
|
def setUpperRight(self, value):
|
||
|
self[2], self[3] = [self.ensureIsNumber(x) for x in value]
|
||
|
|
||
|
def getWidth(self):
|
||
|
return self.getUpperRight_x() - self.getLowerLeft_x()
|
||
|
|
||
|
def getHeight(self):
|
||
|
return self.getUpperRight_y() - self.getLowerLeft_x()
|
||
|
|
||
|
lowerLeft = property(getLowerLeft, setLowerLeft, None, None)
|
||
|
lowerRight = property(getLowerRight, setLowerRight, None, None)
|
||
|
upperLeft = property(getUpperLeft, setUpperLeft, None, None)
|
||
|
upperRight = property(getUpperRight, setUpperRight, None, None)
|
||
|
|
||
|
|
||
|
def encode_pdfdocencoding(unicode_string):
|
||
|
retval = ''
|
||
|
for c in unicode_string:
|
||
|
try:
|
||
|
retval += chr(_pdfDocEncoding_rev[c])
|
||
|
except KeyError:
|
||
|
raise UnicodeEncodeError("pdfdocencoding", c, -1, -1,
|
||
|
"does not exist in translation table")
|
||
|
return retval
|
||
|
|
||
|
def decode_pdfdocencoding(byte_array):
|
||
|
retval = u''
|
||
|
for b in byte_array:
|
||
|
c = _pdfDocEncoding[ord(b)]
|
||
|
if c == u'\u0000':
|
||
|
raise UnicodeDecodeError("pdfdocencoding", b, -1, -1,
|
||
|
"does not exist in translation table")
|
||
|
retval += c
|
||
|
return retval
|
||
|
|
||
|
_pdfDocEncoding = (
|
||
|
u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000',
|
||
|
u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000',
|
||
|
u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000',
|
||
|
u'\u02d8', u'\u02c7', u'\u02c6', u'\u02d9', u'\u02dd', u'\u02db', u'\u02da', u'\u02dc',
|
||
|
u'\u0020', u'\u0021', u'\u0022', u'\u0023', u'\u0024', u'\u0025', u'\u0026', u'\u0027',
|
||
|
u'\u0028', u'\u0029', u'\u002a', u'\u002b', u'\u002c', u'\u002d', u'\u002e', u'\u002f',
|
||
|
u'\u0030', u'\u0031', u'\u0032', u'\u0033', u'\u0034', u'\u0035', u'\u0036', u'\u0037',
|
||
|
u'\u0038', u'\u0039', u'\u003a', u'\u003b', u'\u003c', u'\u003d', u'\u003e', u'\u003f',
|
||
|
u'\u0040', u'\u0041', u'\u0042', u'\u0043', u'\u0044', u'\u0045', u'\u0046', u'\u0047',
|
||
|
u'\u0048', u'\u0049', u'\u004a', u'\u004b', u'\u004c', u'\u004d', u'\u004e', u'\u004f',
|
||
|
u'\u0050', u'\u0051', u'\u0052', u'\u0053', u'\u0054', u'\u0055', u'\u0056', u'\u0057',
|
||
|
u'\u0058', u'\u0059', u'\u005a', u'\u005b', u'\u005c', u'\u005d', u'\u005e', u'\u005f',
|
||
|
u'\u0060', u'\u0061', u'\u0062', u'\u0063', u'\u0064', u'\u0065', u'\u0066', u'\u0067',
|
||
|
u'\u0068', u'\u0069', u'\u006a', u'\u006b', u'\u006c', u'\u006d', u'\u006e', u'\u006f',
|
||
|
u'\u0070', u'\u0071', u'\u0072', u'\u0073', u'\u0074', u'\u0075', u'\u0076', u'\u0077',
|
||
|
u'\u0078', u'\u0079', u'\u007a', u'\u007b', u'\u007c', u'\u007d', u'\u007e', u'\u0000',
|
||
|
u'\u2022', u'\u2020', u'\u2021', u'\u2026', u'\u2014', u'\u2013', u'\u0192', u'\u2044',
|
||
|
u'\u2039', u'\u203a', u'\u2212', u'\u2030', u'\u201e', u'\u201c', u'\u201d', u'\u2018',
|
||
|
u'\u2019', u'\u201a', u'\u2122', u'\ufb01', u'\ufb02', u'\u0141', u'\u0152', u'\u0160',
|
||
|
u'\u0178', u'\u017d', u'\u0131', u'\u0142', u'\u0153', u'\u0161', u'\u017e', u'\u0000',
|
||
|
u'\u20ac', u'\u00a1', u'\u00a2', u'\u00a3', u'\u00a4', u'\u00a5', u'\u00a6', u'\u00a7',
|
||
|
u'\u00a8', u'\u00a9', u'\u00aa', u'\u00ab', u'\u00ac', u'\u0000', u'\u00ae', u'\u00af',
|
||
|
u'\u00b0', u'\u00b1', u'\u00b2', u'\u00b3', u'\u00b4', u'\u00b5', u'\u00b6', u'\u00b7',
|
||
|
u'\u00b8', u'\u00b9', u'\u00ba', u'\u00bb', u'\u00bc', u'\u00bd', u'\u00be', u'\u00bf',
|
||
|
u'\u00c0', u'\u00c1', u'\u00c2', u'\u00c3', u'\u00c4', u'\u00c5', u'\u00c6', u'\u00c7',
|
||
|
u'\u00c8', u'\u00c9', u'\u00ca', u'\u00cb', u'\u00cc', u'\u00cd', u'\u00ce', u'\u00cf',
|
||
|
u'\u00d0', u'\u00d1', u'\u00d2', u'\u00d3', u'\u00d4', u'\u00d5', u'\u00d6', u'\u00d7',
|
||
|
u'\u00d8', u'\u00d9', u'\u00da', u'\u00db', u'\u00dc', u'\u00dd', u'\u00de', u'\u00df',
|
||
|
u'\u00e0', u'\u00e1', u'\u00e2', u'\u00e3', u'\u00e4', u'\u00e5', u'\u00e6', u'\u00e7',
|
||
|
u'\u00e8', u'\u00e9', u'\u00ea', u'\u00eb', u'\u00ec', u'\u00ed', u'\u00ee', u'\u00ef',
|
||
|
u'\u00f0', u'\u00f1', u'\u00f2', u'\u00f3', u'\u00f4', u'\u00f5', u'\u00f6', u'\u00f7',
|
||
|
u'\u00f8', u'\u00f9', u'\u00fa', u'\u00fb', u'\u00fc', u'\u00fd', u'\u00fe', u'\u00ff'
|
||
|
)
|
||
|
|
||
|
assert len(_pdfDocEncoding) == 256
|
||
|
|
||
|
_pdfDocEncoding_rev = {}
|
||
|
for i in xrange(256):
|
||
|
char = _pdfDocEncoding[i]
|
||
|
if char == u"\u0000":
|
||
|
continue
|
||
|
assert char not in _pdfDocEncoding_rev
|
||
|
_pdfDocEncoding_rev[char] = i
|
||
|
|