summaryrefslogtreecommitdiff
path: root/lib/pdfrw
diff options
context:
space:
mode:
author: jvoisin 2011-08-16 18:11:24 +0200
committer: jvoisin 2011-08-16 18:11:24 +0200
commit: 4bd3e47da02fde08acfada1795cc55170abdb00a (patch)
tree: f8c7aa5fd5e1b07a28b350c5ded8125ef2467c51 /lib/pdfrw
parent: baf8e080125614326ba9c96ca8f2404fd12b050e (diff)
setup.py now works !
Diffstat (limited to 'lib/pdfrw')
-rw-r--r-- lib/pdfrw/__init__.py 14
-rw-r--r-- lib/pdfrw/pdfcompress.py 57
-rw-r--r-- lib/pdfrw/pdfobjects.py 183
-rw-r--r-- lib/pdfrw/pdfreader.py 213
-rw-r--r-- lib/pdfrw/pdftokens.py 249
-rw-r--r-- lib/pdfrw/pdfwriter.py 234
6 files changed, 0 insertions, 950 deletions
diff --git a/lib/pdfrw/__init__.py b/lib/pdfrw/__init__.py
deleted file mode 100644
index 26e8c73..0000000
--- a/lib/pdfrw/__init__.py
+++ /dev/null
@@ -1,14 +0,0 @@
1# A part of pdfrw (pdfrw.googlecode.com)
2# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
3# MIT license -- See LICENSE.txt for details
4
5from pdfwriter import PdfWriter
6from pdfreader import PdfReader
7from pdfobjects import PdfObject, PdfName, PdfArray, PdfDict, IndirectPdfDict, PdfString
8from pdftokens import PdfTokens
9
# A small amount of pyPdf compatibility: expose the reader and the
# writer under the class names pyPdf uses for them.
PdfFileReader, PdfFileWriter = PdfReader, PdfWriter
14
diff --git a/lib/pdfrw/pdfcompress.py b/lib/pdfrw/pdfcompress.py
deleted file mode 100644
index 1c11970..0000000
--- a/lib/pdfrw/pdfcompress.py
+++ /dev/null
@@ -1,57 +0,0 @@
1# A part of pdfrw (pdfrw.googlecode.com)
2# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
3# MIT license -- See LICENSE.txt for details
4
5'''
6Currently, this sad little file only knows how to decompress
7using the flate (zlib) algorithm. Maybe more later, but it's
8not a priority for me...
9'''
10
11from __future__ import generators
12
13try:
14 set
15except NameError:
16 from sets import Set as set
17
18import zlib
19from pdfobjects import PdfDict, PdfName
20
21
def streamobjects(mylist):
    ''' Generate the members of mylist that are PdfDict instances
        carrying stream data (stream attribute is not None).
        Everything else is skipped.
    '''
    for candidate in mylist:
        if not isinstance(candidate, PdfDict):
            continue
        if candidate.stream is None:
            continue
        yield candidate
26
def uncompress(mylist, warnings=set()):
    ''' Inflate, in place, the stream objects found in mylist.

        A stream is decompressed only when its filter is
        /FlateDecode (given directly or as a one-element list) and
        it has no /DecodeParms; its Filter entry is then cleared.
        Streams without a filter are skipped silently.  Any other
        combination is left alone and reported: each distinct
        message is printed the first time it occurs and then
        remembered in warnings.

        NOTE: the default warnings set is created once, so calls
        that do not pass their own set share it.  This looks
        intentional (it keeps a message from being printed twice).
    '''
    flate = PdfName.FlateDecode
    for obj in streamobjects(mylist):
        ftype = obj.Filter
        if ftype is None:
            continue
        if isinstance(ftype, list) and len(ftype) == 1:
            # todo: multiple filters
            ftype = ftype[0]
        parms = obj.DecodeParms
        if ftype == flate and parms is None:
            obj.stream = zlib.decompress(obj.stream)
            obj.Filter = None
            continue
        msg = 'Not decompressing: cannot use filter %s with parameters %s' % (repr(ftype), repr(parms))
        if msg in warnings:
            continue
        warnings.add(msg)
        print(msg)
45
def compress(mylist):
    ''' Deflate, in place, the stream objects in mylist that do not
        have a filter yet.

        The zlib output replaces the stream only when its length is
        below the original length plus 30; in that case Filter is
        set to /FlateDecode and DecodeParms is cleared.  Otherwise
        the object is left unchanged.
    '''
    flate = PdfName.FlateDecode
    for obj in streamobjects(mylist):
        if obj.Filter is not None:
            # Already filtered somehow; leave it alone.
            continue
        raw = obj.stream
        packed = zlib.compress(raw)
        if len(packed) >= len(raw) + 30:
            continue
        obj.stream = packed
        obj.Filter = flate
        obj.DecodeParms = None
diff --git a/lib/pdfrw/pdfobjects.py b/lib/pdfrw/pdfobjects.py
deleted file mode 100644
index 08ad825..0000000
--- a/lib/pdfrw/pdfobjects.py
+++ /dev/null
@@ -1,183 +0,0 @@
1# A part of pdfrw (pdfrw.googlecode.com)
2# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
3# MIT license -- See LICENSE.txt for details
4
5'''
6Objects that can occur in PDF files. The most important
7objects are arrays and dicts. Either of these can be
8indirect or not, and dicts could have an associated
9stream.
10'''
11from __future__ import generators
12
13try:
14 set
15except NameError:
16 from sets import Set as set
17
18import re
19
class PdfObject(str):
    ''' A str subclass used for PDF tokens.  It only adds an
        indirect flag, which defaults to False.
    '''

    indirect = False
22
class PdfArray(list):
    ''' A list subclass used for PDF arrays.  It only adds an
        indirect flag, which defaults to False.
    '''

    indirect = False
25
class PdfName(object):
    ''' Factory for PDF name objects.  The class is replaced by its
        single instance right below, after which PdfName('Foo') and
        PdfName.Foo both give PdfObject('/Foo').
    '''

    def __call__(self, name):
        return PdfObject('/' + name)

    def __getattr__(self, name):
        # Only invoked for attributes that are not found normally,
        # so any such attribute name is turned into a PDF name.
        return self(name)

# From here on the name PdfName refers to the factory instance.
PdfName = PdfName()
33
class PdfString(str):
    ''' A PDF string token kept in its raw file form: either a
        literal string '(...)' or a hex string '<...>'.

        decode() converts the raw token to its character data;
        encode() builds a literal-string token from Python data.
    '''
    indirect = False

    # Replacement text for the escape sequences recognized by
    # unescape_pattern.  A backslash in front of a line ending is a
    # line continuation and disappears.  A lone backslash is dropped,
    # which leaves the character after it (e.g. a parenthesis) as is.
    unescape_dict = {'\\b':'\b', '\\f':'\f', '\\n':'\n',
                     '\\r':'\r', '\\t':'\t',
                     '\\\r\n': '', '\\\r':'', '\\\n':'',
                     '\\\\':'\\', '\\':'',
                    }

    # The escaped backslash must be the first alternative: otherwise
    # its two characters are consumed one at a time and '\\b' (an
    # escaped backslash followed by 'b') is misread as a backspace.
    # Octal escapes are limited to one to three octal digits so that
    # trailing digits (or 8 and 9) stay ordinary text.
    unescape_pattern = r'(\\\\|\\b|\\f|\\n|\\r|\\t|\\\r\n|\\\r|\\\n|\\[0-7]{1,3}|\\)'
    unescape_func = re.compile(unescape_pattern).split

    # Splitters for hex strings: one byte (up to two hex digits) or
    # two bytes (up to four hex digits) per character.
    hex_pattern = '([a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])'
    hex_func = re.compile(hex_pattern).split

    hex_pattern2 = '([a-fA-F0-9][a-fA-F0-9][a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])'
    hex_func2 = re.compile(hex_pattern2).split

    # Indexed by the twobytes flag of decode_hex().
    hex_funcs = hex_func, hex_func2

    def decode_regular(self, remap=chr):
        ''' Decode a literal string '(...)' and return its text.

            remap converts the integer value of an octal escape to
            a character.  Values above 127 are clamped to 127.
        '''
        assert self[0] == '(' and self[-1] == ')'
        # Splitting on a capturing group alternates plain text with
        # the escape sequences that were matched.
        mylist = self.unescape_func(self[1:-1])
        result = []
        unescape = self.unescape_dict.get
        for chunk in mylist:
            chunk = unescape(chunk, chunk)
            # Anything still starting with a backslash and longer
            # than one character is an octal escape.
            if chunk.startswith('\\') and len(chunk) > 1:
                value = int(chunk[1:], 8)
                # FIXME: TODO: Handle unicode here
                if value > 127:
                    value = 127
                chunk = remap(value)
            if chunk:
                result.append(chunk)
        return ''.join(result)

    def decode_hex(self, remap=chr, twobytes=False):
        ''' Decode a hex string '<...>' and return its text.

            Whitespace inside the token is ignored.  remap converts
            each integer value to a character; twobytes selects
            four hex digits per character instead of two.
        '''
        data = ''.join(self.split())
        data = self.hex_funcs[twobytes](data)
        chars = data[1::2]
        other = data[0::2]
        # Apart from the hex digits only the two brackets may remain.
        assert other[0] == '<' and other[-1] == '>' and ''.join(other) == '<>', self
        return ''.join([remap(int(x, 16)) for x in chars])

    def decode(self, remap=chr, twobytes=False):
        ''' Decode this token, as a literal string when it starts
            with '(' and as a hex string otherwise.
        '''
        if self.startswith('('):
            return self.decode_regular(remap)

        else:
            return self.decode_hex(remap, twobytes)

    def encode(cls, source, usehex=False):
        ''' Return a literal-string token for source.

            unicode input is encoded as UTF-8, anything else goes
            through str().  Backslashes and parentheses are escaped.
            usehex is not supported yet and must be false.
        '''
        assert not usehex, "Not supported yet"
        if isinstance(source, unicode):
            source = source.encode('utf-8')
        else:
            source = str(source)
        # The backslash has to be escaped first, before the escapes
        # for the parentheses add backslashes of their own.
        source = source.replace('\\', '\\\\')
        source = source.replace('(', '\\(')
        source = source.replace(')', '\\)')
        return cls('(' +source + ')')
    encode = classmethod(encode)
97
class PdfDict(dict):
    ''' A PDF dictionary.

        Keys are PDF names (strings starting with '/').  Entries can
        also be reached as attributes: d.Foo is d.get('/Foo'), and
        d.Foo = x stores x under '/Foo'.  Storing None removes the
        entry.  The attributes indirect, stream and _stream are not
        dictionary entries; they live in the instance __dict__.
    '''
    indirect = False
    stream = None

    # Attribute name -> (name used in the instance __dict__,
    # whether assigning it also updates the /Length entry).
    _special = {'indirect': ('indirect', False),
                'stream': ('stream', True),
                '_stream': ('stream', False),
               }

    def __setitem__(self, name, value):
        assert name.startswith('/'), name
        if value is None:
            # Assigning None means "remove the entry, if present".
            if name in self:
                del self[name]
            return
        dict.__setitem__(self, name, value)

    def __init__(self, *args, **kw):
        if args:
            source = args
            if len(args) == 1:
                source = args[0]
            self.update(source)
            if isinstance(source, PdfDict):
                # Copy the non-dictionary state as well.  _stream is
                # used so that /Length is not recomputed.
                self.indirect = source.indirect
                self._stream = source.stream
        for attrname, attrvalue in kw.iteritems():
            setattr(self, attrname, attrvalue)

    def __getattr__(self, name):
        # Only reached for attributes not found normally; a missing
        # dictionary entry gives None.
        return self.get(PdfName(name))

    def __setattr__(self, name, value):
        special = self._special.get(name)
        if special is None:
            self[PdfName(name)] = value
            return
        realname, setlen = special
        self.__dict__[realname] = value
        if setlen:
            if value is None:
                self.Length = None
            else:
                self.Length = PdfObject(len(value))

    def iteritems(self):
        ''' Iterate over the (key, value) pairs whose value is not
            None.
        '''
        for key, value in dict.iteritems(self):
            if value is None:
                continue
            assert key.startswith('/'), (key, value)
            yield key, value

    def inheritable(self):
        ''' Search through ancestors as needed for inheritable
            dictionary items
        '''
        class Search(object):
            ''' Looks an entry up in a dictionary and then, while it
                is missing, in the chain of /Parent dictionaries.
            '''
            def __init__(self, basedict):
                self.basedict = basedict

            def __getattr__(self, name):
                return self[name]

            def __getitem__(self, name):
                seen = set()
                current = self.basedict
                while True:
                    found = getattr(current, name)
                    if found is not None:
                        return found
                    # Guard against a cycle in the Parent chain.
                    marker = id(current)
                    assert marker not in seen
                    seen.add(marker)
                    current = current.Parent
                    if current is None:
                        return

        return Search(self)
    inheritable = property(inheritable)

    def private(self):
        ''' Allows setting private metadata for use in
            processing (not sent to PDF file)
        '''
        class Private(object):
            pass

        # The helper shares this instance's __dict__, so attributes
        # set on it bypass __setattr__ above.
        proxy = Private()
        proxy.__dict__ = self.__dict__
        return proxy
    private = property(private)
181
class IndirectPdfDict(PdfDict):
    ''' A PdfDict whose indirect flag defaults to True. '''

    indirect = True
diff --git a/lib/pdfrw/pdfreader.py b/lib/pdfrw/pdfreader.py
deleted file mode 100644
index 6f57bea..0000000
--- a/lib/pdfrw/pdfreader.py
+++ /dev/null
@@ -1,213 +0,0 @@
1# A part of pdfrw (pdfrw.googlecode.com)
2# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
3# MIT license -- See LICENSE.txt for details
4
5'''
6The PdfReader class reads an entire PDF file into memory and
7parses the top-level container objects. (It does not parse
8into streams.) The object subclasses PdfDict, and the
9document pages are stored in a list in the pages attribute
10of the object.
11'''
12
13from pdftokens import PdfTokens
14from pdfobjects import PdfDict, PdfArray, PdfName
15from pdfcompress import uncompress
16
class PdfReader(PdfDict):
    ''' Reads an entire PDF file into memory and parses its
        top-level container objects (streams are not parsed).

        The instance itself holds the entries of the trailer
        dictionary.  The document pages are available as a list in
        the pages attribute, and numPages / getPage() are provided
        for compatibility with pyPdf.
    '''

    class unresolved:
        # Used as a placeholder until we have an object.
        pass

    def readindirect(self, objnum, gennum):
        ''' Read an indirect object.  If it has already
            been read, return it from the cache.

            objnum and gennum may be given as tokens; they are
            converted with int().  Each cache record is a list
            [file offset, object or the unresolved placeholder].
        '''

        def setobj(obj):
            # Store the new object in the dictionary
            # once we have its value
            record[1] = obj

        def ordinary(source, setobj, obj):
            # Deal with an ordinary (non-array, non-dict) object
            setobj(obj)
            return obj

        fdata, objnum, gennum = self.fdata, int(objnum), int(gennum)
        record = self.indirect_objects[fdata, objnum, gennum]
        if record[1] is not self.unresolved:
            return record[1]

        # Read the object header and validate it
        source = PdfTokens(fdata, record[0])
        objid = source.multiple(3)
        assert int(objid[0]) == objnum, objid
        assert int(objid[1]) == gennum, objid
        assert objid[2] == 'obj', objid

        # Read the object, and call special code if it starts
        # an array or dictionary
        obj = source.next()
        obj = self.special.get(obj, ordinary)(source, setobj, obj)
        self.readstream(obj, source)
        obj.indirect = True
        return obj

    def readstream(obj, source):
        ''' Read optional stream following a dictionary
            object.

            The stream data is stored on obj through its _stream
            attribute.  When the tokens found after /Length bytes
            are not 'endstream endobj', the end of the stream is
            located by searching for 'endstream' and obj.Length is
            corrected.
        '''
        tok = source.next()
        if tok == 'endobj':
            return  # No stream

        assert isinstance(obj, PdfDict)
        assert tok == 'stream', tok
        fdata = source.fdata
        # Find where the 'stream' keyword ends in the raw data; it is
        # followed by either LF or CR LF.
        floc = fdata.rindex(tok, 0, source.floc) + len(tok)
        ch = fdata[floc]
        if ch == '\r':
            floc += 1
            ch = fdata[floc]
        assert ch == '\n'
        startstream = floc + 1
        endstream = startstream + int(obj.Length)
        obj._stream = fdata[startstream:endstream]
        source = PdfTokens(fdata, endstream)
        endit = source.multiple(2)
        if endit != 'endstream endobj'.split():
            # /Length attribute is broken, try to read stream
            # anyway disregarding the specified value
            # TODO: issue warning here once we have some kind of
            # logging
            endstream = fdata.index('endstream', startstream)
            # Do not count the line ending in front of 'endstream'.
            if fdata[endstream-2:endstream] == '\r\n':
                endstream -= 2
            elif fdata[endstream-1] in ['\n', '\r']:
                endstream -= 1
            source = PdfTokens(fdata, endstream)
            endit = source.multiple(2)
            assert endit == 'endstream endobj'.split()
            obj.Length = str(endstream-startstream)
            obj._stream = fdata[startstream:endstream]
    readstream = staticmethod(readstream)

    def readarray(self, source, setobj=lambda x:None, original=None):
        ''' Read the rest of an array from source (the opening '['
            has already been consumed) and return it as a PdfArray.

            setobj is called with the new, still empty array before
            its items are read.  original is accepted so that this
            method can be called like the other object handlers; it
            is not used.
        '''
        special = self.special
        result = PdfArray()
        setobj(result)

        for value in source:
            if value == ']':
                break
            if value in special:
                value = special[value](source)
            elif value == 'R':
                # The two preceding items are the object number and
                # the generation of an indirect reference.
                generation = result.pop()
                value = self.readindirect(result.pop(), generation)
            result.append(value)
        return result

    def readdict(self, source, setobj=lambda x:None, original=None):
        ''' Read the rest of a dictionary from source (the opening
            '<<' has already been consumed) and return it as a
            PdfDict.

            setobj is called with the new, still empty dictionary
            before its entries are read.  original is accepted so
            that this method can be called like the other object
            handlers; it is not used.
        '''
        special = self.special
        result = PdfDict()
        setobj(result)

        tok = source.next()
        while tok != '>>':
            assert tok.startswith('/'), (tok, source.multiple(10))
            key = tok
            value = source.next()
            if value in special:
                value = special[value](source)
                tok = source.next()
            else:
                tok = source.next()
                if value.isdigit() and tok.isdigit():
                    # Two integers in a row have to be an indirect
                    # reference.  The token is fetched outside the
                    # assert so that it is consumed under -O too.
                    marker = source.next()
                    assert marker == 'R', marker
                    value = self.readindirect(value, tok)
                    tok = source.next()
            result[key] = value

        return result

    def readxref(fdata):
        ''' Locate the last 'startxref' in fdata and return its
            position together with a tokenizer positioned at the
            offset given there.
        '''
        startloc = fdata.rindex('startxref')
        xrefinfo = list(PdfTokens(fdata, startloc, False))
        assert len(xrefinfo) == 3, xrefinfo
        assert xrefinfo[0] == 'startxref', xrefinfo[0]
        assert xrefinfo[1].isdigit(), xrefinfo[1]
        assert xrefinfo[2].rstrip() == '%%EOF', repr(xrefinfo[2])
        return startloc, PdfTokens(fdata, int(xrefinfo[1]))
    readxref = staticmethod(readxref)

    def parsexref(self, source):
        ''' Parse the cross-reference table from source, up to and
            including the 'trailer' keyword, and register every
            in-use ('n') entry in indirect_objects.
        '''
        tok = source.next()
        assert tok == 'xref', tok
        while 1:
            tok = source.next()
            if tok == 'trailer':
                break
            startobj = int(tok)
            for objnum in range(startobj, startobj + int(source.next())):
                offset = int(source.next())
                generation = int(source.next())
                if source.next() == 'n':
                    objid = self.fdata, objnum, generation
                    objval = [offset, self.unresolved]
                    # setdefault keeps an entry that already exists.
                    self.indirect_objects.setdefault(objid, objval)

    pagename = PdfName.Page
    pagesname = PdfName.Pages

    def readpages(self, node):
        ''' Return the list of /Page dictionaries found at or below
            node.
        '''
        # PDFs can have arbitrarily nested Pages/Page
        # dictionary structures.
        if node.Type == self.pagename:
            return [node]
        assert node.Type == self.pagesname, node.Type
        result = []
        for node in node.Kids:
            result.extend(self.readpages(node))
        return result

    def __init__(self, fname=None, fdata=None, decompress=True):
        ''' Read and parse a PDF document.

            fname is a file name or an object with a read() method;
            alternatively the raw file contents can be passed as
            fdata.  Only one of the two may be given.  When
            decompress is true the streams of the objects read are
            decompressed where possible.
        '''

        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                f = open(fname, 'rb')
                # Close the file even when reading fails.
                try:
                    fdata = f.read()
                finally:
                    f.close()

        assert fdata is not None
        fdata = fdata.rstrip('\00')
        self.private.fdata = fdata

        self.private.indirect_objects = {}
        self.private.special = {'<<': self.readdict, '[': self.readarray}

        startloc, source = self.readxref(fdata)
        self.parsexref(source)
        # The tokens are fetched outside the asserts: an assert is
        # removed under -O, and the tokens still have to be consumed.
        tok = source.next()
        assert tok == '<<', tok
        self.update(self.readdict(source))
        tok = source.next()
        assert tok == 'startxref' and source.floc > startloc
        self.private.pages = self.readpages(self.Root.Pages)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        self.private.numPages = len(self.pages)


    # For compatibility with pyPdf
    def getPage(self, pagenum):
        return self.pages[pagenum]

    def uncompress(self):
        ''' Decompress the streams of the indirect objects known so
            far.
        '''
        uncompress([x[1] for x in self.indirect_objects.itervalues()])
diff --git a/lib/pdfrw/pdftokens.py b/lib/pdfrw/pdftokens.py
deleted file mode 100644
index 04bd559..0000000
--- a/lib/pdfrw/pdftokens.py
+++ /dev/null
@@ -1,249 +0,0 @@
1# A part of pdfrw (pdfrw.googlecode.com)
2# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
3# MIT license -- See LICENSE.txt for details
4
5'''
6A tokenizer for PDF streams.
7
8In general, documentation used was "PDF reference",
9sixth edition, for PDF version 1.7, dated November 2006.
10
11'''
12
13from __future__ import generators
14
15try:
16 set
17except NameError:
18 from sets import Set as set
19
20import re
21from pdfobjects import PdfString, PdfObject
22
23class _PrimitiveTokens(object):
24
25 # Table 3.1, page 50 of reference, defines whitespace
26 whitespaceset = set('\x00\t\n\f\r ')
27
28
29 # Text on page 50 defines delimiter characters
30 delimiterset = set('()<>{}[]/%')
31
32 # Coalesce contiguous whitespace into a single token
33 whitespace_pattern = '[%s]+' % ''.join(whitespaceset)
34
35 # In addition to the delimiters, we also use '\', which
36 # is special in some contexts in PDF.
37 delimiter_pattern = '\\\\|\\' + '|\\'.join(delimiterset)
38
39 # Dictionary delimiters are '<<' and '>>'. Look for
40 # these before the single variety.
41 dictdelim_pattern = r'\<\<|\>\>'
42
43 pattern = '(%s|%s|%s)' % (whitespace_pattern,
44 dictdelim_pattern, delimiter_pattern)
45 re_func = re.compile(pattern).finditer
46 del whitespace_pattern, dictdelim_pattern
47 del delimiter_pattern, pattern
48
49 def __init__(self, fdata):
50
51 class MyIterator(object):
52 def next():
53 if not tokens:
54 startloc = self.startloc
55 for match in next_match[0]:
56 start = match.start()
57 end = match.end()
58 tappend(fdata[start:end])
59 if start > startloc:
60 tappend(fdata[startloc:start])
61 self.startloc = end
62 break
63 else:
64 s = fdata[startloc:]
65 self.startloc = len(fdata)
66 if s:
67 tappend(s)
68 if not tokens:
69 raise StopIteration
70 return tpop()
71 next = staticmethod(next)
72
73 self.fdata = fdata
74 self.tokens = tokens = []
75 self.iterator = iterator = MyIterator()
76 self.next = iterator.next
77 self.next_match = next_match = [None]
78 tappend = tokens.append
79 tpop = tokens.pop
80
81 def setstart(self, startloc):
82 self.startloc = startloc
83 self.next_match[0] = self.re_func(self.fdata, startloc)
84
85 def __iter__(self):
86 return self.iterator
87
88 def coalesce(self, result):
89 ''' This function coalesces tokens together up until
90 the next delimiter or whitespace.
91 All of the coalesced tokens will either be non-matches,
92 or will be a matched backslash. We distinguish the
93 non-matches by the fact that next() will have left
94 a following match inside self.tokens for the actual match.
95 '''
96 tokens = self.tokens
97 whitespace = self.whitespaceset
98
99 # Optimized path for usual case -- regular data (not a name string),
100 # with no escape character, and followed by whitespace.
101
102 if tokens:
103 token = tokens.pop()
104 if token != '\\':
105 if token[0] not in whitespace:
106 tokens.append(token)
107 return
108 result.append(token)
109
110 # Non-optimized path. Either start of a name string received,
111 # or we just had one escape.
112
113 for token in self:
114 if tokens:
115 result.append(token)
116 token = tokens.pop()
117 if token != '\\':
118 if token[0] not in whitespace:
119 tokens.append(token)
120 return
121 result.append(token)
122
123
124 def floc(self):
125 return self.startloc - sum([len(x) for x in self.tokens])
126
class PdfTokens(object):
    ''' Tokenizer for PDF data, built on top of _PrimitiveTokens.

        Primitive tokens are combined into complete PDF tokens:
        strings become PdfString objects, names and other data
        become PdfObject objects, and the bracket style delimiters
        are passed through.  Whitespace is dropped, and comments are
        dropped as well unless strip_comments is false.
    '''

    def __init__(self, fdata, startloc=0, strip_comments=True):

        def comment(token):
            # Collect everything up to and including the whitespace
            # that contains the end of the line.  The result is false
            # (the comment is skipped) when comments are stripped.
            parts = [token]
            for token in primitive:
                parts.append(token)
                if token[0] in whitespaceset and ('\n' in token or '\r' in token):
                    break
            return not strip_comments and ''.join(parts)

        def single(token):
            # The token is complete as it is.
            return token

        def regular_string(token):
            def escaped():
                # The newest token is escaped when an odd number of
                # backslash tokens comes directly before it.
                odd = False
                index = -2
                while parts[index] == '\\':
                    odd = not odd
                    index -= 1
                return odd

            parts = [token]
            nestlevel = 1
            for token in primitive:
                parts.append(token)
                if token in '()' and not escaped():
                    # Unescaped parentheses nest; the string ends
                    # with the one that closes the first.
                    if token == '(':
                        nestlevel += 1
                    else:
                        nestlevel -= 1
                    if not nestlevel:
                        break
            else:
                assert 0, "Unexpected end of token stream"
            return PdfString(''.join(parts))

        def hex_string(token):
            parts = [token]
            for token in primitive:
                parts.append(token)
                if token == '>':
                    break
            # Move a '>>' that directly precedes the last token
            # behind it.
            while parts[-2] == '>>':
                parts.append(parts.pop(-2))
            return PdfString(''.join(parts))

        def normal_data(token):

            # Obscure optimization -- we can get here with
            # whitespace or regular character data.  If we get
            # here with whitespace, then there won't be an additional
            # token queued up in the primitive object, otherwise there
            # will...
            if not primitive_tokens: #if token[0] in whitespaceset:
                return None
            parts = [token]
            primitive.coalesce(parts)
            return PdfObject(''.join(parts))

        def name_string(token):
            parts = [token]
            primitive.coalesce(parts)
            token = ''.join(parts)
            if '#' in token:
                # Each '#' is followed by two hex digits that stand
                # for one character.
                pieces = token.split('#')
                decoded = [pieces[0]]
                for piece in pieces[1:]:
                    decoded.append(chr(int(piece[:2], 16)))
                    decoded.append(piece[2:])
                token = ''.join(decoded)
            return PdfObject(token)

        def broken(token):
            # A closing delimiter without its opening counterpart.
            assert 0, token

        # Handler for each primitive token that starts something
        # special; everything else is handled by normal_data.
        dispatch = {
            '(': regular_string,
            ')': broken,
            '<': hex_string,
            '>': broken,
            '[': single,
            ']': single,
            '{': single,
            '}': single,
            '/': name_string,
            '%' : comment,
            '<<': single,
            '>>': single,
        }.get

        class MyIterator(object):
            def next():
                while not tokens:
                    primitive_token = primitive_next()
                    handler = dispatch(primitive_token, normal_data)
                    token = handler(primitive_token)
                    # Handlers give a false value for input that is
                    # skipped (whitespace, stripped comments).
                    if token:
                        return token
                return tokens.pop()
            next = staticmethod(next)

        self.primitive = primitive = _PrimitiveTokens(fdata)
        self.setstart = primitive.setstart
        primitive.setstart(startloc)
        self.fdata = fdata
        self.strip_comments = strip_comments
        self.tokens = tokens = []
        self.iterator = iterator = MyIterator()
        self.next = iterator.next
        primitive_next = primitive.next
        primitive_tokens = primitive.tokens
        whitespaceset = _PrimitiveTokens.whitespaceset

    def floc(self):
        ''' Offset in the data of the next token to be read. '''
        queued_length = sum([len(item) for item in self.tokens])
        return self.primitive.floc() - queued_length
    floc = property(floc)

    def __iter__(self):
        return self.iterator

    def multiple(self, count):
        ''' Return the next count tokens as a list. '''
        fetch = self.next
        return [fetch() for dummy in range(count)]
diff --git a/lib/pdfrw/pdfwriter.py b/lib/pdfrw/pdfwriter.py
deleted file mode 100644
index c193843..0000000
--- a/lib/pdfrw/pdfwriter.py
+++ /dev/null
@@ -1,234 +0,0 @@
1#!/usr/bin/env python
2
3# A part of pdfrw (pdfrw.googlecode.com)
4# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
5# MIT license -- See LICENSE.txt for details
6
7'''
8The PdfWriter class writes an entire PDF file out to disk.
9
10The writing process is not at all optimized or organized.
11
12An instance of the PdfWriter class has two methods:
13 addpage(page)
14and
15 write(fname)
16
17addpage() assumes that the pages are part of a valid
18tree/forest of PDF objects.
19'''
20
21try:
22 set
23except NameError:
24 from sets import Set as set
25
26from pdfobjects import PdfName, PdfArray, PdfDict, IndirectPdfDict, PdfObject, PdfString
27from pdfcompress import compress
28
29debug = False
30
31class FormatObjects(object):
32 ''' FormatObjects performs the actual formatting and disk write.
33 '''
34
35 def add(self, obj, visited):
36 ''' Add an object to our list, if it's an indirect
37 object. Just format it if not.
38 '''
39 # Can't hash dicts, so just hash the object ID
40 objid = id(obj)
41
42 # Automatically set stream objects to indirect
43 if isinstance(obj, PdfDict):
44 indirect = obj.indirect or (obj.stream is not None)
45 else:
46 indirect = getattr(obj, 'indirect', False)
47
48 if not indirect:
49 assert objid not in visited, \
50 'Circular reference encountered in non-indirect object %s' % repr(obj)
51 visited.add(objid)
52 result = self.format_obj(obj, visited)
53 visited.remove(objid)
54 return result
55
56 objnum = self.indirect_dict.get(objid)
57
58 # If we haven't seen the object yet, we need to
59 # add it to the indirect object list.
60 if objnum is None:
61 objlist = self.objlist
62 objnum = len(objlist) + 1
63 if debug:
64 print ' Object', objnum, '\r',
65 objlist.append(None)
66 self.indirect_dict[objid] = objnum
67 objlist[objnum-1] = self.format_obj(obj)
68 return '%s 0 R' % objnum
69
70 def format_array(myarray, formatter):
71 # Format array data into semi-readable ASCII
72 if sum([len(x) for x in myarray]) <= 70:
73 return formatter % ' '.join(myarray)
74 bigarray = []
75 count = 1000000
76 for x in myarray:
77 lenx = len(x)
78 if lenx + count > 70:
79 subarray = []
80 bigarray.append(subarray)
81 count = 0
82 count += lenx + 1
83 subarray.append(x)
84 return formatter % '\n '.join([' '.join(x) for x in bigarray])
85 format_array = staticmethod(format_array)
86
87 def format_obj(self, obj, visited=None):
88 ''' format PDF object data into semi-readable ASCII.
89 May mutually recurse with add() -- add() will
90 return references for indirect objects, and add
91 the indirect object to the list.
92 '''
93 if visited is None:
94 visited = set()
95 if isinstance(obj, PdfArray):
96 myarray = [self.add(x, visited) for x in obj]
97 return self.format_array(myarray, '[%s]')
98 elif isinstance(obj, PdfDict):
99 if self.compress and obj.stream:
100 compress([obj])
101 myarray = []
102 # Jython 2.2.1 has a bug which segfaults when
103 # sorting subclassed strings, so we un-subclass them.
104 dictkeys = [str(x) for x in obj.iterkeys()]
105 dictkeys.sort()
106 for key in dictkeys:
107 myarray.append(key)
108 myarray.append(self.add(obj[key], visited))
109 result = self.format_array(myarray, '<<%s>>')
110 stream = obj.stream
111 if stream is not None:
112 result = '%s\nstream\n%s\nendstream' % (result, stream)
113 return result
114 elif isinstance(obj, basestring) and not hasattr(obj, 'indirect'):
115 return PdfString.encode(obj)
116 else:
117 return str(obj)
118
119 def dump(cls, f, trailer, version='1.3', compress=True):
120 self = cls()
121 self.compress = compress
122 self.indirect_dict = {}
123 self.objlist = []
124
125 # The first format of trailer gets all the information,
126 # but we throw away the actual trailer formatting.
127 self.format_obj(trailer)
128 # Now we know the size, so we update the trailer dict
129 # and get the formatted data.
130 trailer.Size = PdfObject(len(self.objlist) + 1)
131 trailer = self.format_obj(trailer)
132
133 # Now we have all the pieces to write out to the file.
134 # Keep careful track of the counts while we do it so
135 # we can correctly build the cross-reference.
136
137 header = '%%PDF-%s\n%%\xe2\xe3\xcf\xd3\n' % version
138 f.write(header)
139 offset = len(header)
140 offsets = [(0, 65535, 'f')]
141
142 for i, x in enumerate(self.objlist):
143 objstr = '%s 0 obj\n%s\nendobj\n' % (i + 1, x)
144 offsets.append((offset, 0, 'n'))
145 offset += len(objstr)
146 f.write(objstr)
147
148 f.write('xref\n0 %s\n' % len(offsets))
149 for x in offsets:
150 f.write('%010d %05d %s\r\n' % x)
151 f.write('trailer\n\n%s\nstartxref\n%s\n%%%%EOF\n' % (trailer, offset))
152 dump = classmethod(dump)
153
class PdfWriter(object):
    ''' Collects pages and writes them out as a PDF file.

        Pages are added with addpage() / addpages() and the file is
        produced by write().  The trailer property builds a basic
        trailer around the collected pages on demand; it can also be
        assigned to.
    '''

    _trailer = None

    def __init__(self, version='1.3', compress=True):
        ''' version is the PDF version written to the file header;
            compress tells whether streams are compressed on write.
        '''
        self.pagearray = PdfArray()
        self.compress = compress
        self.version = version

    def addpage(self, page):
        ''' Append a copy of page to the document and return self.

            page must be a /Page dictionary.  The copy receives the
            Resources, MediaBox, CropBox and Rotate values looked up
            through page.inheritable.
        '''
        # A cached trailer does not know about the new page.
        self._trailer = None
        assert page.Type == PdfName.Page
        inheritable = page.inheritable # searches for resources
        self.pagearray.append(
            IndirectPdfDict(
                page,
                Resources = inheritable.Resources,
                MediaBox = inheritable.MediaBox,
                CropBox = inheritable.CropBox,
                Rotate = inheritable.Rotate,
            )
        )
        return self

    addPage = addpage # for compatibility with pyPdf

    def addpages(self, pagelist):
        ''' Add every page in pagelist and return self. '''
        for page in pagelist:
            self.addpage(page)
        return self

    def _get_trailer(self):
        trailer = self._trailer
        if trailer is not None:
            return trailer

        # Create the basic object structure of the PDF file
        trailer = PdfDict(
            Root = IndirectPdfDict(
                Type = PdfName.Catalog,
                Pages = IndirectPdfDict(
                    Type = PdfName.Pages,
                    Count = PdfObject(len(self.pagearray)),
                    Kids = self.pagearray
                )
            )
        )
        # Make all the pages point back to the page dictionary
        pagedict = trailer.Root.Pages
        for page in pagedict.Kids:
            page.Parent = pagedict
        self._trailer = trailer
        return trailer

    def _set_trailer(self, trailer):
        self._trailer = trailer

    trailer = property(_get_trailer, _set_trailer)

    def write(self, fname, trailer=None):
        ''' Write the document.

            fname is a file name or an object with a write() method.
            A file opened here is closed again, also when writing
            fails; a file object passed in is left open.  trailer
            replaces the writer's own trailer when it is given and
            true.
        '''
        trailer = trailer or self.trailer

        # Dump the data.  We either have a filename or a preexisting
        # file object.
        preexisting = hasattr(fname, 'write')
        if preexisting:
            f = fname
        else:
            f = open(fname, 'wb')
        try:
            FormatObjects.dump(f, trailer, self.version, self.compress)
        finally:
            if not preexisting:
                f.close()
223
224if __name__ == '__main__':
225 debug = True
226 import pdfreader
227 x = pdfreader.PdfReader('source.pdf')
228 y = PdfWriter()
229 for i, page in enumerate(x.pages):
230 print ' Adding page', i+1, '\r',
231 y.addpage(page)
232 print
233 y.write('result.pdf')
234 print