diff options
Diffstat (limited to 'lib/pdfrw/pdfwriter.py')
| -rw-r--r-- | lib/pdfrw/pdfwriter.py | 234 |
1 files changed, 0 insertions, 234 deletions
diff --git a/lib/pdfrw/pdfwriter.py b/lib/pdfrw/pdfwriter.py deleted file mode 100644 index c193843..0000000 --- a/lib/pdfrw/pdfwriter.py +++ /dev/null | |||
| @@ -1,234 +0,0 @@ | |||
| 1 | #!/usr/bin/env python | ||
| 2 | |||
| 3 | # A part of pdfrw (pdfrw.googlecode.com) | ||
| 4 | # Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas | ||
| 5 | # MIT license -- See LICENSE.txt for details | ||
| 6 | |||
| 7 | ''' | ||
| 8 | The PdfWriter class writes an entire PDF file out to disk. | ||
| 9 | |||
| 10 | The writing process is not at all optimized or organized. | ||
| 11 | |||
| 12 | An instance of the PdfWriter class has two methods: | ||
| 13 | addpage(page) | ||
| 14 | and | ||
| 15 | write(fname) | ||
| 16 | |||
| 17 | addpage() assumes that the pages are part of a valid | ||
| 18 | tree/forest of PDF objects. | ||
| 19 | ''' | ||
| 20 | |||
| 21 | try: | ||
| 22 | set | ||
| 23 | except NameError: | ||
| 24 | from sets import Set as set | ||
| 25 | |||
| 26 | from pdfobjects import PdfName, PdfArray, PdfDict, IndirectPdfDict, PdfObject, PdfString | ||
| 27 | from pdfcompress import compress | ||
| 28 | |||
# Module-wide switch: when true, FormatObjects.add() prints a running
# count of the indirect objects it numbers.  The __main__ block at the
# bottom of this file turns it on.
debug = False
| 30 | |||
class FormatObjects(object):
    ''' FormatObjects performs the actual formatting and disk write.

        The only entry point is the dump() classmethod, which creates
        a throwaway instance carrying three attributes:

            compress      -- true if stream dictionaries should be
                             passed to compress() before formatting
            indirect_dict -- maps id(obj) to the object number that
                             was assigned to that indirect object
            objlist       -- formatted text of every indirect object;
                             object number N lives at index N - 1
    '''

    def add(self, obj, visited):
        ''' Add an object to our list, if it's an indirect
            object.  Just format it if not.

            obj     -- the object to emit
            visited -- set of id()s of the direct (non-indirect)
                       containers currently being formatted

            Returns the formatted text of a direct object, or an
            'N 0 R' reference string for an indirect object.

            Raises AssertionError if a direct object contains itself,
            since that could never be written out.
        '''
        # Can't hash dicts, so just hash the object ID
        objid = id(obj)

        # Automatically set stream objects to indirect
        if isinstance(obj, PdfDict):
            indirect = obj.indirect or (obj.stream is not None)
        else:
            indirect = getattr(obj, 'indirect', False)

        if not indirect:
            # Raised explicitly instead of using an assert statement,
            # so the check is still made under "python -O".  Without
            # it a cycle of direct objects recurses without bound.
            if objid in visited:
                raise AssertionError(
                    'Circular reference encountered in non-indirect object %s'
                    % repr(obj))
            visited.add(objid)
            try:
                result = self.format_obj(obj, visited)
            finally:
                # Leave the caller's set as we found it, even when
                # formatting fails part way through.
                visited.remove(objid)
            return result

        objnum = self.indirect_dict.get(objid)

        # If we haven't seen the object yet, we need to
        # add it to the indirect object list.
        if objnum is None:
            objlist = self.objlist
            objnum = len(objlist) + 1
            if debug:
                # Progress indicator; the trailing '\r' makes each
                # count overwrite the previous one on a terminal.
                import sys
                sys.stdout.write(' Object %s \r' % objnum)
            # Reserve the slot and record the number *before*
            # recursing, so that references back to this object found
            # while formatting it resolve to the same number.
            objlist.append(None)
            self.indirect_dict[objid] = objnum
            # An indirect object starts a fresh 'visited' set.
            objlist[objnum-1] = self.format_obj(obj)
        return '%s 0 R' % objnum

    def format_array(myarray, formatter):
        ''' Format array data into semi-readable ASCII.

            myarray   -- sequence of already-formatted strings
            formatter -- format string with a single %s, e.g. '[%s]'

            If the combined length of the items (separators not
            counted) is at most 70 they are joined on one line.
            Otherwise they are broken into several lines, a new line
            being started whenever the next item would take the
            running count past 70.
        '''
        if sum([len(x) for x in myarray]) <= 70:
            return formatter % ' '.join(myarray)
        bigarray = []
        # Huge initial count forces the first item to open a line.
        count = 1000000
        for x in myarray:
            lenx = len(x)
            if lenx + count > 70:
                subarray = []
                bigarray.append(subarray)
                count = 0
            count += lenx + 1
            subarray.append(x)
        return formatter % '\n '.join([' '.join(x) for x in bigarray])
    format_array = staticmethod(format_array)

    def format_obj(self, obj, visited=None):
        ''' format PDF object data into semi-readable ASCII.
            May mutually recurse with add() -- add() will
            return references for indirect objects, and add
            the indirect object to the list.

            obj     -- the object to format
            visited -- set of id()s of enclosing direct objects;
                       a new empty set is used when omitted

            Returns the formatted string.  For a dictionary with a
            stream, the stream data follows the dictionary between
            'stream' and 'endstream' lines.
        '''
        if visited is None:
            visited = set()
        if isinstance(obj, PdfArray):
            myarray = [self.add(x, visited) for x in obj]
            return self.format_array(myarray, '[%s]')
        elif isinstance(obj, PdfDict):
            if self.compress and obj.stream:
                compress([obj])
            myarray = []
            # Jython 2.2.1 has a bug which segfaults when
            # sorting subclassed strings, so we un-subclass them.
            dictkeys = [str(x) for x in obj.iterkeys()]
            dictkeys.sort()
            # Keys and formatted values alternate in one flat list.
            for key in dictkeys:
                myarray.append(key)
                myarray.append(self.add(obj[key], visited))
            result = self.format_array(myarray, '<<%s>>')
            stream = obj.stream
            if stream is not None:
                result = '%s\nstream\n%s\nendstream' % (result, stream)
            return result
        elif isinstance(obj, basestring) and not hasattr(obj, 'indirect'):
            # A plain string with no 'indirect' attribute is encoded
            # as a PDF string.
            return PdfString.encode(obj)
        else:
            return str(obj)

    def dump(cls, f, trailer, version='1.3', compress=True):
        ''' Write a complete PDF file to f.

            f        -- object with a write() method
            trailer  -- trailer dictionary at the root of the object
                        graph.  NOTE: its Size attribute is set here,
                        so the caller's object is modified.
            version  -- version string placed in the '%PDF-' header
            compress -- true to pass stream dictionaries to compress()
                        before they are formatted

            Offsets are counted with len() on the strings written, so
            this presumably expects byte strings written to a file
            opened in binary mode -- TODO confirm.
        '''
        self = cls()
        self.compress = compress
        self.indirect_dict = {}
        self.objlist = []

        # The first format of trailer gets all the information,
        # but we throw away the actual trailer formatting.
        self.format_obj(trailer)
        # Now we know the size, so we update the trailer dict
        # and get the formatted data.
        trailer.Size = PdfObject(len(self.objlist) + 1)
        trailer = self.format_obj(trailer)

        # Now we have all the pieces to write out to the file.
        # Keep careful track of the counts while we do it so
        # we can correctly build the cross-reference.

        # Second header line is a comment made of four high-bit bytes.
        header = '%%PDF-%s\n%%\xe2\xe3\xcf\xd3\n' % version
        f.write(header)
        offset = len(header)
        # Entry 0 of the cross-reference table is the free-list head.
        offsets = [(0, 65535, 'f')]

        for i, x in enumerate(self.objlist):
            objstr = '%s 0 obj\n%s\nendobj\n' % (i + 1, x)
            offsets.append((offset, 0, 'n'))
            offset += len(objstr)
            f.write(objstr)

        # 'offset' now points at the 'xref' keyword, which is the
        # value written after 'startxref' below.
        f.write('xref\n0 %s\n' % len(offsets))
        for x in offsets:
            # Fixed-width entries: 10-digit offset, 5-digit generation.
            f.write('%010d %05d %s\r\n' % x)
        f.write('trailer\n\n%s\nstartxref\n%s\n%%%%EOF\n' % (trailer, offset))
    dump = classmethod(dump)
| 153 | |||
class PdfWriter(object):
    ''' Collects pages and writes them out as a new PDF file.

        Pages are gathered with addpage() / addpages().  The trailer
        (document root) is built on first use from the gathered pages
        and cached; adding another page discards the cache.  write()
        hands the trailer to FormatObjects.dump().
    '''

    # Cached trailer; None means "rebuild from pagearray on next use".
    _trailer = None

    def __init__(self, version='1.3', compress=True):
        ''' version  -- version string for the file header
            compress -- passed through to FormatObjects.dump()
        '''
        self.pagearray = PdfArray()
        self.compress = compress
        self.version = version

    def addpage(self, page):
        ''' Append one page and return self so calls can be chained.

            The page is wrapped in a new IndirectPdfDict carrying
            explicit Resources, MediaBox, CropBox and Rotate values
            taken from page.inheritable.
        '''
        # The cached trailer no longer matches the page list.
        self._trailer = None
        assert page.Type == PdfName.Page
        inheritable = page.inheritable # searches for resources
        self.pagearray.append(
            IndirectPdfDict(
                page,
                Resources = inheritable.Resources,
                MediaBox = inheritable.MediaBox,
                CropBox = inheritable.CropBox,
                Rotate = inheritable.Rotate,
            )
        )
        return self

    addPage = addpage # for compatibility with pyPdf

    def addpages(self, pagelist):
        ''' Append every page in pagelist, in order; returns self. '''
        for page in pagelist:
            self.addpage(page)
        return self

    def _get_trailer(self):
        ''' Return the cached trailer, building it first if needed. '''
        trailer = self._trailer
        if trailer is not None:
            return trailer

        # Create the basic object structure of the PDF file
        trailer = PdfDict(
            Root = IndirectPdfDict(
                Type = PdfName.Catalog,
                Pages = IndirectPdfDict(
                    Type = PdfName.Pages,
                    Count = PdfObject(len(self.pagearray)),
                    Kids = self.pagearray
                )
            )
        )
        # Make all the pages point back to the page dictionary
        pagedict = trailer.Root.Pages
        for page in pagedict.Kids:
            page.Parent = pagedict
        self._trailer = trailer
        return trailer

    def _set_trailer(self, trailer):
        ''' Install a caller-supplied trailer in place of the built one. '''
        self._trailer = trailer

    trailer = property(_get_trailer, _set_trailer)

    def write(self, fname, trailer=None):
        ''' Write the document out.

            fname   -- a filename, or any object with a write() method
            trailer -- trailer to write; self.trailer is used when
                       this is omitted (or otherwise false)

            A file opened here is always closed again, even if the
            dump fails.  A file object passed in by the caller is
            left open.
        '''
        trailer = trailer or self.trailer

        # Dump the data.  We either have a filename or a preexisting
        # file object.  An explicit if/else is used rather than the
        # "x and a or b" idiom, which would wrongly fall through to
        # open() for a file-like object that evaluates false.
        preexisting = hasattr(fname, 'write')
        if preexisting:
            f = fname
        else:
            f = open(fname, 'wb')
        try:
            FormatObjects.dump(f, trailer, self.version, self.compress)
        finally:
            if not preexisting:
                f.close()
| 223 | |||
if __name__ == '__main__':
    # Simple smoke test: copy every page of one PDF into a new file.
    #
    #   usage: pdfwriter.py [source [result]]
    #
    # Both names are optional and default to 'source.pdf' and
    # 'result.pdf' in the current directory.
    import sys
    import pdfreader

    debug = True

    srcname = 'source.pdf'
    dstname = 'result.pdf'
    args = sys.argv[1:]
    if len(args) > 0:
        srcname = args[0]
    if len(args) > 1:
        dstname = args[1]

    reader = pdfreader.PdfReader(srcname)
    writer = PdfWriter()
    for i, page in enumerate(reader.pages):
        # Trailing '\r' keeps the progress count on a single line.
        sys.stdout.write(' Adding page %s \r' % (i + 1))
        writer.addpage(page)

    writer.write(dstname)
| 234 | |||
