diff options
| author | jvoisin | 2011-08-16 18:11:24 +0200 |
|---|---|---|
| committer | jvoisin | 2011-08-16 18:11:24 +0200 |
| commit | 4bd3e47da02fde08acfada1795cc55170abdb00a (patch) | |
| tree | f8c7aa5fd5e1b07a28b350c5ded8125ef2467c51 /lib/pdfrw | |
| parent | baf8e080125614326ba9c96ca8f2404fd12b050e (diff) | |
setup.py now works !
Diffstat (limited to 'lib/pdfrw')
| -rw-r--r-- | lib/pdfrw/__init__.py | 14 | ||||
| -rw-r--r-- | lib/pdfrw/pdfcompress.py | 57 | ||||
| -rw-r--r-- | lib/pdfrw/pdfobjects.py | 183 | ||||
| -rw-r--r-- | lib/pdfrw/pdfreader.py | 213 | ||||
| -rw-r--r-- | lib/pdfrw/pdftokens.py | 249 | ||||
| -rw-r--r-- | lib/pdfrw/pdfwriter.py | 234 |
6 files changed, 0 insertions, 950 deletions
diff --git a/lib/pdfrw/__init__.py b/lib/pdfrw/__init__.py deleted file mode 100644 index 26e8c73..0000000 --- a/lib/pdfrw/__init__.py +++ /dev/null | |||
| @@ -1,14 +0,0 @@ | |||
| 1 | # A part of pdfrw (pdfrw.googlecode.com) | ||
| 2 | # Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas | ||
| 3 | # MIT license -- See LICENSE.txt for details | ||
| 4 | |||
| 5 | from pdfwriter import PdfWriter | ||
| 6 | from pdfreader import PdfReader | ||
| 7 | from pdfobjects import PdfObject, PdfName, PdfArray, PdfDict, IndirectPdfDict, PdfString | ||
| 8 | from pdftokens import PdfTokens | ||
| 9 | |||
| 10 | # Add a tiny bit of compatibility to pyPdf | ||
| 11 | |||
| 12 | PdfFileReader = PdfReader | ||
| 13 | PdfFileWriter = PdfWriter | ||
| 14 | |||
diff --git a/lib/pdfrw/pdfcompress.py b/lib/pdfrw/pdfcompress.py deleted file mode 100644 index 1c11970..0000000 --- a/lib/pdfrw/pdfcompress.py +++ /dev/null | |||
| @@ -1,57 +0,0 @@ | |||
| 1 | # A part of pdfrw (pdfrw.googlecode.com) | ||
| 2 | # Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas | ||
| 3 | # MIT license -- See LICENSE.txt for details | ||
| 4 | |||
| 5 | ''' | ||
| 6 | Currently, this sad little file only knows how to decompress | ||
| 7 | using the flate (zlib) algorithm. Maybe more later, but it's | ||
| 8 | not a priority for me... | ||
| 9 | ''' | ||
| 10 | |||
| 11 | from __future__ import generators | ||
| 12 | |||
| 13 | try: | ||
| 14 | set | ||
| 15 | except NameError: | ||
| 16 | from sets import Set as set | ||
| 17 | |||
| 18 | import zlib | ||
| 19 | from pdfobjects import PdfDict, PdfName | ||
| 20 | |||
| 21 | |||
| 22 | def streamobjects(mylist): | ||
| 23 | for obj in mylist: | ||
| 24 | if isinstance(obj, PdfDict) and obj.stream is not None: | ||
| 25 | yield obj | ||
| 26 | |||
| 27 | def uncompress(mylist, warnings=set()): | ||
| 28 | flate = PdfName.FlateDecode | ||
| 29 | for obj in streamobjects(mylist): | ||
| 30 | ftype = obj.Filter | ||
| 31 | if ftype is None: | ||
| 32 | continue | ||
| 33 | if isinstance(ftype, list) and len(ftype) == 1: | ||
| 34 | # todo: multiple filters | ||
| 35 | ftype = ftype[0] | ||
| 36 | parms = obj.DecodeParms | ||
| 37 | if ftype != flate or parms is not None: | ||
| 38 | msg = 'Not decompressing: cannot use filter %s with parameters %s' % (repr(ftype), repr(parms)) | ||
| 39 | if msg not in warnings: | ||
| 40 | warnings.add(msg) | ||
| 41 | print msg | ||
| 42 | else: | ||
| 43 | obj.stream = zlib.decompress(obj.stream) | ||
| 44 | obj.Filter = None | ||
| 45 | |||
| 46 | def compress(mylist): | ||
| 47 | flate = PdfName.FlateDecode | ||
| 48 | for obj in streamobjects(mylist): | ||
| 49 | ftype = obj.Filter | ||
| 50 | if ftype is not None: | ||
| 51 | continue | ||
| 52 | oldstr = obj.stream | ||
| 53 | newstr = zlib.compress(oldstr) | ||
| 54 | if len(newstr) < len(oldstr) + 30: | ||
| 55 | obj.stream = newstr | ||
| 56 | obj.Filter = flate | ||
| 57 | obj.DecodeParms = None | ||
diff --git a/lib/pdfrw/pdfobjects.py b/lib/pdfrw/pdfobjects.py deleted file mode 100644 index 08ad825..0000000 --- a/lib/pdfrw/pdfobjects.py +++ /dev/null | |||
| @@ -1,183 +0,0 @@ | |||
| 1 | # A part of pdfrw (pdfrw.googlecode.com) | ||
| 2 | # Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas | ||
| 3 | # MIT license -- See LICENSE.txt for details | ||
| 4 | |||
| 5 | ''' | ||
| 6 | Objects that can occur in PDF files. The most important | ||
| 7 | objects are arrays and dicts. Either of these can be | ||
| 8 | indirect or not, and dicts could have an associated | ||
| 9 | stream. | ||
| 10 | ''' | ||
| 11 | from __future__ import generators | ||
| 12 | |||
| 13 | try: | ||
| 14 | set | ||
| 15 | except NameError: | ||
| 16 | from sets import Set as set | ||
| 17 | |||
| 18 | import re | ||
| 19 | |||
| 20 | class PdfObject(str): | ||
| 21 | indirect = False | ||
| 22 | |||
| 23 | class PdfArray(list): | ||
| 24 | indirect = False | ||
| 25 | |||
| 26 | class PdfName(object): | ||
| 27 | def __getattr__(self, name): | ||
| 28 | return self(name) | ||
| 29 | def __call__(self, name): | ||
| 30 | return PdfObject('/' + name) | ||
| 31 | |||
| 32 | PdfName = PdfName() | ||
| 33 | |||
| 34 | class PdfString(str): | ||
| 35 | indirect = False | ||
| 36 | unescape_dict = {'\\b':'\b', '\\f':'\f', '\\n':'\n', | ||
| 37 | '\\r':'\r', '\\t':'\t', | ||
| 38 | '\\\r\n': '', '\\\r':'', '\\\n':'', | ||
| 39 | '\\\\':'\\', '\\':'', | ||
| 40 | } | ||
| 41 | unescape_pattern = r'(\\b|\\f|\\n|\\r|\\t|\\\r\n|\\\r|\\\n|\\[0-9]+|\\)' | ||
| 42 | unescape_func = re.compile(unescape_pattern).split | ||
| 43 | |||
| 44 | hex_pattern = '([a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])' | ||
| 45 | hex_func = re.compile(hex_pattern).split | ||
| 46 | |||
| 47 | hex_pattern2 = '([a-fA-F0-9][a-fA-F0-9][a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])' | ||
| 48 | hex_func2 = re.compile(hex_pattern2).split | ||
| 49 | |||
| 50 | hex_funcs = hex_func, hex_func2 | ||
| 51 | |||
| 52 | indirect = False | ||
| 53 | |||
| 54 | def decode_regular(self, remap=chr): | ||
| 55 | assert self[0] == '(' and self[-1] == ')' | ||
| 56 | mylist = self.unescape_func(self[1:-1]) | ||
| 57 | result = [] | ||
| 58 | unescape = self.unescape_dict.get | ||
| 59 | for chunk in mylist: | ||
| 60 | chunk = unescape(chunk, chunk) | ||
| 61 | if chunk.startswith('\\') and len(chunk) > 1: | ||
| 62 | value = int(chunk[1:], 8) | ||
| 63 | # FIXME: TODO: Handle unicode here | ||
| 64 | if value > 127: | ||
| 65 | value = 127 | ||
| 66 | chunk = remap(value) | ||
| 67 | if chunk: | ||
| 68 | result.append(chunk) | ||
| 69 | return ''.join(result) | ||
| 70 | |||
| 71 | def decode_hex(self, remap=chr, twobytes=False): | ||
| 72 | data = ''.join(self.split()) | ||
| 73 | data = self.hex_funcs[twobytes](data) | ||
| 74 | chars = data[1::2] | ||
| 75 | other = data[0::2] | ||
| 76 | assert other[0] == '<' and other[-1] == '>' and ''.join(other) == '<>', self | ||
| 77 | return ''.join([remap(int(x, 16)) for x in chars]) | ||
| 78 | |||
| 79 | def decode(self, remap=chr, twobytes=False): | ||
| 80 | if self.startswith('('): | ||
| 81 | return self.decode_regular(remap) | ||
| 82 | |||
| 83 | else: | ||
| 84 | return self.decode_hex(remap, twobytes) | ||
| 85 | |||
| 86 | def encode(cls, source, usehex=False): | ||
| 87 | assert not usehex, "Not supported yet" | ||
| 88 | if isinstance(source, unicode): | ||
| 89 | source = source.encode('utf-8') | ||
| 90 | else: | ||
| 91 | source = str(source) | ||
| 92 | source = source.replace('\\', '\\\\') | ||
| 93 | source = source.replace('(', '\\(') | ||
| 94 | source = source.replace(')', '\\)') | ||
| 95 | return cls('(' +source + ')') | ||
| 96 | encode = classmethod(encode) | ||
| 97 | |||
| 98 | class PdfDict(dict): | ||
| 99 | indirect = False | ||
| 100 | stream = None | ||
| 101 | |||
| 102 | _special = dict(indirect = ('indirect', False), | ||
| 103 | stream = ('stream', True), | ||
| 104 | _stream = ('stream', False), | ||
| 105 | ) | ||
| 106 | |||
| 107 | def __setitem__(self, name, value): | ||
| 108 | assert name.startswith('/'), name | ||
| 109 | if value is not None: | ||
| 110 | dict.__setitem__(self, name, value) | ||
| 111 | elif name in self: | ||
| 112 | del self[name] | ||
| 113 | |||
| 114 | def __init__(self, *args, **kw): | ||
| 115 | if args: | ||
| 116 | if len(args) == 1: | ||
| 117 | args = args[0] | ||
| 118 | self.update(args) | ||
| 119 | if isinstance(args, PdfDict): | ||
| 120 | self.indirect = args.indirect | ||
| 121 | self._stream = args.stream | ||
| 122 | for key, value in kw.iteritems(): | ||
| 123 | setattr(self, key, value) | ||
| 124 | |||
| 125 | def __getattr__(self, name): | ||
| 126 | return self.get(PdfName(name)) | ||
| 127 | |||
| 128 | def __setattr__(self, name, value): | ||
| 129 | info = self._special.get(name) | ||
| 130 | if info is None: | ||
| 131 | self[PdfName(name)] = value | ||
| 132 | else: | ||
| 133 | name, setlen = info | ||
| 134 | self.__dict__[name] = value | ||
| 135 | if setlen: | ||
| 136 | notnone = value is not None | ||
| 137 | self.Length = notnone and PdfObject(len(value)) or None | ||
| 138 | |||
| 139 | def iteritems(self): | ||
| 140 | for key, value in dict.iteritems(self): | ||
| 141 | if value is not None: | ||
| 142 | assert key.startswith('/'), (key, value) | ||
| 143 | yield key, value | ||
| 144 | |||
| 145 | def inheritable(self): | ||
| 146 | ''' Search through ancestors as needed for inheritable | ||
| 147 | dictionary items | ||
| 148 | ''' | ||
| 149 | class Search(object): | ||
| 150 | def __init__(self, basedict): | ||
| 151 | self.basedict = basedict | ||
| 152 | def __getattr__(self, name): | ||
| 153 | return self[name] | ||
| 154 | def __getitem__(self, name): | ||
| 155 | visited = set() | ||
| 156 | mydict = self.basedict | ||
| 157 | while 1: | ||
| 158 | value = getattr(mydict, name) | ||
| 159 | if value is not None: | ||
| 160 | return value | ||
| 161 | myid = id(mydict) | ||
| 162 | assert myid not in visited | ||
| 163 | visited.add(myid) | ||
| 164 | mydict = mydict.Parent | ||
| 165 | if mydict is None: | ||
| 166 | return | ||
| 167 | return Search(self) | ||
| 168 | inheritable = property(inheritable) | ||
| 169 | |||
| 170 | def private(self): | ||
| 171 | ''' Allows setting private metadata for use in | ||
| 172 | processing (not sent to PDF file) | ||
| 173 | ''' | ||
| 174 | class Private(object): | ||
| 175 | pass | ||
| 176 | |||
| 177 | result = Private() | ||
| 178 | result.__dict__ = self.__dict__ | ||
| 179 | return result | ||
| 180 | private = property(private) | ||
| 181 | |||
| 182 | class IndirectPdfDict(PdfDict): | ||
| 183 | indirect = True | ||
diff --git a/lib/pdfrw/pdfreader.py b/lib/pdfrw/pdfreader.py deleted file mode 100644 index 6f57bea..0000000 --- a/lib/pdfrw/pdfreader.py +++ /dev/null | |||
| @@ -1,213 +0,0 @@ | |||
| 1 | # A part of pdfrw (pdfrw.googlecode.com) | ||
| 2 | # Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas | ||
| 3 | # MIT license -- See LICENSE.txt for details | ||
| 4 | |||
| 5 | ''' | ||
| 6 | The PdfReader class reads an entire PDF file into memory and | ||
| 7 | parses the top-level container objects. (It does not parse | ||
| 8 | into streams.) The object subclasses PdfDict, and the | ||
| 9 | document pages are stored in a list in the pages attribute | ||
| 10 | of the object. | ||
| 11 | ''' | ||
| 12 | |||
| 13 | from pdftokens import PdfTokens | ||
| 14 | from pdfobjects import PdfDict, PdfArray, PdfName | ||
| 15 | from pdfcompress import uncompress | ||
| 16 | |||
| 17 | class PdfReader(PdfDict): | ||
| 18 | |||
| 19 | class unresolved: | ||
| 20 | # Used as a placeholder until we have an object. | ||
| 21 | pass | ||
| 22 | |||
| 23 | def readindirect(self, objnum, gennum): | ||
| 24 | ''' Read an indirect object. If it has already | ||
| 25 | been read, return it from the cache. | ||
| 26 | ''' | ||
| 27 | |||
| 28 | def setobj(obj): | ||
| 29 | # Store the new object in the dictionary | ||
| 30 | # once we have its value | ||
| 31 | record[1] = obj | ||
| 32 | |||
| 33 | def ordinary(source, setobj, obj): | ||
| 34 | # Deal with an ordinary (non-array, non-dict) object | ||
| 35 | setobj(obj) | ||
| 36 | return obj | ||
| 37 | |||
| 38 | fdata, objnum, gennum = self.fdata, int(objnum), int(gennum) | ||
| 39 | record = self.indirect_objects[fdata, objnum, gennum] | ||
| 40 | if record[1] is not self.unresolved: | ||
| 41 | return record[1] | ||
| 42 | |||
| 43 | # Read the object header and validate it | ||
| 44 | source = PdfTokens(fdata, record[0]) | ||
| 45 | objid = source.multiple(3) | ||
| 46 | assert int(objid[0]) == objnum, objid | ||
| 47 | assert int(objid[1]) == gennum, objid | ||
| 48 | assert objid[2] == 'obj', objid | ||
| 49 | |||
| 50 | # Read the object, and call special code if it starts | ||
| 51 | # an array or dictionary | ||
| 52 | obj = source.next() | ||
| 53 | obj = self.special.get(obj, ordinary)(source, setobj, obj) | ||
| 54 | self.readstream(obj, source) | ||
| 55 | obj.indirect = True | ||
| 56 | return obj | ||
| 57 | |||
| 58 | def readstream(obj, source): | ||
| 59 | ''' Read optional stream following a dictionary | ||
| 60 | object. | ||
| 61 | ''' | ||
| 62 | tok = source.next() | ||
| 63 | if tok == 'endobj': | ||
| 64 | return # No stream | ||
| 65 | |||
| 66 | assert isinstance(obj, PdfDict) | ||
| 67 | assert tok == 'stream', tok | ||
| 68 | fdata = source.fdata | ||
| 69 | floc = fdata.rindex(tok, 0, source.floc) + len(tok) | ||
| 70 | ch = fdata[floc] | ||
| 71 | if ch == '\r': | ||
| 72 | floc += 1 | ||
| 73 | ch = fdata[floc] | ||
| 74 | assert ch == '\n' | ||
| 75 | startstream = floc + 1 | ||
| 76 | endstream = startstream + int(obj.Length) | ||
| 77 | obj._stream = fdata[startstream:endstream] | ||
| 78 | source = PdfTokens(fdata, endstream) | ||
| 79 | endit = source.multiple(2) | ||
| 80 | if endit != 'endstream endobj'.split(): | ||
| 81 | # /Length attribute is broken, try to read stream | ||
| 82 | # anyway disregarding the specified value | ||
| 83 | # TODO: issue warning here once we have some kind of | ||
| 84 | # logging | ||
| 85 | endstream = fdata.index('endstream', startstream) | ||
| 86 | if fdata[endstream-2:endstream] == '\r\n': | ||
| 87 | endstream -= 2 | ||
| 88 | elif fdata[endstream-1] in ['\n', '\r']: | ||
| 89 | endstream -= 1 | ||
| 90 | source = PdfTokens(fdata, endstream) | ||
| 91 | endit = source.multiple(2) | ||
| 92 | assert endit == 'endstream endobj'.split() | ||
| 93 | obj.Length = str(endstream-startstream) | ||
| 94 | obj._stream = fdata[startstream:endstream] | ||
| 95 | readstream = staticmethod(readstream) | ||
| 96 | |||
| 97 | def readarray(self, source, setobj=lambda x:None, original=None): | ||
| 98 | special = self.special | ||
| 99 | result = PdfArray() | ||
| 100 | setobj(result) | ||
| 101 | |||
| 102 | for value in source: | ||
| 103 | if value == ']': | ||
| 104 | break | ||
| 105 | if value in special: | ||
| 106 | value = special[value](source) | ||
| 107 | elif value == 'R': | ||
| 108 | generation = result.pop() | ||
| 109 | value = self.readindirect(result.pop(), generation) | ||
| 110 | result.append(value) | ||
| 111 | return result | ||
| 112 | |||
| 113 | def readdict(self, source, setobj=lambda x:None, original=None): | ||
| 114 | special = self.special | ||
| 115 | result = PdfDict() | ||
| 116 | setobj(result) | ||
| 117 | |||
| 118 | tok = source.next() | ||
| 119 | while tok != '>>': | ||
| 120 | assert tok.startswith('/'), (tok, source.multiple(10)) | ||
| 121 | key = tok | ||
| 122 | value = source.next() | ||
| 123 | if value in special: | ||
| 124 | value = special[value](source) | ||
| 125 | tok = source.next() | ||
| 126 | else: | ||
| 127 | tok = source.next() | ||
| 128 | if value.isdigit() and tok.isdigit(): | ||
| 129 | assert source.next() == 'R' | ||
| 130 | value = self.readindirect(value, tok) | ||
| 131 | tok = source.next() | ||
| 132 | result[key] = value | ||
| 133 | |||
| 134 | return result | ||
| 135 | |||
| 136 | def readxref(fdata): | ||
| 137 | startloc = fdata.rindex('startxref') | ||
| 138 | xrefinfo = list(PdfTokens(fdata, startloc, False)) | ||
| 139 | assert len(xrefinfo) == 3, xrefinfo | ||
| 140 | assert xrefinfo[0] == 'startxref', xrefinfo[0] | ||
| 141 | assert xrefinfo[1].isdigit(), xrefinfo[1] | ||
| 142 | assert xrefinfo[2].rstrip() == '%%EOF', repr(xrefinfo[2]) | ||
| 143 | return startloc, PdfTokens(fdata, int(xrefinfo[1])) | ||
| 144 | readxref = staticmethod(readxref) | ||
| 145 | |||
| 146 | def parsexref(self, source): | ||
| 147 | tok = source.next() | ||
| 148 | assert tok == 'xref', tok | ||
| 149 | while 1: | ||
| 150 | tok = source.next() | ||
| 151 | if tok == 'trailer': | ||
| 152 | break | ||
| 153 | startobj = int(tok) | ||
| 154 | for objnum in range(startobj, startobj + int(source.next())): | ||
| 155 | offset = int(source.next()) | ||
| 156 | generation = int(source.next()) | ||
| 157 | if source.next() == 'n': | ||
| 158 | objid = self.fdata, objnum, generation | ||
| 159 | objval = [offset, self.unresolved] | ||
| 160 | self.indirect_objects.setdefault(objid, objval) | ||
| 161 | |||
| 162 | pagename = PdfName.Page | ||
| 163 | pagesname = PdfName.Pages | ||
| 164 | |||
| 165 | def readpages(self, node): | ||
| 166 | # PDFs can have arbitrarily nested Pages/Page | ||
| 167 | # dictionary structures. | ||
| 168 | if node.Type == self.pagename: | ||
| 169 | return [node] | ||
| 170 | assert node.Type == self.pagesname, node.Type | ||
| 171 | result = [] | ||
| 172 | for node in node.Kids: | ||
| 173 | result.extend(self.readpages(node)) | ||
| 174 | return result | ||
| 175 | |||
| 176 | def __init__(self, fname=None, fdata=None, decompress=True): | ||
| 177 | |||
| 178 | if fname is not None: | ||
| 179 | assert fdata is None | ||
| 180 | # Allow reading preexisting streams like pyPdf | ||
| 181 | if hasattr(fname, 'read'): | ||
| 182 | fdata = fname.read() | ||
| 183 | else: | ||
| 184 | f = open(fname, 'rb') | ||
| 185 | fdata = f.read() | ||
| 186 | f.close() | ||
| 187 | |||
| 188 | assert fdata is not None | ||
| 189 | fdata = fdata.rstrip('\00') | ||
| 190 | self.private.fdata = fdata | ||
| 191 | |||
| 192 | self.private.indirect_objects = {} | ||
| 193 | self.private.special = {'<<': self.readdict, '[': self.readarray} | ||
| 194 | |||
| 195 | startloc, source = self.readxref(fdata) | ||
| 196 | self.parsexref(source) | ||
| 197 | assert source.next() == '<<' | ||
| 198 | self.update(self.readdict(source)) | ||
| 199 | assert source.next() == 'startxref' and source.floc > startloc | ||
| 200 | self.private.pages = self.readpages(self.Root.Pages) | ||
| 201 | if decompress: | ||
| 202 | self.uncompress() | ||
| 203 | |||
| 204 | # For compatibility with pyPdf | ||
| 205 | self.private.numPages = len(self.pages) | ||
| 206 | |||
| 207 | |||
| 208 | # For compatibility with pyPdf | ||
| 209 | def getPage(self, pagenum): | ||
| 210 | return self.pages[pagenum] | ||
| 211 | |||
| 212 | def uncompress(self): | ||
| 213 | uncompress([x[1] for x in self.indirect_objects.itervalues()]) | ||
diff --git a/lib/pdfrw/pdftokens.py b/lib/pdfrw/pdftokens.py deleted file mode 100644 index 04bd559..0000000 --- a/lib/pdfrw/pdftokens.py +++ /dev/null | |||
| @@ -1,249 +0,0 @@ | |||
| 1 | # A part of pdfrw (pdfrw.googlecode.com) | ||
| 2 | # Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas | ||
| 3 | # MIT license -- See LICENSE.txt for details | ||
| 4 | |||
| 5 | ''' | ||
| 6 | A tokenizer for PDF streams. | ||
| 7 | |||
| 8 | In general, documentation used was "PDF reference", | ||
| 9 | sixth edition, for PDF version 1.7, dated November 2006. | ||
| 10 | |||
| 11 | ''' | ||
| 12 | |||
| 13 | from __future__ import generators | ||
| 14 | |||
| 15 | try: | ||
| 16 | set | ||
| 17 | except NameError: | ||
| 18 | from sets import Set as set | ||
| 19 | |||
| 20 | import re | ||
| 21 | from pdfobjects import PdfString, PdfObject | ||
| 22 | |||
| 23 | class _PrimitiveTokens(object): | ||
| 24 | |||
| 25 | # Table 3.1, page 50 of reference, defines whitespace | ||
| 26 | whitespaceset = set('\x00\t\n\f\r ') | ||
| 27 | |||
| 28 | |||
| 29 | # Text on page 50 defines delimiter characters | ||
| 30 | delimiterset = set('()<>{}[]/%') | ||
| 31 | |||
| 32 | # Coalesce contiguous whitespace into a single token | ||
| 33 | whitespace_pattern = '[%s]+' % ''.join(whitespaceset) | ||
| 34 | |||
| 35 | # In addition to the delimiters, we also use '\', which | ||
| 36 | # is special in some contexts in PDF. | ||
| 37 | delimiter_pattern = '\\\\|\\' + '|\\'.join(delimiterset) | ||
| 38 | |||
| 39 | # Dictionary delimiters are '<<' and '>>'. Look for | ||
| 40 | # these before the single variety. | ||
| 41 | dictdelim_pattern = r'\<\<|\>\>' | ||
| 42 | |||
| 43 | pattern = '(%s|%s|%s)' % (whitespace_pattern, | ||
| 44 | dictdelim_pattern, delimiter_pattern) | ||
| 45 | re_func = re.compile(pattern).finditer | ||
| 46 | del whitespace_pattern, dictdelim_pattern | ||
| 47 | del delimiter_pattern, pattern | ||
| 48 | |||
| 49 | def __init__(self, fdata): | ||
| 50 | |||
| 51 | class MyIterator(object): | ||
| 52 | def next(): | ||
| 53 | if not tokens: | ||
| 54 | startloc = self.startloc | ||
| 55 | for match in next_match[0]: | ||
| 56 | start = match.start() | ||
| 57 | end = match.end() | ||
| 58 | tappend(fdata[start:end]) | ||
| 59 | if start > startloc: | ||
| 60 | tappend(fdata[startloc:start]) | ||
| 61 | self.startloc = end | ||
| 62 | break | ||
| 63 | else: | ||
| 64 | s = fdata[startloc:] | ||
| 65 | self.startloc = len(fdata) | ||
| 66 | if s: | ||
| 67 | tappend(s) | ||
| 68 | if not tokens: | ||
| 69 | raise StopIteration | ||
| 70 | return tpop() | ||
| 71 | next = staticmethod(next) | ||
| 72 | |||
| 73 | self.fdata = fdata | ||
| 74 | self.tokens = tokens = [] | ||
| 75 | self.iterator = iterator = MyIterator() | ||
| 76 | self.next = iterator.next | ||
| 77 | self.next_match = next_match = [None] | ||
| 78 | tappend = tokens.append | ||
| 79 | tpop = tokens.pop | ||
| 80 | |||
| 81 | def setstart(self, startloc): | ||
| 82 | self.startloc = startloc | ||
| 83 | self.next_match[0] = self.re_func(self.fdata, startloc) | ||
| 84 | |||
| 85 | def __iter__(self): | ||
| 86 | return self.iterator | ||
| 87 | |||
| 88 | def coalesce(self, result): | ||
| 89 | ''' This function coalesces tokens together up until | ||
| 90 | the next delimiter or whitespace. | ||
| 91 | All of the coalesced tokens will either be non-matches, | ||
| 92 | or will be a matched backslash. We distinguish the | ||
| 93 | non-matches by the fact that next() will have left | ||
| 94 | a following match inside self.tokens for the actual match. | ||
| 95 | ''' | ||
| 96 | tokens = self.tokens | ||
| 97 | whitespace = self.whitespaceset | ||
| 98 | |||
| 99 | # Optimized path for usual case -- regular data (not a name string), | ||
| 100 | # with no escape character, and followed by whitespace. | ||
| 101 | |||
| 102 | if tokens: | ||
| 103 | token = tokens.pop() | ||
| 104 | if token != '\\': | ||
| 105 | if token[0] not in whitespace: | ||
| 106 | tokens.append(token) | ||
| 107 | return | ||
| 108 | result.append(token) | ||
| 109 | |||
| 110 | # Non-optimized path. Either start of a name string received, | ||
| 111 | # or we just had one escape. | ||
| 112 | |||
| 113 | for token in self: | ||
| 114 | if tokens: | ||
| 115 | result.append(token) | ||
| 116 | token = tokens.pop() | ||
| 117 | if token != '\\': | ||
| 118 | if token[0] not in whitespace: | ||
| 119 | tokens.append(token) | ||
| 120 | return | ||
| 121 | result.append(token) | ||
| 122 | |||
| 123 | |||
| 124 | def floc(self): | ||
| 125 | return self.startloc - sum([len(x) for x in self.tokens]) | ||
| 126 | |||
| 127 | class PdfTokens(object): | ||
| 128 | |||
| 129 | def __init__(self, fdata, startloc=0, strip_comments=True): | ||
| 130 | |||
| 131 | def comment(token): | ||
| 132 | tokens = [token] | ||
| 133 | for token in primitive: | ||
| 134 | tokens.append(token) | ||
| 135 | if token[0] in whitespaceset and ('\n' in token or '\r' in token): | ||
| 136 | break | ||
| 137 | return not strip_comments and ''.join(tokens) | ||
| 138 | |||
| 139 | def single(token): | ||
| 140 | return token | ||
| 141 | |||
| 142 | def regular_string(token): | ||
| 143 | def escaped(): | ||
| 144 | escaped = False | ||
| 145 | i = -2 | ||
| 146 | while tokens[i] == '\\': | ||
| 147 | escaped = not escaped | ||
| 148 | i -= 1 | ||
| 149 | return escaped | ||
| 150 | |||
| 151 | tokens = [token] | ||
| 152 | nestlevel = 1 | ||
| 153 | for token in primitive: | ||
| 154 | tokens.append(token) | ||
| 155 | if token in '()' and not escaped(): | ||
| 156 | nestlevel += token == '(' or -1 | ||
| 157 | if not nestlevel: | ||
| 158 | break | ||
| 159 | else: | ||
| 160 | assert 0, "Unexpected end of token stream" | ||
| 161 | return PdfString(''.join(tokens)) | ||
| 162 | |||
| 163 | def hex_string(token): | ||
| 164 | tokens = [token] | ||
| 165 | for token in primitive: | ||
| 166 | tokens.append(token) | ||
| 167 | if token == '>': | ||
| 168 | break | ||
| 169 | while tokens[-2] == '>>': | ||
| 170 | tokens.append(tokens.pop(-2)) | ||
| 171 | return PdfString(''.join(tokens)) | ||
| 172 | |||
| 173 | def normal_data(token): | ||
| 174 | |||
| 175 | # Obscure optimization -- we can get here with | ||
| 176 | # whitespace or regular character data. If we get | ||
| 177 | # here with whitespace, then there won't be an additional | ||
| 178 | # token queued up in the primitive object, otherwise there | ||
| 179 | # will... | ||
| 180 | if primitive_tokens: #if token[0] not in whitespaceset: | ||
| 181 | tokens = [token] | ||
| 182 | primitive.coalesce(tokens) | ||
| 183 | return PdfObject(''.join(tokens)) | ||
| 184 | |||
| 185 | def name_string(token): | ||
| 186 | tokens = [token] | ||
| 187 | primitive.coalesce(tokens) | ||
| 188 | token = ''.join(tokens) | ||
| 189 | if '#' in token: | ||
| 190 | substrs = token.split('#') | ||
| 191 | substrs.reverse() | ||
| 192 | tokens = [substrs.pop()] | ||
| 193 | while substrs: | ||
| 194 | s = substrs.pop() | ||
| 195 | tokens.append(chr(int(s[:2], 16))) | ||
| 196 | tokens.append(s[2:]) | ||
| 197 | token = ''.join(tokens) | ||
| 198 | return PdfObject(token) | ||
| 199 | |||
| 200 | def broken(token): | ||
| 201 | assert 0, token | ||
| 202 | |||
| 203 | dispatch = { | ||
| 204 | '(': regular_string, | ||
| 205 | ')': broken, | ||
| 206 | '<': hex_string, | ||
| 207 | '>': broken, | ||
| 208 | '[': single, | ||
| 209 | ']': single, | ||
| 210 | '{': single, | ||
| 211 | '}': single, | ||
| 212 | '/': name_string, | ||
| 213 | '%' : comment, | ||
| 214 | '<<': single, | ||
| 215 | '>>': single, | ||
| 216 | }.get | ||
| 217 | |||
| 218 | class MyIterator(object): | ||
| 219 | def next(): | ||
| 220 | while not tokens: | ||
| 221 | token = primitive_next() | ||
| 222 | token = dispatch(token, normal_data)(token) | ||
| 223 | if token: | ||
| 224 | return token | ||
| 225 | return tokens.pop() | ||
| 226 | next = staticmethod(next) | ||
| 227 | |||
| 228 | self.primitive = primitive = _PrimitiveTokens(fdata) | ||
| 229 | self.setstart = primitive.setstart | ||
| 230 | primitive.setstart(startloc) | ||
| 231 | self.fdata = fdata | ||
| 232 | self.strip_comments = strip_comments | ||
| 233 | self.tokens = tokens = [] | ||
| 234 | self.iterator = iterator = MyIterator() | ||
| 235 | self.next = iterator.next | ||
| 236 | primitive_next = primitive.next | ||
| 237 | primitive_tokens = primitive.tokens | ||
| 238 | whitespaceset = _PrimitiveTokens.whitespaceset | ||
| 239 | |||
| 240 | def floc(self): | ||
| 241 | return self.primitive.floc() - sum([len(x) for x in self.tokens]) | ||
| 242 | floc = property(floc) | ||
| 243 | |||
| 244 | def __iter__(self): | ||
| 245 | return self.iterator | ||
| 246 | |||
| 247 | def multiple(self, count): | ||
| 248 | next = self.next | ||
| 249 | return [next() for i in range(count)] | ||
diff --git a/lib/pdfrw/pdfwriter.py b/lib/pdfrw/pdfwriter.py deleted file mode 100644 index c193843..0000000 --- a/lib/pdfrw/pdfwriter.py +++ /dev/null | |||
| @@ -1,234 +0,0 @@ | |||
| 1 | #!/usr/bin/env python | ||
| 2 | |||
| 3 | # A part of pdfrw (pdfrw.googlecode.com) | ||
| 4 | # Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas | ||
| 5 | # MIT license -- See LICENSE.txt for details | ||
| 6 | |||
| 7 | ''' | ||
| 8 | The PdfWriter class writes an entire PDF file out to disk. | ||
| 9 | |||
| 10 | The writing process is not at all optimized or organized. | ||
| 11 | |||
| 12 | An instance of the PdfWriter class has two methods: | ||
| 13 | addpage(page) | ||
| 14 | and | ||
| 15 | write(fname) | ||
| 16 | |||
| 17 | addpage() assumes that the pages are part of a valid | ||
| 18 | tree/forest of PDF objects. | ||
| 19 | ''' | ||
| 20 | |||
| 21 | try: | ||
| 22 | set | ||
| 23 | except NameError: | ||
| 24 | from sets import Set as set | ||
| 25 | |||
| 26 | from pdfobjects import PdfName, PdfArray, PdfDict, IndirectPdfDict, PdfObject, PdfString | ||
| 27 | from pdfcompress import compress | ||
| 28 | |||
| 29 | debug = False | ||
| 30 | |||
| 31 | class FormatObjects(object): | ||
| 32 | ''' FormatObjects performs the actual formatting and disk write. | ||
| 33 | ''' | ||
| 34 | |||
| 35 | def add(self, obj, visited): | ||
| 36 | ''' Add an object to our list, if it's an indirect | ||
| 37 | object. Just format it if not. | ||
| 38 | ''' | ||
| 39 | # Can't hash dicts, so just hash the object ID | ||
| 40 | objid = id(obj) | ||
| 41 | |||
| 42 | # Automatically set stream objects to indirect | ||
| 43 | if isinstance(obj, PdfDict): | ||
| 44 | indirect = obj.indirect or (obj.stream is not None) | ||
| 45 | else: | ||
| 46 | indirect = getattr(obj, 'indirect', False) | ||
| 47 | |||
| 48 | if not indirect: | ||
| 49 | assert objid not in visited, \ | ||
| 50 | 'Circular reference encountered in non-indirect object %s' % repr(obj) | ||
| 51 | visited.add(objid) | ||
| 52 | result = self.format_obj(obj, visited) | ||
| 53 | visited.remove(objid) | ||
| 54 | return result | ||
| 55 | |||
| 56 | objnum = self.indirect_dict.get(objid) | ||
| 57 | |||
| 58 | # If we haven't seen the object yet, we need to | ||
| 59 | # add it to the indirect object list. | ||
| 60 | if objnum is None: | ||
| 61 | objlist = self.objlist | ||
| 62 | objnum = len(objlist) + 1 | ||
| 63 | if debug: | ||
| 64 | print ' Object', objnum, '\r', | ||
| 65 | objlist.append(None) | ||
| 66 | self.indirect_dict[objid] = objnum | ||
| 67 | objlist[objnum-1] = self.format_obj(obj) | ||
| 68 | return '%s 0 R' % objnum | ||
| 69 | |||
| 70 | def format_array(myarray, formatter): | ||
| 71 | # Format array data into semi-readable ASCII | ||
| 72 | if sum([len(x) for x in myarray]) <= 70: | ||
| 73 | return formatter % ' '.join(myarray) | ||
| 74 | bigarray = [] | ||
| 75 | count = 1000000 | ||
| 76 | for x in myarray: | ||
| 77 | lenx = len(x) | ||
| 78 | if lenx + count > 70: | ||
| 79 | subarray = [] | ||
| 80 | bigarray.append(subarray) | ||
| 81 | count = 0 | ||
| 82 | count += lenx + 1 | ||
| 83 | subarray.append(x) | ||
| 84 | return formatter % '\n '.join([' '.join(x) for x in bigarray]) | ||
| 85 | format_array = staticmethod(format_array) | ||
| 86 | |||
| 87 | def format_obj(self, obj, visited=None): | ||
| 88 | ''' format PDF object data into semi-readable ASCII. | ||
| 89 | May mutually recurse with add() -- add() will | ||
| 90 | return references for indirect objects, and add | ||
| 91 | the indirect object to the list. | ||
| 92 | ''' | ||
| 93 | if visited is None: | ||
| 94 | visited = set() | ||
| 95 | if isinstance(obj, PdfArray): | ||
| 96 | myarray = [self.add(x, visited) for x in obj] | ||
| 97 | return self.format_array(myarray, '[%s]') | ||
| 98 | elif isinstance(obj, PdfDict): | ||
| 99 | if self.compress and obj.stream: | ||
| 100 | compress([obj]) | ||
| 101 | myarray = [] | ||
| 102 | # Jython 2.2.1 has a bug which segfaults when | ||
| 103 | # sorting subclassed strings, so we un-subclass them. | ||
| 104 | dictkeys = [str(x) for x in obj.iterkeys()] | ||
| 105 | dictkeys.sort() | ||
| 106 | for key in dictkeys: | ||
| 107 | myarray.append(key) | ||
| 108 | myarray.append(self.add(obj[key], visited)) | ||
| 109 | result = self.format_array(myarray, '<<%s>>') | ||
| 110 | stream = obj.stream | ||
| 111 | if stream is not None: | ||
| 112 | result = '%s\nstream\n%s\nendstream' % (result, stream) | ||
| 113 | return result | ||
| 114 | elif isinstance(obj, basestring) and not hasattr(obj, 'indirect'): | ||
| 115 | return PdfString.encode(obj) | ||
| 116 | else: | ||
| 117 | return str(obj) | ||
| 118 | |||
    def dump(cls, f, trailer, version='1.3', compress=True):
        ''' Serialize an entire PDF document to the open file f.
            trailer is the document trailer dictionary (every other
            object is reachable from its Root entry); version selects
            the %PDF header version string; compress enables stream
            compression.  Uses a fresh instance as scratch state, so
            this is an alternate entry point rather than a pure
            classmethod.
        '''
        self = cls()
        self.compress = compress
        self.indirect_dict = {}  # object key -> assigned object number (filled by add())
        self.objlist = []        # formatted bodies of indirect objects, in output order

        # The first format of trailer gets all the information,
        # but we throw away the actual trailer formatting.
        self.format_obj(trailer)
        # Now we know the size, so we update the trailer dict
        # and get the formatted data.
        trailer.Size = PdfObject(len(self.objlist) + 1)
        trailer = self.format_obj(trailer)

        # Now we have all the pieces to write out to the file.
        # Keep careful track of the counts while we do it so
        # we can correctly build the cross-reference.

        # The second header line is a comment of high-bit bytes that
        # marks the file as containing binary data.
        header = '%%PDF-%s\n%%\xe2\xe3\xcf\xd3\n' % version
        f.write(header)
        offset = len(header)
        offsets = [(0, 65535, 'f')]  # entry 0 is the mandatory free-list head

        for i, x in enumerate(self.objlist):
            objstr = '%s 0 obj\n%s\nendobj\n' % (i + 1, x)
            offsets.append((offset, 0, 'n'))
            offset += len(objstr)
            f.write(objstr)

        # offset now equals the file position of the xref table,
        # which is exactly what startxref must point at.
        f.write('xref\n0 %s\n' % len(offsets))
        for x in offsets:
            # Each xref entry is a fixed 20 bytes: 10-digit offset,
            # 5-digit generation, type flag, CR LF.
            f.write('%010d %05d %s\r\n' % x)
        f.write('trailer\n\n%s\nstartxref\n%s\n%%%%EOF\n' % (trailer, offset))
    dump = classmethod(dump)
| 153 | |||
class PdfWriter(object):
    ''' Collects pages and serializes them out as a new PDF file.

        Typical use:
            PdfWriter().addpages(reader.pages).write('out.pdf')
    '''

    # Cached trailer dict; rebuilt lazily by _get_trailer and
    # invalidated whenever a page is added.
    _trailer = None

    def __init__(self, version='1.3', compress=True):
        self.pagearray = PdfArray()
        self.compress = compress
        self.version = version

    def addpage(self, page):
        ''' Append one source page (a /Type /Page dict) to the output
            document.  Returns self for chaining.
        '''
        self._trailer = None  # invalidate any cached trailer
        assert page.Type == PdfName.Page
        inheritable = page.inheritable # searches for resources
        self.pagearray.append(
            IndirectPdfDict(
                page,
                Resources = inheritable.Resources,
                MediaBox = inheritable.MediaBox,
                CropBox = inheritable.CropBox,
                Rotate = inheritable.Rotate,
            )
        )
        return self

    addPage = addpage # for compatibility with pyPdf

    def addpages(self, pagelist):
        ''' Append every page in pagelist.  Returns self for chaining. '''
        for page in pagelist:
            self.addpage(page)
        return self

    def _get_trailer(self):
        ''' Build (and cache) the trailer/catalog/page-tree skeleton
            around the accumulated pages.
        '''
        trailer = self._trailer
        if trailer is not None:
            return trailer

        # Create the basic object structure of the PDF file
        trailer = PdfDict(
            Root = IndirectPdfDict(
                Type = PdfName.Catalog,
                Pages = IndirectPdfDict(
                    Type = PdfName.Pages,
                    Count = PdfObject(len(self.pagearray)),
                    Kids = self.pagearray
                )
            )
        )
        # Make all the pages point back to the page dictionary
        pagedict = trailer.Root.Pages
        for page in pagedict.Kids:
            page.Parent = pagedict
        self._trailer = trailer
        return trailer

    def _set_trailer(self, trailer):
        self._trailer = trailer

    trailer = property(_get_trailer, _set_trailer)

    def write(self, fname, trailer=None):
        ''' Write the document to fname, which may be either a file
            path or a preexisting writable file object.  An explicit
            trailer overrides self.trailer.
        '''
        trailer = trailer or self.trailer

        # Dump the data. We either have a filename or a preexisting
        # file object.
        preexisting = hasattr(fname, 'write')
        # NOTE: the old "preexisting and fname or open(...)" idiom
        # broke for falsy file objects; use a real conditional.
        if preexisting:
            f = fname
        else:
            f = open(fname, 'wb')
        try:
            FormatObjects.dump(f, trailer, self.version, self.compress)
        finally:
            # Only close files we opened ourselves, and close them
            # even if dump() raises.
            if not preexisting:
                f.close()
| 223 | |||
if __name__ == '__main__':
    # Smoke test: copy source.pdf into result.pdf page by page.
    debug = True
    import pdfreader
    x = pdfreader.PdfReader('source.pdf')
    y = PdfWriter()
    for i, page in enumerate(x.pages):
        # Trailing comma + '\r' keeps the progress report on one line.
        print ' Adding page', i+1, '\r',
        y.addpage(page)

    y.write('result.pdf')
| 234 | |||
