diff options
| author | jvoisin | 2011-08-16 18:11:24 +0200 |
|---|---|---|
| committer | jvoisin | 2011-08-16 18:11:24 +0200 |
| commit | 4bd3e47da02fde08acfada1795cc55170abdb00a (patch) | |
| tree | f8c7aa5fd5e1b07a28b350c5ded8125ef2467c51 /lib/pdfrw/pdfreader.py | |
| parent | baf8e080125614326ba9c96ca8f2404fd12b050e (diff) | |
setup.py now works !
Diffstat (limited to 'lib/pdfrw/pdfreader.py')
| -rw-r--r-- | lib/pdfrw/pdfreader.py | 213 |
1 files changed, 0 insertions, 213 deletions
diff --git a/lib/pdfrw/pdfreader.py b/lib/pdfrw/pdfreader.py deleted file mode 100644 index 6f57bea..0000000 --- a/lib/pdfrw/pdfreader.py +++ /dev/null | |||
| @@ -1,213 +0,0 @@ | |||
| 1 | # A part of pdfrw (pdfrw.googlecode.com) | ||
| 2 | # Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas | ||
| 3 | # MIT license -- See LICENSE.txt for details | ||
| 4 | |||
| 5 | ''' | ||
| 6 | The PdfReader class reads an entire PDF file into memory and | ||
| 7 | parses the top-level container objects. (It does not parse | ||
| 8 | into streams.) The object subclasses PdfDict, and the | ||
| 9 | document pages are stored in a list in the pages attribute | ||
| 10 | of the object. | ||
| 11 | ''' | ||
| 12 | |||
| 13 | from pdftokens import PdfTokens | ||
| 14 | from pdfobjects import PdfDict, PdfArray, PdfName | ||
| 15 | from pdfcompress import uncompress | ||
| 16 | |||
class PdfReader(PdfDict):
    ''' Read a PDF file into memory and parse its top-level
        container objects.

        The instance is itself a PdfDict that is updated with the
        contents of the trailer dictionary.  The list of page
        dictionaries is stored in the private ``pages`` attribute,
        and ``numPages`` / ``getPage`` are provided for
        compatibility with pyPdf.
    '''

    class unresolved:
        # Used as a placeholder until we have an object.
        pass

    def readindirect(self, objnum, gennum):
        ''' Read an indirect object.  If it has already
            been read, return it from the cache.

            objnum and gennum may be tokens (strings) or integers;
            both are converted with int() before the lookup.
            Raises KeyError if the object is not present in the
            cross-reference data collected by parsexref.
        '''

        def setobj(obj):
            # Store the new object in the dictionary
            # once we have its value
            record[1] = obj

        def ordinary(source, setobj, obj):
            # Deal with an ordinary (non-array, non-dict) object
            setobj(obj)
            return obj

        fdata, objnum, gennum = self.fdata, int(objnum), int(gennum)
        record = self.indirect_objects[fdata, objnum, gennum]
        if record[1] is not self.unresolved:
            return record[1]

        # Read the object header and validate it
        source = PdfTokens(fdata, record[0])
        objid = source.multiple(3)
        assert int(objid[0]) == objnum, objid
        assert int(objid[1]) == gennum, objid
        assert objid[2] == 'obj', objid

        # Read the object, and call special code if it starts
        # an array or dictionary.  The container readers call
        # setobj() before parsing their contents, so the cache
        # entry is already filled in if the contents refer back
        # to this same object.
        obj = source.next()
        obj = self.special.get(obj, ordinary)(source, setobj, obj)
        self.readstream(obj, source)
        obj.indirect = True
        return obj

    def readstream(obj, source):
        ''' Read optional stream following a dictionary
            object.

            obj is the object that was just parsed and source is
            the tokenizer positioned right after it.  If the next
            token is 'endobj' there is no stream and nothing is
            done.  Otherwise the raw stream data is stored in
            obj._stream.
        '''
        tok = source.next()
        if tok == 'endobj':
            return  # No stream

        assert isinstance(obj, PdfDict)
        assert tok == 'stream', tok
        fdata = source.fdata

        # Locate the end of the 'stream' keyword that was just
        # tokenized, then step over the end-of-line marker that
        # follows it (either '\n' or '\r\n').
        floc = fdata.rindex(tok, 0, source.floc) + len(tok)
        ch = fdata[floc]
        if ch == '\r':
            floc += 1
            ch = fdata[floc]
        assert ch == '\n'
        startstream = floc + 1
        endstream = startstream + int(obj.Length)
        obj._stream = fdata[startstream:endstream]
        source = PdfTokens(fdata, endstream)
        endit = source.multiple(2)
        if endit != 'endstream endobj'.split():
            # /Length attribute is broken, try to read stream
            # anyway disregarding the specified value
            # TODO: issue warning here once we have some kind of
            # logging
            endstream = fdata.index('endstream', startstream)
            # Do not count the end-of-line marker that precedes
            # the 'endstream' keyword as part of the data.
            if fdata[endstream-2:endstream] == '\r\n':
                endstream -= 2
            elif fdata[endstream-1] in ['\n', '\r']:
                endstream -= 1
            source = PdfTokens(fdata, endstream)
            endit = source.multiple(2)
            assert endit == 'endstream endobj'.split()
            # Record the length that was actually found.
            obj.Length = str(endstream-startstream)
            obj._stream = fdata[startstream:endstream]
    readstream = staticmethod(readstream)

    def readarray(self, source, setobj=lambda x:None, original=None):
        ''' Read an array from source, up to and including the
            closing ']', and return it as a PdfArray.

            setobj is called with the (still empty) result before
            any element is parsed.  The original parameter is not
            used here; it lets this method be called with the same
            arguments as the other entries in self.special.
        '''
        special = self.special
        result = PdfArray()
        setobj(result)

        for value in source:
            if value == ']':
                break
            if value in special:
                value = special[value](source)
            elif value == 'R':
                # An indirect reference: the object and generation
                # numbers are the two values appended just before.
                generation = result.pop()
                value = self.readindirect(result.pop(), generation)
            result.append(value)
        return result

    def readdict(self, source, setobj=lambda x:None, original=None):
        ''' Read a dictionary from source, up to and including
            the closing '>>', and return it as a PdfDict.

            setobj is called with the (still empty) result before
            any entry is parsed.  The original parameter is not
            used here; it lets this method be called with the same
            arguments as the other entries in self.special.
        '''
        special = self.special
        result = PdfDict()
        setobj(result)

        tok = source.next()
        while tok != '>>':
            assert tok.startswith('/'), (tok, source.multiple(10))
            key = tok
            value = source.next()
            if value in special:
                value = special[value](source)
                tok = source.next()
            else:
                tok = source.next()
                if value.isdigit() and tok.isdigit():
                    # Two integers in a row in value position
                    # start an indirect reference.  Consume the
                    # 'R' outside of the assert so that the token
                    # is still read when asserts are disabled
                    # (python -O).
                    marker = source.next()
                    assert marker == 'R', marker
                    value = self.readindirect(value, tok)
                    tok = source.next()
            result[key] = value

        return result

    def readxref(fdata):
        ''' Find the last 'startxref' in fdata and validate the
            tokens that follow it.

            Returns a tuple of the location of the 'startxref'
            keyword and a PdfTokens object positioned at the
            offset given after that keyword.
        '''
        startloc = fdata.rindex('startxref')
        xrefinfo = list(PdfTokens(fdata, startloc, False))
        assert len(xrefinfo) == 3, xrefinfo
        assert xrefinfo[0] == 'startxref', xrefinfo[0]
        assert xrefinfo[1].isdigit(), xrefinfo[1]
        assert xrefinfo[2].rstrip() == '%%EOF', repr(xrefinfo[2])
        return startloc, PdfTokens(fdata, int(xrefinfo[1]))
    readxref = staticmethod(readxref)

    def parsexref(self, source):
        ''' Parse the cross-reference table from source, stopping
            after the 'trailer' keyword has been consumed.

            Every entry marked 'n' is recorded in
            self.indirect_objects as [offset, unresolved], keyed
            by (fdata, object number, generation).  Existing keys
            are left untouched (setdefault).
        '''
        tok = source.next()
        assert tok == 'xref', tok
        while 1:
            tok = source.next()
            if tok == 'trailer':
                break
            # Each subsection starts with the first object number
            # followed by the number of entries.
            startobj = int(tok)
            for objnum in range(startobj, startobj + int(source.next())):
                offset = int(source.next())
                generation = int(source.next())
                if source.next() == 'n':
                    objid = self.fdata, objnum, generation
                    objval = [offset, self.unresolved]
                    self.indirect_objects.setdefault(objid, objval)

    pagename = PdfName.Page
    pagesname = PdfName.Pages

    def readpages(self, node):
        ''' Return a flat list of the page dictionaries found at
            or below node.
        '''
        # PDFs can have arbitrarily nested Pages/Page
        # dictionary structures.
        if node.Type == self.pagename:
            return [node]
        assert node.Type == self.pagesname, node.Type
        result = []
        for kid in node.Kids:
            result.extend(self.readpages(kid))
        return result

    def __init__(self, fname=None, fdata=None, decompress=True):
        ''' Load and parse a PDF.

            fname      -- a file name, or an object with a read()
                          method (like pyPdf accepts).  Must not
                          be combined with fdata.
            fdata      -- the complete contents of the PDF file,
                          when they are already in memory.
            decompress -- if true, call self.uncompress() once the
                          pages have been read.
        '''

        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                f = open(fname, 'rb')
                try:
                    fdata = f.read()
                finally:
                    # Close the file even if the read fails.
                    f.close()

        assert fdata is not None
        fdata = fdata.rstrip('\00')
        self.private.fdata = fdata

        self.private.indirect_objects = {}
        self.private.special = {'<<': self.readdict, '[': self.readarray}

        startloc, source = self.readxref(fdata)
        self.parsexref(source)

        # The tokens are consumed outside of the asserts so that
        # the tokenizer is still advanced when asserts are
        # disabled (python -O).
        tok = source.next()
        assert tok == '<<', tok
        self.update(self.readdict(source))
        tok = source.next()
        assert tok == 'startxref' and source.floc > startloc
        self.private.pages = self.readpages(self.Root.Pages)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        self.private.numPages = len(self.pages)


    # For compatibility with pyPdf
    def getPage(self, pagenum):
        ''' Return the page at index pagenum of self.pages. '''
        return self.pages[pagenum]

    def uncompress(self):
        ''' Pass the current value of every entry in
            self.indirect_objects to pdfcompress.uncompress.
        '''
        uncompress([x[1] for x in self.indirect_objects.itervalues()])
