summaryrefslogtreecommitdiff
path: root/lib/pdfrw/pdfreader.py
diff options
context:
space:
mode:
authorjvoisin2011-08-16 18:11:24 +0200
committerjvoisin2011-08-16 18:11:24 +0200
commit4bd3e47da02fde08acfada1795cc55170abdb00a (patch)
treef8c7aa5fd5e1b07a28b350c5ded8125ef2467c51 /lib/pdfrw/pdfreader.py
parentbaf8e080125614326ba9c96ca8f2404fd12b050e (diff)
setup.py now works !
Diffstat (limited to 'lib/pdfrw/pdfreader.py')
-rw-r--r--lib/pdfrw/pdfreader.py213
1 files changed, 0 insertions, 213 deletions
diff --git a/lib/pdfrw/pdfreader.py b/lib/pdfrw/pdfreader.py
deleted file mode 100644
index 6f57bea..0000000
--- a/lib/pdfrw/pdfreader.py
+++ /dev/null
@@ -1,213 +0,0 @@
1# A part of pdfrw (pdfrw.googlecode.com)
2# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
3# MIT license -- See LICENSE.txt for details
4
'''
The PdfReader class reads an entire PDF file into memory and
parses the top-level container objects.  (It does not parse
into streams.)  The class subclasses PdfDict, and the
document pages are stored in a list in the pages attribute
of the object.
'''
12
13from pdftokens import PdfTokens
14from pdfobjects import PdfDict, PdfArray, PdfName
15from pdfcompress import uncompress
16
class PdfReader(PdfDict):
    ''' Read a complete PDF file into memory and parse its
        top-level container objects (stream contents are stored
        but not parsed).

        After construction the reader holds the trailer dictionary
        entries (it is updated from the parsed trailer) and the
        following attributes, all assigned through self.private
        (a PdfDict facility -- presumably keeps them out of the
        PDF dictionary keys; verify against PdfDict):

            fdata             the raw file contents
            indirect_objects  {(fdata, objnum, gennum): [offset, object]}
            special           token -> parser for '<<' and '['
            pages             flat list of page dictionaries
            numPages          len(pages), for pyPdf compatibility
    '''

    class unresolved:
        # Used as a placeholder until we have an object.
        pass

    def readindirect(self, objnum, gennum):
        ''' Read an indirect object.  If it has already
            been read, return it from the cache.

            objnum and gennum may be ints or digit tokens; both
            are converted with int().  The object must have been
            registered by parsexref, otherwise the lookup in
            indirect_objects raises KeyError.

            Returns the parsed object with its indirect attribute
            set to True.
        '''

        def setobj(obj):
            # Store the new object in the cache record as soon as
            # it exists.  readdict/readarray call this before they
            # parse their contents, so a reference back to this
            # same object made while it is still being read is
            # answered from the cache instead of recursing forever.
            record[1] = obj

        def ordinary(source, setobj, obj):
            # Deal with an ordinary (non-array, non-dict) object
            setobj(obj)
            return obj

        fdata, objnum, gennum = self.fdata, int(objnum), int(gennum)
        record = self.indirect_objects[fdata, objnum, gennum]
        if record[1] is not self.unresolved:
            return record[1]

        # Read the "<objnum> <gennum> obj" header found at the
        # recorded file offset and validate it against the request.
        source = PdfTokens(fdata, record[0])
        objid = source.multiple(3)
        assert int(objid[0]) == objnum, objid
        assert int(objid[1]) == gennum, objid
        assert objid[2] == 'obj', objid

        # Read the object, and call special code if it starts
        # an array or dictionary
        obj = source.next()
        obj = self.special.get(obj, ordinary)(source, setobj, obj)
        self.readstream(obj, source)
        obj.indirect = True
        return obj

    def readstream(obj, source):
        ''' Read optional stream following a dictionary
            object.

            obj is the object that was just parsed and source is
            the token source positioned right after it.  If the
            next token is 'endobj' there is no stream and nothing
            happens.  Otherwise obj must be a PdfDict; the raw
            stream data is stored in obj._stream.  When the
            /Length entry does not lead to 'endstream endobj',
            the stream end is found by searching for 'endstream'
            and obj.Length is rewritten to the real size.
        '''
        tok = source.next()
        if tok == 'endobj':
            return  # No stream

        assert isinstance(obj, PdfDict)
        assert tok == 'stream', tok
        fdata = source.fdata

        # Find the raw position just past the 'stream' keyword by
        # searching backwards from the tokenizer's current location.
        floc = fdata.rindex(tok, 0, source.floc) + len(tok)

        # The keyword must be followed by LF or CR LF; the data
        # starts on the character after that line end.
        ch = fdata[floc]
        if ch == '\r':
            floc += 1
            ch = fdata[floc]
        assert ch == '\n'
        startstream = floc + 1
        endstream = startstream + int(obj.Length)
        obj._stream = fdata[startstream:endstream]

        expected = ['endstream', 'endobj']
        source = PdfTokens(fdata, endstream)
        endit = source.multiple(2)
        if endit != expected:
            # /Length attribute is broken, try to read stream
            # anyway disregarding the specified value
            # TODO: issue warning here once we have some kind of
            #       logging
            endstream = fdata.index('endstream', startstream)
            # Do not count the line end that precedes 'endstream'
            # as part of the stream data.
            if fdata[endstream-2:endstream] == '\r\n':
                endstream -= 2
            elif fdata[endstream-1] in ['\n', '\r']:
                endstream -= 1
            source = PdfTokens(fdata, endstream)
            endit = source.multiple(2)
            assert endit == expected
            obj.Length = str(endstream-startstream)
            obj._stream = fdata[startstream:endstream]
    readstream = staticmethod(readstream)

    def readarray(self, source, setobj=lambda x:None, original=None):
        ''' Parse an array whose opening '[' has already been
            consumed from source, up to the matching ']'.

            setobj is called with the (still empty) PdfArray before
            its contents are read.  original receives the opening
            token when called through the special table and is
            not used.  Returns the PdfArray.
        '''
        special = self.special
        result = PdfArray()
        setobj(result)

        for value in source:
            if value == ']':
                break
            if value in special:
                # Nested array or dictionary
                value = special[value](source)
            elif value == 'R':
                # "objnum gennum R": the two numbers were already
                # appended as plain values, so take them back off
                # and replace them with the referenced object.
                generation = result.pop()
                value = self.readindirect(result.pop(), generation)
            result.append(value)
        return result

    def readdict(self, source, setobj=lambda x:None, original=None):
        ''' Parse a dictionary whose opening '<<' has already been
            consumed from source, up to the matching '>>'.

            setobj is called with the (still empty) PdfDict before
            its contents are read.  original receives the opening
            token when called through the special table and is
            not used.  Returns the PdfDict.
        '''
        special = self.special
        result = PdfDict()
        setobj(result)

        tok = source.next()
        while tok != '>>':
            assert tok.startswith('/'), (tok, source.multiple(10))
            key = tok
            value = source.next()
            if value in special:
                # Nested array or dictionary
                value = special[value](source)
                tok = source.next()
            else:
                # Look one token ahead: two numbers in a row can
                # only be the start of an "objnum gennum R"
                # indirect reference.
                tok = source.next()
                if value.isdigit() and tok.isdigit():
                    # Consume the token outside of the assert so
                    # that parsing still advances when asserts are
                    # stripped (python -O).
                    reftok = source.next()
                    assert reftok == 'R', reftok
                    value = self.readindirect(value, tok)
                    tok = source.next()
            result[key] = value

        return result

    def readxref(fdata):
        ''' Locate the last 'startxref' keyword in fdata.

            Returns (startloc, source): startloc is the offset of
            that keyword and source is a PdfTokens positioned at
            the cross-reference offset that follows it.
        '''
        startloc = fdata.rindex('startxref')
        # NOTE(review): the third argument is defined by PdfTokens;
        # presumably it keeps comment tokens so that '%%EOF' is
        # returned -- verify against PdfTokens.
        xrefinfo = list(PdfTokens(fdata, startloc, False))
        assert len(xrefinfo) == 3, xrefinfo
        assert xrefinfo[0] == 'startxref', xrefinfo[0]
        assert xrefinfo[1].isdigit(), xrefinfo[1]
        assert xrefinfo[2].rstrip() == '%%EOF', repr(xrefinfo[2])
        return startloc, PdfTokens(fdata, int(xrefinfo[1]))
    readxref = staticmethod(readxref)

    def parsexref(self, source):
        ''' Parse the cross-reference table read from source, up to
            and including the 'trailer' keyword.

            Every in-use ('n') entry is registered in
            indirect_objects as [offset, unresolved].  An entry
            that is already registered is left unchanged.
        '''
        tok = source.next()
        assert tok == 'xref', tok
        while True:
            tok = source.next()
            if tok == 'trailer':
                break
            # Subsection header: first object number, entry count
            startobj = int(tok)
            for objnum in range(startobj, startobj + int(source.next())):
                offset = int(source.next())
                generation = int(source.next())
                if source.next() == 'n':
                    objid = self.fdata, objnum, generation
                    objval = [offset, self.unresolved]
                    self.indirect_objects.setdefault(objid, objval)

    pagename = PdfName.Page
    pagesname = PdfName.Pages

    def readpages(self, node):
        ''' Return the flat list of page dictionaries found at or
            below node, in the order of the Kids arrays.
        '''
        # PDFs can have arbitrarily nested Pages/Page
        # dictionary structures.
        if node.Type == self.pagename:
            return [node]
        assert node.Type == self.pagesname, node.Type
        result = []
        for kid in node.Kids:
            result.extend(self.readpages(kid))
        return result

    def __init__(self, fname=None, fdata=None, decompress=True):
        ''' Read and parse a PDF file.

            fname       file name, or an object with a read()
                        method; must not be combined with fdata
            fdata       the file contents as a string
            decompress  when true, call self.uncompress() once
                        the pages have been read
        '''

        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                f = open(fname, 'rb')
                try:
                    fdata = f.read()
                finally:
                    # Close the file even if the read fails
                    f.close()

        assert fdata is not None
        # Drop trailing NUL characters from the file contents
        fdata = fdata.rstrip('\00')
        self.private.fdata = fdata

        self.private.indirect_objects = {}
        self.private.special = {'<<': self.readdict, '[': self.readarray}

        startloc, source = self.readxref(fdata)
        self.parsexref(source)

        # The trailer dictionary follows the xref table.  Tokens
        # are consumed outside of the asserts so that parsing
        # still advances when asserts are stripped (python -O).
        tok = source.next()
        assert tok == '<<', tok
        self.update(self.readdict(source))
        tok = source.next()
        assert tok == 'startxref' and source.floc > startloc
        self.private.pages = self.readpages(self.Root.Pages)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        self.private.numPages = len(self.pages)

    # For compatibility with pyPdf
    def getPage(self, pagenum):
        ''' Return the page dictionary at index pagenum. '''
        return self.pages[pagenum]

    def uncompress(self):
        ''' Pass the object slot of every registered indirect
            object (possibly still the unresolved placeholder) to
            pdfcompress.uncompress.
        '''
        uncompress([x[1] for x in self.indirect_objects.itervalues()])