summaryrefslogtreecommitdiff
path: root/lib/pdfrw/pdftokens.py
diff options
context:
space:
mode:
Diffstat (limited to 'lib/pdfrw/pdftokens.py')
-rw-r--r--  lib/pdfrw/pdftokens.py  249
1 files changed, 0 insertions, 249 deletions
diff --git a/lib/pdfrw/pdftokens.py b/lib/pdfrw/pdftokens.py
deleted file mode 100644
index 04bd559..0000000
--- a/lib/pdfrw/pdftokens.py
+++ /dev/null
@@ -1,249 +0,0 @@
1# A part of pdfrw (pdfrw.googlecode.com)
2# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
3# MIT license -- See LICENSE.txt for details
4
5'''
6A tokenizer for PDF streams.
7
8In general, documentation used was "PDF reference",
9sixth edition, for PDF version 1.7, dated November 2006.
10
11'''
12
13from __future__ import generators
14
15try:
16 set
17except NameError:
18 from sets import Set as set
19
20import re
21from pdfobjects import PdfString, PdfObject
22
class _PrimitiveTokens(object):
    '''Low-level scanner that splits PDF file data into "primitive" tokens.

    A primitive token is either a regex match (a run of whitespace, a
    single delimiter character, a lone backslash, or a '<<' / '>>'
    dictionary delimiter) or the run of ordinary characters between two
    matches.  Tokens travel through the shared self.tokens queue so that
    coalesce() can see the one-token lookahead that next() leaves behind.

    NOTE(review): this is Python 2 style code (iterator objects expose
    next() rather than __next__(), and the file falls back to the old
    sets module); it will not iterate correctly under Python 3.
    '''

    # Table 3.1, page 50 of reference, defines whitespace
    whitespaceset = set('\x00\t\n\f\r ')

    # Text on page 50 defines delimiter characters
    delimiterset = set('()<>{}[]/%')

    # Coalesce contiguous whitespace into a single token
    whitespace_pattern = '[%s]+' % ''.join(whitespaceset)

    # In addition to the delimiters, we also use '\', which
    # is special in some contexts in PDF.
    # Each delimiter character is individually backslash-escaped so the
    # alternation is a valid regex.
    delimiter_pattern = '\\\\|\\' + '|\\'.join(delimiterset)

    # Dictionary delimiters are '<<' and '>>'.  Look for
    # these before the single variety.
    dictdelim_pattern = r'\<\<|\>\>'

    pattern = '(%s|%s|%s)' % (whitespace_pattern,
                              dictdelim_pattern, delimiter_pattern)
    re_func = re.compile(pattern).finditer
    # The pattern strings were only needed to build re_func; drop them
    # from the class namespace.
    del whitespace_pattern, dictdelim_pattern
    del delimiter_pattern, pattern

    def __init__(self, fdata):
        # fdata is the file contents as one string; scanning begins only
        # after setstart() has primed self.startloc and next_match[0].

        class MyIterator(object):
            # Iterator over primitive tokens.  All state lives in the
            # enclosing __init__ closure (tokens, next_match, fdata,
            # tappend, tpop) and on self (startloc).
            def next():
                if not tokens:
                    startloc = self.startloc
                    # 'for ... break / else' pulls at most ONE match
                    # from the finditer iterator.
                    for match in next_match[0]:
                        start = match.start()
                        end = match.end()
                        # Queue the matched token first; if ordinary
                        # text precedes the match, queue that second so
                        # it is popped (returned) first.  The match then
                        # stays behind in the queue -- coalesce() relies
                        # on this one-token lookahead to recognize
                        # non-match text.
                        tappend(fdata[start:end])
                        if start > startloc:
                            tappend(fdata[startloc:start])
                        self.startloc = end
                        break
                    else:
                        # No more matches: whatever remains in the file
                        # is one final run of ordinary text.
                        s = fdata[startloc:]
                        self.startloc = len(fdata)
                        if s:
                            tappend(s)
                        if not tokens:
                            raise StopIteration
                return tpop()
            next = staticmethod(next)

        self.fdata = fdata
        # The queue is shared state: the iterator pushes/pops through
        # the bound methods captured below; clients (PdfTokens) inspect
        # self.tokens directly.
        self.tokens = tokens = []
        self.iterator = iterator = MyIterator()
        self.next = iterator.next
        # One-element list so setstart() can rebind the finditer object
        # that the iterator closure reads.
        self.next_match = next_match = [None]
        tappend = tokens.append
        tpop = tokens.pop

    def setstart(self, startloc):
        # (Re)position the scanner at absolute offset startloc in fdata.
        self.startloc = startloc
        self.next_match[0] = self.re_func(self.fdata, startloc)

    def __iter__(self):
        return self.iterator

    def coalesce(self, result):
        ''' This function coalesces tokens together up until
            the next delimiter or whitespace.
            All of the coalesced tokens will either be non-matches,
            or will be a matched backslash.  We distinguish the
            non-matches by the fact that next() will have left
            a following match inside self.tokens for the actual match.
        '''
        tokens = self.tokens
        whitespace = self.whitespaceset

        # Optimized path for usual case -- regular data (not a name string),
        # with no escape character, and followed by whitespace.

        if tokens:
            token = tokens.pop()
            if token != '\\':
                if token[0] not in whitespace:
                    # A real delimiter terminated the run: push it back
                    # for the next caller (whitespace is just dropped).
                    tokens.append(token)
                return
            result.append(token)

        # Non-optimized path.  Either start of a name string received,
        # or we just had one escape.

        for token in self:
            if tokens:
                # The yielded token was ordinary text; the match that
                # terminated it is still queued, so take that as the
                # token to examine.
                result.append(token)
                token = tokens.pop()
            if token != '\\':
                if token[0] not in whitespace:
                    tokens.append(token)
                return
            # Backslash: keep it and continue coalescing past it.
            result.append(token)

    def floc(self):
        # Current file offset, corrected for tokens that were scanned
        # but not yet consumed (still sitting in the queue).
        return self.startloc - sum([len(x) for x in self.tokens])
126
class PdfTokens(object):
    '''High-level PDF tokenizer.

    Wraps _PrimitiveTokens and assembles composite PDF tokens: literal
    strings "(...)", hex strings "<...>", name strings "/...", comments
    "%...", and plain data (numbers, keywords).  String results are
    wrapped in PdfString, other data in PdfObject.
    '''

    def __init__(self, fdata, startloc=0, strip_comments=True):
        # The handlers below are closures over locals that are bound at
        # the BOTTOM of __init__ (primitive, primitive_next,
        # primitive_tokens, whitespaceset, tokens) -- they are not
        # called until iteration starts, so the late binding is safe.

        def comment(token):
            # Consume primitive tokens through the end of the line.
            # Returns the joined comment text only when comments are
            # kept; a falsy result makes the iterator skip it.
            tokens = [token]
            for token in primitive:
                tokens.append(token)
                if token[0] in whitespaceset and ('\n' in token or '\r' in token):
                    break
            return not strip_comments and ''.join(tokens)

        def single(token):
            # Token is already complete as-is.
            return token

        def regular_string(token):
            # Literal string '(...)': parentheses nest, and a backslash
            # escapes the following character.
            def escaped():
                # True when the run of backslashes immediately before
                # the current paren has odd length (paren is escaped).
                escaped = False
                i = -2
                while tokens[i] == '\\':
                    escaped = not escaped
                    i -= 1
                return escaped

            tokens = [token]
            nestlevel = 1
            for token in primitive:
                tokens.append(token)
                if token in '()' and not escaped():
                    # 'token == "(" or -1' is True (+1) for '(' and
                    # -1 for ')'.
                    nestlevel += token == '(' or -1
                    if not nestlevel:
                        break
            else:
                assert 0, "Unexpected end of token stream"
            return PdfString(''.join(tokens))

        def hex_string(token):
            tokens = [token]
            for token in primitive:
                tokens.append(token)
                if token == '>':
                    break
            # A closing '>' may have been consumed as part of a '>>'
            # dictionary-delimiter match; rotate such matches to the end
            # until a lone '>' properly terminates the string.
            while tokens[-2] == '>>':
                tokens.append(tokens.pop(-2))
            return PdfString(''.join(tokens))

        def normal_data(token):

            # Obscure optimization -- we can get here with
            # whitespace or regular character data.  If we get
            # here with whitespace, then there won't be an additional
            # token queued up in the primitive object, otherwise there
            # will...
            if primitive_tokens: #if token[0] not in whitespaceset:
                tokens = [token]
                primitive.coalesce(tokens)
                return PdfObject(''.join(tokens))
            # Whitespace falls through, returning None so the iterator
            # skips it.

        def name_string(token):
            # Name object '/...': '#xx' sequences are two-digit hex
            # character escapes (PDF 1.2+ syntax).
            tokens = [token]
            primitive.coalesce(tokens)
            token = ''.join(tokens)
            if '#' in token:
                substrs = token.split('#')
                substrs.reverse()
                tokens = [substrs.pop()]
                while substrs:
                    s = substrs.pop()
                    # First two characters after each '#' encode one
                    # character in hex.
                    tokens.append(chr(int(s[:2], 16)))
                    tokens.append(s[2:])
                token = ''.join(tokens)
            return PdfObject(token)

        def broken(token):
            # ')' or '>' can never legally START a token.
            assert 0, token

        # Map delimiter tokens to their handlers; anything else
        # (whitespace or regular data) falls through to normal_data.
        dispatch = {
            '(': regular_string,
            ')': broken,
            '<': hex_string,
            '>': broken,
            '[': single,
            ']': single,
            '{': single,
            '}': single,
            '/': name_string,
            '%' : comment,
            '<<': single,
            '>>': single,
        }.get

        class MyIterator(object):
            # NOTE(review): Python 2 iterator protocol (next, not
            # __next__).
            def next():
                # self.tokens acts as a pushback queue; normally empty.
                while not tokens:
                    token = primitive_next()
                    token = dispatch(token, normal_data)(token)
                    if token:
                        return token
                    # Falsy result (skipped whitespace or stripped
                    # comment): loop for the next primitive token.
                return tokens.pop()
            next = staticmethod(next)

        self.primitive = primitive = _PrimitiveTokens(fdata)
        self.setstart = primitive.setstart
        primitive.setstart(startloc)
        self.fdata = fdata
        self.strip_comments = strip_comments
        self.tokens = tokens = []
        self.iterator = iterator = MyIterator()
        self.next = iterator.next
        # Bind the closure variables that the handlers above read.
        primitive_next = primitive.next
        primitive_tokens = primitive.tokens
        whitespaceset = _PrimitiveTokens.whitespaceset

    def floc(self):
        # Current file offset, corrected for unconsumed queued tokens
        # at this level (the primitive level corrects for its own).
        return self.primitive.floc() - sum([len(x) for x in self.tokens])
    floc = property(floc)

    def __iter__(self):
        return self.iterator

    def multiple(self, count):
        # Return the next count tokens as a list.
        next = self.next
        return [next() for i in range(count)]