diff options
Diffstat (limited to 'lib/pdfrw/pdftokens.py')
| -rw-r--r-- | lib/pdfrw/pdftokens.py | 249 |
1 files changed, 0 insertions, 249 deletions
diff --git a/lib/pdfrw/pdftokens.py b/lib/pdfrw/pdftokens.py deleted file mode 100644 index 04bd559..0000000 --- a/lib/pdfrw/pdftokens.py +++ /dev/null | |||
| @@ -1,249 +0,0 @@ | |||
| 1 | # A part of pdfrw (pdfrw.googlecode.com) | ||
| 2 | # Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas | ||
| 3 | # MIT license -- See LICENSE.txt for details | ||
| 4 | |||
| 5 | ''' | ||
| 6 | A tokenizer for PDF streams. | ||
| 7 | |||
| 8 | In general, documentation used was "PDF reference", | ||
| 9 | sixth edition, for PDF version 1.7, dated November 2006. | ||
| 10 | |||
| 11 | ''' | ||
| 12 | |||
| 13 | from __future__ import generators | ||
| 14 | |||
| 15 | try: | ||
| 16 | set | ||
| 17 | except NameError: | ||
| 18 | from sets import Set as set | ||
| 19 | |||
| 20 | import re | ||
| 21 | from pdfobjects import PdfString, PdfObject | ||
| 22 | |||
| 23 | class _PrimitiveTokens(object): | ||
| 24 | |||
| 25 | # Table 3.1, page 50 of reference, defines whitespace | ||
| 26 | whitespaceset = set('\x00\t\n\f\r ') | ||
| 27 | |||
| 28 | |||
| 29 | # Text on page 50 defines delimiter characters | ||
| 30 | delimiterset = set('()<>{}[]/%') | ||
| 31 | |||
| 32 | # Coalesce contiguous whitespace into a single token | ||
| 33 | whitespace_pattern = '[%s]+' % ''.join(whitespaceset) | ||
| 34 | |||
| 35 | # In addition to the delimiters, we also use '\', which | ||
| 36 | # is special in some contexts in PDF. | ||
| 37 | delimiter_pattern = '\\\\|\\' + '|\\'.join(delimiterset) | ||
| 38 | |||
| 39 | # Dictionary delimiters are '<<' and '>>'. Look for | ||
| 40 | # these before the single variety. | ||
| 41 | dictdelim_pattern = r'\<\<|\>\>' | ||
| 42 | |||
| 43 | pattern = '(%s|%s|%s)' % (whitespace_pattern, | ||
| 44 | dictdelim_pattern, delimiter_pattern) | ||
| 45 | re_func = re.compile(pattern).finditer | ||
| 46 | del whitespace_pattern, dictdelim_pattern | ||
| 47 | del delimiter_pattern, pattern | ||
| 48 | |||
| 49 | def __init__(self, fdata): | ||
| 50 | |||
| 51 | class MyIterator(object): | ||
| 52 | def next(): | ||
| 53 | if not tokens: | ||
| 54 | startloc = self.startloc | ||
| 55 | for match in next_match[0]: | ||
| 56 | start = match.start() | ||
| 57 | end = match.end() | ||
| 58 | tappend(fdata[start:end]) | ||
| 59 | if start > startloc: | ||
| 60 | tappend(fdata[startloc:start]) | ||
| 61 | self.startloc = end | ||
| 62 | break | ||
| 63 | else: | ||
| 64 | s = fdata[startloc:] | ||
| 65 | self.startloc = len(fdata) | ||
| 66 | if s: | ||
| 67 | tappend(s) | ||
| 68 | if not tokens: | ||
| 69 | raise StopIteration | ||
| 70 | return tpop() | ||
| 71 | next = staticmethod(next) | ||
| 72 | |||
| 73 | self.fdata = fdata | ||
| 74 | self.tokens = tokens = [] | ||
| 75 | self.iterator = iterator = MyIterator() | ||
| 76 | self.next = iterator.next | ||
| 77 | self.next_match = next_match = [None] | ||
| 78 | tappend = tokens.append | ||
| 79 | tpop = tokens.pop | ||
| 80 | |||
| 81 | def setstart(self, startloc): | ||
| 82 | self.startloc = startloc | ||
| 83 | self.next_match[0] = self.re_func(self.fdata, startloc) | ||
| 84 | |||
| 85 | def __iter__(self): | ||
| 86 | return self.iterator | ||
| 87 | |||
| 88 | def coalesce(self, result): | ||
| 89 | ''' This function coalesces tokens together up until | ||
| 90 | the next delimiter or whitespace. | ||
| 91 | All of the coalesced tokens will either be non-matches, | ||
| 92 | or will be a matched backslash. We distinguish the | ||
| 93 | non-matches by the fact that next() will have left | ||
| 94 | a following match inside self.tokens for the actual match. | ||
| 95 | ''' | ||
| 96 | tokens = self.tokens | ||
| 97 | whitespace = self.whitespaceset | ||
| 98 | |||
| 99 | # Optimized path for usual case -- regular data (not a name string), | ||
| 100 | # with no escape character, and followed by whitespace. | ||
| 101 | |||
| 102 | if tokens: | ||
| 103 | token = tokens.pop() | ||
| 104 | if token != '\\': | ||
| 105 | if token[0] not in whitespace: | ||
| 106 | tokens.append(token) | ||
| 107 | return | ||
| 108 | result.append(token) | ||
| 109 | |||
| 110 | # Non-optimized path. Either start of a name string received, | ||
| 111 | # or we just had one escape. | ||
| 112 | |||
| 113 | for token in self: | ||
| 114 | if tokens: | ||
| 115 | result.append(token) | ||
| 116 | token = tokens.pop() | ||
| 117 | if token != '\\': | ||
| 118 | if token[0] not in whitespace: | ||
| 119 | tokens.append(token) | ||
| 120 | return | ||
| 121 | result.append(token) | ||
| 122 | |||
| 123 | |||
| 124 | def floc(self): | ||
| 125 | return self.startloc - sum([len(x) for x in self.tokens]) | ||
| 126 | |||
class PdfTokens(object):
    ''' High-level PDF token stream.

        Wraps _PrimitiveTokens and assembles primitive tokens into
        complete PDF objects: literal strings, hex strings, names,
        comments, dictionary/array delimiters, and regular data.
    '''

    def __init__(self, fdata, startloc=0, strip_comments=True):
        ''' fdata -- the PDF file data as a string
            startloc -- file offset at which to start tokenizing
            strip_comments -- when true, comments produce no token
        '''

        def comment(token):
            # Consume primitive tokens through the end of the line.
            # Returns the whole comment, or False when stripping
            # (falsy results are skipped by the iterator loop below).
            tokens = [token]
            for token in primitive:
                tokens.append(token)
                if token[0] in whitespaceset and ('\n' in token or '\r' in token):
                    break
            return not strip_comments and ''.join(tokens)

        def single(token):
            # Token is already complete (array/dict/procedure delimiter).
            return token

        def regular_string(token):
            # '(' starts a literal string; scan to the matching ')',
            # honoring backslash escapes and nested parentheses.
            def escaped():
                # True when the current paren is preceded by an odd
                # number of backslash tokens.
                escaped = False
                i = -2
                while tokens[i] == '\\':
                    escaped = not escaped
                    i -= 1
                return escaped

            tokens = [token]
            nestlevel = 1
            for token in primitive:
                tokens.append(token)
                if token in '()' and not escaped():
                    nestlevel += token == '(' or -1
                    if not nestlevel:
                        break
            else:
                assert 0, "Unexpected end of token stream"
            return PdfString(''.join(tokens))

        def hex_string(token):
            # '<' starts a hex string; scan to the closing '>'.
            tokens = [token]
            for token in primitive:
                tokens.append(token)
                if token == '>':
                    break
            # A '>>' just before the final '>' contains our real
            # terminator; rotate it past the end.
            while tokens[-2] == '>>':
                tokens.append(tokens.pop(-2))
            return PdfString(''.join(tokens))

        def normal_data(token):
            # Obscure optimization -- we can get here with
            # whitespace or regular character data.  If we get
            # here with whitespace, then there won't be an additional
            # token queued up in the primitive object, otherwise there
            # will...
            if primitive_tokens:  # if token[0] not in whitespaceset:
                tokens = [token]
                primitive.coalesce(tokens)
                return PdfObject(''.join(tokens))
            # Whitespace falls through, returning None (skipped).

        def name_string(token):
            # '/' starts a name; coalesce it and decode '#xx' escapes.
            tokens = [token]
            primitive.coalesce(tokens)
            token = ''.join(tokens)
            if '#' in token:
                substrs = token.split('#')
                substrs.reverse()
                tokens = [substrs.pop()]
                while substrs:
                    s = substrs.pop()
                    tokens.append(chr(int(s[:2], 16)))
                    tokens.append(s[2:])
                token = ''.join(tokens)
            return PdfObject(token)

        def broken(token):
            # ')' or '>' with no matching opener: malformed stream.
            assert 0, token

        dispatch = {
            '(': regular_string,
            ')': broken,
            '<': hex_string,
            '>': broken,
            '[': single,
            ']': single,
            '{': single,
            '}': single,
            '/': name_string,
            '%': comment,
            '<<': single,
            '>>': single,
        }.get

        class MyIterator(object):
            def next():
                # Pull primitives until a dispatch function yields a
                # truthy (complete) token; stripped comments and bare
                # whitespace return falsy and are skipped.
                while not tokens:
                    token = primitive_next()
                    token = dispatch(token, normal_data)(token)
                    if token:
                        return token
                return tokens.pop()
            next = staticmethod(next)
            # BUG FIX: Python 3's iteration protocol calls __next__,
            # not next; without this alias, iterating a PdfTokens
            # instance raises TypeError under Python 3.  The .next
            # attribute is kept for backward compatibility.
            __next__ = next

        self.primitive = primitive = _PrimitiveTokens(fdata)
        self.setstart = primitive.setstart
        primitive.setstart(startloc)
        self.fdata = fdata
        self.strip_comments = strip_comments
        self.tokens = tokens = []
        self.iterator = iterator = MyIterator()
        self.next = iterator.next
        primitive_next = primitive.next
        primitive_tokens = primitive.tokens
        whitespaceset = _PrimitiveTokens.whitespaceset

    def floc(self):
        # Current file offset, adjusted for any pushed-back tokens.
        return self.primitive.floc() - sum([len(x) for x in self.tokens])
    floc = property(floc)

    def __iter__(self):
        return self.iterator

    def multiple(self, count):
        ''' Return the next count tokens as a list. '''
        next = self.next
        return [next() for i in range(count)]
