From 4bd3e47da02fde08acfada1795cc55170abdb00a Mon Sep 17 00:00:00 2001 From: jvoisin Date: Tue, 16 Aug 2011 18:11:24 +0200 Subject: setup.py now works ! --- lib/pdfrw/pdftokens.py | 249 ------------------------------------------------- 1 file changed, 249 deletions(-) delete mode 100644 lib/pdfrw/pdftokens.py (limited to 'lib/pdfrw/pdftokens.py') diff --git a/lib/pdfrw/pdftokens.py b/lib/pdfrw/pdftokens.py deleted file mode 100644 index 04bd559..0000000 --- a/lib/pdfrw/pdftokens.py +++ /dev/null @@ -1,249 +0,0 @@ -# A part of pdfrw (pdfrw.googlecode.com) -# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas -# MIT license -- See LICENSE.txt for details - -''' -A tokenizer for PDF streams. - -In general, documentation used was "PDF reference", -sixth edition, for PDF version 1.7, dated November 2006. - -''' - -from __future__ import generators - -try: - set -except NameError: - from sets import Set as set - -import re -from pdfobjects import PdfString, PdfObject - -class _PrimitiveTokens(object): - - # Table 3.1, page 50 of reference, defines whitespace - whitespaceset = set('\x00\t\n\f\r ') - - - # Text on page 50 defines delimiter characters - delimiterset = set('()<>{}[]/%') - - # Coalesce contiguous whitespace into a single token - whitespace_pattern = '[%s]+' % ''.join(whitespaceset) - - # In addition to the delimiters, we also use '\', which - # is special in some contexts in PDF. - delimiter_pattern = '\\\\|\\' + '|\\'.join(delimiterset) - - # Dictionary delimiters are '<<' and '>>'. Look for - # these before the single variety. - dictdelim_pattern = r'\<\<|\>\>' - - pattern = '(%s|%s|%s)' % (whitespace_pattern, - dictdelim_pattern, delimiter_pattern) - re_func = re.compile(pattern).finditer - del whitespace_pattern, dictdelim_pattern - del delimiter_pattern, pattern - - def __init__(self, fdata): - - class MyIterator(object): - def next(): - if not tokens: - startloc = self.startloc - for match in next_match[0]: - start = match.start() - end = match.end() - tappend(fdata[start:end]) - if start > startloc: - tappend(fdata[startloc:start]) - self.startloc = end - break - else: - s = fdata[startloc:] - self.startloc = len(fdata) - if s: - tappend(s) - if not tokens: - raise StopIteration - return tpop() - next = staticmethod(next) - - self.fdata = fdata - self.tokens = tokens = [] - self.iterator = iterator = MyIterator() - self.next = iterator.next - self.next_match = next_match = [None] - tappend = tokens.append - tpop = tokens.pop - - def setstart(self, startloc): - self.startloc = startloc - self.next_match[0] = self.re_func(self.fdata, startloc) - - def __iter__(self): - return self.iterator - - def coalesce(self, result): - ''' This function coalesces tokens together up until - the next delimiter or whitespace. - All of the coalesced tokens will either be non-matches, - or will be a matched backslash. We distinguish the - non-matches by the fact that next() will have left - a following match inside self.tokens for the actual match. - ''' - tokens = self.tokens - whitespace = self.whitespaceset - - # Optimized path for usual case -- regular data (not a name string), - # with no escape character, and followed by whitespace. - - if tokens: - token = tokens.pop() - if token != '\\': - if token[0] not in whitespace: - tokens.append(token) - return - result.append(token) - - # Non-optimized path. Either start of a name string received, - # or we just had one escape. - - for token in self: - if tokens: - result.append(token) - token = tokens.pop() - if token != '\\': - if token[0] not in whitespace: - tokens.append(token) - return - result.append(token) - - - def floc(self): - return self.startloc - sum([len(x) for x in self.tokens]) - -class PdfTokens(object): - - def __init__(self, fdata, startloc=0, strip_comments=True): - - def comment(token): - tokens = [token] - for token in primitive: - tokens.append(token) - if token[0] in whitespaceset and ('\n' in token or '\r' in token): - break - return not strip_comments and ''.join(tokens) - - def single(token): - return token - - def regular_string(token): - def escaped(): - escaped = False - i = -2 - while tokens[i] == '\\': - escaped = not escaped - i -= 1 - return escaped - - tokens = [token] - nestlevel = 1 - for token in primitive: - tokens.append(token) - if token in '()' and not escaped(): - nestlevel += token == '(' or -1 - if not nestlevel: - break - else: - assert 0, "Unexpected end of token stream" - return PdfString(''.join(tokens)) - - def hex_string(token): - tokens = [token] - for token in primitive: - tokens.append(token) - if token == '>': - break - while tokens[-2] == '>>': - tokens.append(tokens.pop(-2)) - return PdfString(''.join(tokens)) - - def normal_data(token): - - # Obscure optimization -- we can get here with - # whitespace or regular character data. If we get - # here with whitespace, then there won't be an additional - # token queued up in the primitive object, otherwise there - # will... - if primitive_tokens: #if token[0] not in whitespaceset: - tokens = [token] - primitive.coalesce(tokens) - return PdfObject(''.join(tokens)) - - def name_string(token): - tokens = [token] - primitive.coalesce(tokens) - token = ''.join(tokens) - if '#' in token: - substrs = token.split('#') - substrs.reverse() - tokens = [substrs.pop()] - while substrs: - s = substrs.pop() - tokens.append(chr(int(s[:2], 16))) - tokens.append(s[2:]) - token = ''.join(tokens) - return PdfObject(token) - - def broken(token): - assert 0, token - - dispatch = { - '(': regular_string, - ')': broken, - '<': hex_string, - '>': broken, - '[': single, - ']': single, - '{': single, - '}': single, - '/': name_string, - '%' : comment, - '<<': single, - '>>': single, - }.get - - class MyIterator(object): - def next(): - while not tokens: - token = primitive_next() - token = dispatch(token, normal_data)(token) - if token: - return token - return tokens.pop() - next = staticmethod(next) - - self.primitive = primitive = _PrimitiveTokens(fdata) - self.setstart = primitive.setstart - primitive.setstart(startloc) - self.fdata = fdata - self.strip_comments = strip_comments - self.tokens = tokens = [] - self.iterator = iterator = MyIterator() - self.next = iterator.next - primitive_next = primitive.next - primitive_tokens = primitive.tokens - whitespaceset = _PrimitiveTokens.whitespaceset - - def floc(self): - return self.primitive.floc() - sum([len(x) for x in self.tokens]) - floc = property(floc) - - def __iter__(self): - return self.iterator - - def multiple(self, count): - next = self.next - return [next() for i in range(count)] -- cgit v1.3