# Recovered from a whitespace-collapsed patch: commit
# 4bd3e47da02fde08acfada1795cc55170abdb00a ("setup.py now works !",
# jvoisin, Tue, 16 Aug 2011), which deleted lib/pdfrw/pdfreader.py.

# A part of pdfrw (pdfrw.googlecode.com)
# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details

'''
The PdfReader class reads an entire PDF file into memory and
parses the top-level container objects.  (It does not parse
into streams.)  The object subclasses PdfDict, and the
document pages are stored in a list in the pages attribute
of the object.
'''

from pdftokens import PdfTokens
from pdfobjects import PdfDict, PdfArray, PdfName
from pdfcompress import uncompress

class PdfReader(PdfDict):
    ''' In-memory PDF reader.

        Construction reads the raw file data, parses the
        cross-reference table and the trailer dictionary, merges
        the trailer entries into this object via update(), and
        collects the leaf page dictionaries into self.pages.

        State is stored through self.private (provided by
        PdfDict -- presumably so these names do not become PDF
        dictionary entries; confirm against pdfobjects):

          fdata             the complete raw file contents
          indirect_objects  dict keyed by (fdata, objnum, gennum)
                            whose values are [offset, object]
                            lists; object is the `unresolved`
                            class until the object has been read
          special           maps the tokens '<<' and '[' to the
                            bound readdict / readarray methods
          pages             list of page dictionaries
          numPages          len(pages), for pyPdf compatibility
    '''

    class unresolved:
        # Used as a placeholder until we have an object.
        pass

    def readindirect(self, objnum, gennum):
        ''' Read an indirect object.  If it has already
            been read, return it from the cache.

            objnum and gennum may be anything int() accepts
            (tokens or integers).  The object must already have
            an entry in self.indirect_objects, otherwise the
            dictionary lookup raises KeyError.

            The returned object has its `indirect` attribute set
            to True.  Header mismatches are reported through
            assert statements.
        '''

        def setobj(obj):
            # Store the new object in the dictionary
            # once we have its value
            record[1] = obj

        def ordinary(source, setobj, obj):
            # Deal with an ordinary (non-array, non-dict) object
            setobj(obj)
            return obj

        fdata, objnum, gennum = self.fdata, int(objnum), int(gennum)
        record = self.indirect_objects[fdata, objnum, gennum]
        if record[1] is not self.unresolved:
            return record[1]

        # Read the object header and validate it
        source = PdfTokens(fdata, record[0])
        objid = source.multiple(3)
        assert int(objid[0]) == objnum, objid
        assert int(objid[1]) == gennum, objid
        assert objid[2] == 'obj', objid

        # Read the object, and call special code if it starts
        # an array or dictionary.  The container readers call
        # setobj() on the (still empty) container before parsing
        # its contents, so a nested reference back to this same
        # object is answered from the cache instead of recursing
        # again.
        obj = source.next()
        obj = self.special.get(obj, ordinary)(source, setobj, obj)
        self.readstream(obj, source)
        obj.indirect = True
        return obj

    def readstream(obj, source):
        ''' Read optional stream following a dictionary
            object.

            If the next token is 'endobj' there is no stream and
            nothing happens.  Otherwise obj must be a PdfDict and
            the token must be 'stream'; the raw stream bytes are
            stored, undecoded, in obj._stream.

            The stream length is taken from obj.Length.  When the
            data at that offset is not followed by the tokens
            'endstream' 'endobj', the end is located by searching
            for the 'endstream' keyword instead, and obj.Length is
            overwritten with the length actually found (as a
            string).
        '''
        tok = source.next()
        if tok == 'endobj':
            return  # No stream

        assert isinstance(obj, PdfDict)
        assert tok == 'stream', tok
        fdata = source.fdata
        # Find where the 'stream' keyword ends by searching
        # backwards from the tokenizer position (source.floc is
        # presumably just past the token -- confirm in pdftokens).
        floc = fdata.rindex(tok, 0, source.floc) + len(tok)
        # The keyword must be followed by LF or by CR LF; the
        # stream data starts immediately after that.
        ch = fdata[floc]
        if ch == '\r':
            floc += 1
            ch = fdata[floc]
        assert ch == '\n'
        startstream = floc + 1
        endstream = startstream + int(obj.Length)
        obj._stream = fdata[startstream:endstream]
        source = PdfTokens(fdata, endstream)
        endit = source.multiple(2)
        if endit != 'endstream endobj'.split():
            # /Length attribute is broken, try to read stream
            # anyway disregarding the specified value
            # TODO: issue warning here once we have some kind of
            # logging
            endstream = fdata.index('endstream', startstream)
            # Do not count the end-of-line marker that precedes
            # the 'endstream' keyword as stream data.
            if fdata[endstream-2:endstream] == '\r\n':
                endstream -= 2
            elif fdata[endstream-1] in ['\n', '\r']:
                endstream -= 1
            source = PdfTokens(fdata, endstream)
            endit = source.multiple(2)
            assert endit == 'endstream endobj'.split()
            obj.Length = str(endstream-startstream)
            obj._stream = fdata[startstream:endstream]
    readstream = staticmethod(readstream)

    def readarray(self, source, setobj=lambda x:None, original=None):
        ''' Read tokens from source up to the closing ']' and
            return them as a PdfArray.

            setobj is called with the empty array before any
            element is read.  Nested arrays and dictionaries are
            read recursively through self.special.  An 'R' token
            replaces the two preceding elements (object number,
            then generation number) with the indirect object they
            refer to.  The `original` parameter is not used here;
            it receives the opening token when this method is
            dispatched from readindirect.
        '''
        special = self.special
        result = PdfArray()
        setobj(result)

        for value in source:
            if value == ']':
                break
            if value in special:
                value = special[value](source)
            elif value == 'R':
                # The generation number was appended last, so it
                # is popped first.
                generation = result.pop()
                value = self.readindirect(result.pop(), generation)
            result.append(value)
        return result

    def readdict(self, source, setobj=lambda x:None, original=None):
        ''' Read key/value pairs from source up to the closing
            '>>' and return them as a PdfDict.

            setobj is called with the empty dictionary before any
            entry is read.  Every key must start with '/'.  A
            value is a nested container (read through
            self.special), an indirect reference of the form
            "objnum gennum R" (resolved with readindirect), or a
            single token.  The `original` parameter is not used
            here; it receives the opening token when this method
            is dispatched from readindirect.
        '''
        special = self.special
        result = PdfDict()
        setobj(result)

        tok = source.next()
        while tok != '>>':
            assert tok.startswith('/'), (tok, source.multiple(10))
            key = tok
            value = source.next()
            if value in special:
                value = special[value](source)
                tok = source.next()
            else:
                # One token of lookahead is needed to tell a plain
                # value from an indirect reference: two
                # consecutive all-digit tokens are taken to be
                # "objnum gennum" and must be followed by 'R'.
                tok = source.next()
                if value.isdigit() and tok.isdigit():
                    assert source.next() == 'R'
                    value = self.readindirect(value, tok)
                    tok = source.next()
            result[key] = value

        return result

    def readxref(fdata):
        ''' Locate the cross-reference table.

            Finds the last 'startxref' keyword in fdata and
            checks that exactly three tokens follow from there:
            'startxref', an all-digit offset, and '%%EOF'.

            Returns (startloc, source): the position of the
            'startxref' keyword, and a PdfTokens instance
            positioned at the offset that was given after it.
        '''
        startloc = fdata.rindex('startxref')
        # NOTE(review): the third PdfTokens argument (False)
        # presumably keeps comments such as '%%EOF' in the token
        # stream -- confirm against pdftokens.
        xrefinfo = list(PdfTokens(fdata, startloc, False))
        assert len(xrefinfo) == 3, xrefinfo
        assert xrefinfo[0] == 'startxref', xrefinfo[0]
        assert xrefinfo[1].isdigit(), xrefinfo[1]
        assert xrefinfo[2].rstrip() == '%%EOF', repr(xrefinfo[2])
        return startloc, PdfTokens(fdata, int(xrefinfo[1]))
    readxref = staticmethod(readxref)

    def parsexref(self, source):
        ''' Parse the cross-reference table from source.

            Expects the 'xref' keyword followed by subsections,
            each consisting of a starting object number, an entry
            count, and that many (offset, generation, flag)
            entries.  Parsing stops after the 'trailer' keyword
            has been consumed.

            Each entry flagged 'n' is recorded in
            self.indirect_objects as [offset, unresolved].
            setdefault is used, so an entry that is already
            present is left untouched.
        '''
        tok = source.next()
        assert tok == 'xref', tok
        while 1:
            tok = source.next()
            if tok == 'trailer':
                break
            startobj = int(tok)
            for objnum in range(startobj, startobj + int(source.next())):
                offset = int(source.next())
                generation = int(source.next())
                if source.next() == 'n':
                    objid = self.fdata, objnum, generation
                    objval = [offset, self.unresolved]
                    self.indirect_objects.setdefault(objid, objval)

    # /Type values compared against in readpages
    pagename = PdfName.Page
    pagesname = PdfName.Pages

    def readpages(self, node):
        ''' Return the leaf page dictionaries found under node,
            as a flat list in the order the Kids are visited.

            node.Type must equal either pagename (a leaf, which
            is returned as a one-element list) or pagesname (an
            intermediate node whose Kids are visited
            recursively).
        '''
        # PDFs can have arbitrarily nested Pages/Page
        # dictionary structures.
        if node.Type == self.pagename:
            return [node]
        assert node.Type == self.pagesname, node.Type
        result = []
        for node in node.Kids:
            result.extend(self.readpages(node))
        return result

    def __init__(self, fname=None, fdata=None, decompress=True):
        ''' Read and parse a PDF.

            fname       a file name, or any object with a read()
                        method (for pyPdf compatibility)
            fdata       the raw file contents; exactly one of
                        fname and fdata must be given
            decompress  when true, self.uncompress() is called
                        once the page list has been built

            Violations of the expected file structure are
            reported through assert statements.
        '''

        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                f = open(fname, 'rb')
                try:
                    fdata = f.read()
                finally:
                    # Release the handle even when read() fails
                    f.close()

        assert fdata is not None
        # Trailing NUL bytes are dropped before the data is
        # searched for 'startxref'.
        fdata = fdata.rstrip('\00')
        self.private.fdata = fdata

        self.private.indirect_objects = {}
        self.private.special = {'<<': self.readdict, '[': self.readarray}

        startloc, source = self.readxref(fdata)
        self.parsexref(source)
        # The trailer dictionary follows the 'trailer' keyword;
        # its entries become entries of this object.
        assert source.next() == '<<'
        self.update(self.readdict(source))
        assert source.next() == 'startxref' and source.floc > startloc
        self.private.pages = self.readpages(self.Root.Pages)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        self.private.numPages = len(self.pages)


    # For compatibility with pyPdf
    def getPage(self, pagenum):
        ''' Return the page dictionary at index pagenum of
            self.pages.
        '''
        return self.pages[pagenum]

    def uncompress(self):
        ''' Pass the object slot of every cross-reference record
            to pdfcompress.uncompress.  Slots that were never
            resolved still hold the `unresolved` placeholder.
        '''
        uncompress([x[1] for x in self.indirect_objects.itervalues()])