# Provenance (recovered from the git patch this file was extracted from):
#   commit 9e69adbe1b065707f8be4f146cc3c05660cef711
#   From: jvoisin   Date: Tue, 21 Jun 2011 20:41:18 +0200
#   Subject: Add pdfrw, and many files that I have forgetten, sorry !
#   new file: lib/pdfrw/pdfreader.py (index 0000000..6f57bea), via cgit v1.3
#
# A part of pdfrw (pdfrw.googlecode.com)
# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details

'''
The PdfReader class reads an entire PDF file into memory and
parses the top-level container objects.  (It does not parse
into streams.)  The object subclasses PdfDict, and the
document pages are stored in a list in the pages attribute
of the object.
'''

from pdftokens import PdfTokens
from pdfobjects import PdfDict, PdfArray, PdfName
from pdfcompress import uncompress


class PdfReader(PdfDict):
    ''' In-memory reader for a PDF file.

        The constructor loads the raw file data, parses the
        cross-reference table and the trailer dictionary (whose
        entries are merged into this object), and collects the
        page dictionaries into the pages attribute.
    '''

    class unresolved:
        # Used as a placeholder until we have an object.
        pass

    def readindirect(self, objnum, gennum):
        ''' Read an indirect object.  If it has already
            been read, return it from the cache.

            objnum and gennum may be ints or numeric strings;
            both are converted with int().  A KeyError is raised
            if the xref table had no in-use entry for the pair.
        '''

        def setobj(obj):
            # Store the new object in the dictionary
            # once we have its value.  Containers call this
            # before reading their contents, so a reference
            # back to this object made while it is still being
            # read is answered from the cache.
            record[1] = obj

        def ordinary(source, setobj, obj):
            # Deal with an ordinary (non-array, non-dict) object
            setobj(obj)
            return obj

        fdata, objnum, gennum = self.fdata, int(objnum), int(gennum)
        # record is the mutable [offset, value] list built by parsexref
        record = self.indirect_objects[fdata, objnum, gennum]
        if record[1] is not self.unresolved:
            return record[1]

        # Read the object header and validate it
        source = PdfTokens(fdata, record[0])
        objid = source.multiple(3)
        assert int(objid[0]) == objnum, objid
        assert int(objid[1]) == gennum, objid
        assert objid[2] == 'obj', objid

        # Read the object, and call special code if it starts
        # an array or dictionary
        obj = source.next()
        obj = self.special.get(obj, ordinary)(source, setobj, obj)
        self.readstream(obj, source)
        # NOTE(review): assumes every token/object type accepts
        # attribute assignment -- confirm against pdftokens/pdfobjects.
        obj.indirect = True
        return obj

    def readstream(obj, source):
        ''' Read optional stream following a dictionary
            object.

            obj is the object just parsed and source is the
            tokenizer positioned right after it.  If the next
            token is 'endobj' nothing happens; otherwise the raw
            stream data is stored in obj._stream.
        '''
        tok = source.next()
        if tok == 'endobj':
            return  # No stream

        assert isinstance(obj, PdfDict)
        assert tok == 'stream', tok
        fdata = source.fdata
        # Find the end of the 'stream' keyword in the raw data by
        # searching backwards from the tokenizer's current location.
        floc = fdata.rindex(tok, 0, source.floc) + len(tok)
        ch = fdata[floc]
        if ch == '\r':
            floc += 1
            ch = fdata[floc]
        assert ch == '\n'
        startstream = floc + 1
        endstream = startstream + int(obj.Length)
        obj._stream = fdata[startstream:endstream]
        source = PdfTokens(fdata, endstream)
        endit = source.multiple(2)
        if endit != 'endstream endobj'.split():
            # /Length attribute is broken, try to read stream
            # anyway disregarding the specified value
            # TODO: issue warning here once we have some kind of
            #       logging
            endstream = fdata.index('endstream', startstream)
            # Drop the end-of-line marker preceding 'endstream'
            if fdata[endstream-2:endstream] == '\r\n':
                endstream -= 2
            elif fdata[endstream-1] in ['\n', '\r']:
                endstream -= 1
            source = PdfTokens(fdata, endstream)
            endit = source.multiple(2)
            assert endit == 'endstream endobj'.split()
            # Record the length actually found (stored as a string)
            obj.Length = str(endstream-startstream)
            obj._stream = fdata[startstream:endstream]
    readstream = staticmethod(readstream)

    def readarray(self, source, setobj=lambda x:None, original=None):
        ''' Read an array whose opening '[' has already been
            consumed from source, and return it as a PdfArray.

            setobj is called with the (still empty) array before
            its items are read.  original is accepted so that
            this method has the same signature as the other
            handlers in self.special; it is not used.
        '''
        special = self.special
        result = PdfArray()
        setobj(result)

        for value in source:
            if value == ']':
                break
            if value in special:
                # Nested array or dictionary
                value = special[value](source)
            elif value == 'R':
                # Indirect reference: the object and generation
                # numbers were already appended as plain items.
                generation = result.pop()
                value = self.readindirect(result.pop(), generation)
            result.append(value)
        return result

    def readdict(self, source, setobj=lambda x:None, original=None):
        ''' Read a dictionary whose opening '<<' has already been
            consumed from source, and return it as a PdfDict.

            setobj is called with the (still empty) dictionary
            before its entries are read.  original is accepted so
            that this method has the same signature as the other
            handlers in self.special; it is not used.
        '''
        special = self.special
        result = PdfDict()
        setobj(result)

        tok = source.next()
        while tok != '>>':
            assert tok.startswith('/'), (tok, source.multiple(10))
            key = tok
            value = source.next()
            if value in special:
                # Nested array or dictionary
                value = special[value](source)
                tok = source.next()
            else:
                # Look one token ahead: two consecutive integers
                # must be the start of an indirect reference.
                tok = source.next()
                if value.isdigit() and tok.isdigit():
                    assert source.next() == 'R'
                    value = self.readindirect(value, tok)
                    tok = source.next()
            result[key] = value

        return result

    def readxref(fdata):
        ''' Locate the last 'startxref' in fdata and validate
            the file trailer that follows it.

            Returns (startloc, source) where startloc is the
            index of 'startxref' in fdata and source is a
            PdfTokens positioned at the offset given after it.
        '''
        startloc = fdata.rindex('startxref')
        # NOTE(review): the meaning of the third PdfTokens argument
        # is defined in pdftokens; the checks below rely on '%%EOF'
        # coming back as a token -- confirm it controls comment
        # handling.
        xrefinfo = list(PdfTokens(fdata, startloc, False))
        assert len(xrefinfo) == 3, xrefinfo
        assert xrefinfo[0] == 'startxref', xrefinfo[0]
        assert xrefinfo[1].isdigit(), xrefinfo[1]
        assert xrefinfo[2].rstrip() == '%%EOF', repr(xrefinfo[2])
        return startloc, PdfTokens(fdata, int(xrefinfo[1]))
    readxref = staticmethod(readxref)

    def parsexref(self, source):
        ''' Parse the cross-reference table from source, stopping
            after the 'trailer' keyword.

            Each entry marked 'n' is recorded in
            self.indirect_objects under the key
            (fdata, objnum, generation) as [offset, unresolved];
            other entries are skipped.
        '''
        tok = source.next()
        assert tok == 'xref', tok
        while 1:
            tok = source.next()
            if tok == 'trailer':
                break
            # Subsection header: first object number, then count
            startobj = int(tok)
            for objnum in range(startobj, startobj + int(source.next())):
                offset = int(source.next())
                generation = int(source.next())
                if source.next() == 'n':
                    objid = self.fdata, objnum, generation
                    objval = [offset, self.unresolved]
                    # setdefault keeps an entry that is already present
                    self.indirect_objects.setdefault(objid, objval)

    pagename = PdfName.Page
    pagesname = PdfName.Pages

    def readpages(self, node):
        ''' Return a flat list of the page dictionaries found
            under node, in the order of their Kids entries.
        '''
        # PDFs can have arbitrarily nested Pages/Page
        # dictionary structures.
        if node.Type == self.pagename:
            return [node]
        assert node.Type == self.pagesname, node.Type
        result = []
        for kid in node.Kids:
            result.extend(self.readpages(kid))
        return result

    def __init__(self, fname=None, fdata=None, decompress=True):
        ''' Read and parse a PDF.

            fname      -- a file name, or an object with a read()
                          method; must not be combined with fdata
            fdata      -- the raw contents of a PDF file
            decompress -- if true, call self.uncompress() once the
                          pages have been collected
        '''

        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                f = open(fname, 'rb')
                # Close the file even if the read fails
                try:
                    fdata = f.read()
                finally:
                    f.close()

        assert fdata is not None
        # Strip trailing NUL bytes so they do not follow '%%EOF'
        fdata = fdata.rstrip('\00')
        # NOTE(review): self.private is not defined in this file;
        # presumably PdfDict provides it for attributes that are
        # not dictionary entries -- confirm in pdfobjects.
        self.private.fdata = fdata

        self.private.indirect_objects = {}
        self.private.special = {'<<': self.readdict, '[': self.readarray}

        startloc, source = self.readxref(fdata)
        self.parsexref(source)
        # The trailer dictionary follows the xref table; its entries
        # become the entries of this object.
        assert source.next() == '<<'
        self.update(self.readdict(source))
        assert source.next() == 'startxref' and source.floc > startloc
        self.private.pages = self.readpages(self.Root.Pages)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        self.private.numPages = len(self.pages)

    # For compatibility with pyPdf
    def getPage(self, pagenum):
        ''' Return the page at index pagenum of self.pages. '''
        return self.pages[pagenum]

    def uncompress(self):
        ''' Pass the value of every recorded indirect object to
            pdfcompress.uncompress.
        '''
        uncompress([x[1] for x in self.indirect_objects.itervalues()])