From 9e69adbe1b065707f8be4f146cc3c05660cef711 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Tue, 21 Jun 2011 20:41:18 +0200 Subject: Add pdfrw, and many files that I have forgetten, sorry ! --- lib/pdfrw/buildxobj.py | 191 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 lib/pdfrw/buildxobj.py (limited to 'lib/pdfrw/buildxobj.py') diff --git a/lib/pdfrw/buildxobj.py b/lib/pdfrw/buildxobj.py new file mode 100644 index 0000000..203dd8c --- /dev/null +++ b/lib/pdfrw/buildxobj.py @@ -0,0 +1,191 @@ +# A part of pdfrw (pdfrw.googlecode.com) +# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas +# MIT license -- See LICENSE.txt for details + +''' + +This module contains code to build PDF "Form XObjects". + +A Form XObject allows a fragment from one PDF file to be cleanly +included in another PDF file. + +Reference for syntax: "Parameters for opening PDF files" from SDK 8.1 + + http://www.adobe.com/devnet/acrobat/pdfs/pdf_open_parameters.pdf + + supported 'page=xxx', 'viewrect=,,,' + + Units are in points + +Reference for content: Adobe PDF reference, sixth edition, version 1.7 + + http://www.adobe.com/devnet/acrobat/pdfs/pdf_reference_1-7.pdf + + Form xobjects discussed chapter 4.9, page 355 +''' + +from pdfobjects import PdfDict, PdfArray, PdfName +from pdfreader import PdfReader + +class ViewInfo(object): + ''' Instantiate ViewInfo with a uri, and it will parse out + the filename, page, and viewrect into object attributes. + ''' + doc = None + docname = None + page = None + viewrect = None + + def __init__(self, pageinfo='', **kw): + pageinfo=pageinfo.split('#',1) + if len(pageinfo) == 2: + pageinfo[1:] = pageinfo[1].replace('&', '#').split('#') + for key in 'page viewrect'.split(): + if pageinfo[0].startswith(key+'='): + break + else: + self.docname = pageinfo.pop(0) + for item in pageinfo: + key, value = item.split('=') + key = key.strip() + value = value.replace(',', ' ').split() + if key == 'page': + assert len(value) == 1 + setattr(self, key, int(value[0])) + elif key == 'viewrect': + assert len(value) == 4 + setattr(self, key, [float(x) for x in value]) + else: + log.error('Unknown option: %s', key) + for key, value in kw.iteritems(): + assert hasattr(self, key), key + setattr(self, key, value) + +def getrects(inheritable, pageinfo): + ''' Given the inheritable attributes of a page and + the desired pageinfo rectangle, return the page's + media box and the calculated boundary (clip) box. + ''' + mbox = tuple([float(x) for x in inheritable.MediaBox]) + vrect = pageinfo.viewrect + if vrect is None: + cbox = tuple([float(x) for x in (inheritable.CropBox or mbox)]) + else: + mleft, mbot, mright, mtop = mbox + x, y, w, h = vrect + cleft = mleft + x + ctop = mtop - y + cright = cleft + w + cbot = ctop - h + cbox = max(mleft, cleft), max(mbot, cbot), min(mright, cright), min(mtop, ctop) + return mbox, cbox + +def _cache_xobj(contents, resources, mbox, bbox): + ''' Return a cached Form XObject, or create a new one and cache it. + ''' + cachedict = contents.xobj_cachedict + if cachedict is None: + cachedict = contents.private.xobj_cachedict = {} + result = cachedict.get(bbox) + if result is None: + func = (_get_fullpage, _get_subpage)[mbox != bbox] + result = PdfDict( + func(contents, resources, mbox, bbox), + Type = PdfName.XObject, + Subtype = PdfName.Form, + FormType = 1, + BBox = PdfArray(bbox), + ) + cachedict[bbox] = result + return result + +def _get_fullpage(contents, resources, mbox, bbox): + ''' fullpage is easy. Just copy the contents, + set up the resources, and let _cache_xobj handle the + rest. + ''' + return PdfDict(contents, Resources=resources) + +def _get_subpage(contents, resources, mbox, bbox): + ''' subpages *could* be as easy as full pages, but we + choose to complicate life by creating a Form XObject + for the page, and then one that references it for + the subpage, on the off-chance that we want multiple + items from the page. + ''' + return PdfDict( + stream = '/FullPage Do\n', + Resources = PdfDict( + XObject = PdfDict( + FullPage = _cache_xobj(contents, resources, mbox, mbox) + ) + ) + ) + +def pagexobj(page, viewinfo=ViewInfo(), allow_compressed=True): + ''' pagexobj creates and returns a Form XObject for + a given view within a page (Defaults to entire page.) + ''' + inheritable = page.inheritable + resources = inheritable.Resources + mbox, bbox = getrects(inheritable, viewinfo) + contents = page.Contents + # Make sure the only attribute is length + # All the filters must have been executed + assert int(contents.Length) == len(contents.stream) + if not allow_compressed: + assert len([x for x in contents.iteritems()]) == 1 + + return _cache_xobj(contents, resources, mbox, bbox) + + +def docxobj(pageinfo, doc=None, allow_compressed=True): + ''' docxobj creates and returns an actual Form XObject. + Can work standalone, or in conjunction with + the CacheXObj class (below). + ''' + if not isinstance(pageinfo, ViewInfo): + pageinfo = ViewInfo(pageinfo) + + # If we're explicitly passed a document, + # make sure we don't have one implicitly as well. + # If no implicit or explicit doc, then read one in + # from the filename. + if doc is not None: + assert pageinfo.doc is None + pageinfo.doc = doc + elif pageinfo.doc is not None: + doc = pageinfo.doc + else: + doc = pageinfo.doc = PdfReader(pageinfo.docname, decompress = not allow_compressed) + assert isinstance(doc, PdfReader) + + sourcepage = doc.pages[(pageinfo.page or 1) - 1] + return pagexobj(sourcepage, pageinfo, allow_compressed) + + +class CacheXObj(object): + ''' Use to keep from reparsing files over and over, + and to keep from making the output too much + bigger than it ought to be by replicating + unnecessary object copies. + ''' + def __init__(self, decompress=False): + ''' Set decompress true if you need + the Form XObjects to be decompressed. + Will decompress what it can and scream + about the rest. + ''' + self.cached_pdfs = {} + self.decompress = decompress + + def load(self, sourcename): + ''' Load a Form XObject from a uri + ''' + info = ViewInfo(sourcename) + fname = info.docname + pcache = self.cached_pdfs + doc = pcache.get(fname) + if doc is None: + doc = pcache[fname] = PdfReader(fname, decompress=self.decompress) + return docxobj(info, doc, allow_compressed=not self.decompress) -- cgit v1.3