class MpegAudioStripper(parser.Generic_parser):
    '''
        Stripper for MPEG audio files: it singles out the ID3 tag
        fields (presumably for removal by parser.Generic_parser --
        the base class is not visible from here)
    '''
    def _should_remove(self, field):
        '''
            Return True when field is named "id3v1" or "id3v2",
            False for every other field
        '''
        return field.name in ("id3v1", "id3v2")
class PdfStripper(parser.Generic_parser):
    '''
        Represent a pdf file, with the help of pdfrw
    '''
    # Entries of the /Info dictionary that remove_all() blanks
    _INFO_FIELDS = ('Title', 'Author', 'Producer', 'Creator',
                    'CreationDate', 'ModDate')

    # Values that carry no metadata: the '' stored by remove_all(), and
    # the empty literal / hex strings ('' is written out as '()' by
    # pdfrw's PdfString.encode, so that is what a cleaned file contains)
    _BLANK_VALUES = ('', '()', '<>')

    def __init__(self, filename):
        '''
            Parse filename with pdfrw and prepare a writer for the
            cleaned copy
        '''
        self.filename = filename
        self.trailer = pdfrw.PdfReader(self.filename)
        self.writer = pdfrw.PdfWriter()

    def remove_all(self):
        '''
            Blank the compromising fields of the /Info dictionary,
            then write the result to filename + parser.POSTFIX
        '''
        info = self.trailer.Info
        # A PDF is not required to have an /Info dictionary; in that
        # case there is nothing to blank, but the copy is still written
        if info is not None:
            for field in self._INFO_FIELDS:
                setattr(info, field, '')

        self.writer.trailer = self.trailer
        self.writer.write(self.filename + parser.POSTFIX)

    def is_clean(self):
        '''
            Check if the file is clean from harmful metadata:
            return True when /Info is absent or when all of its
            values are blank, False otherwise
        '''
        info = self.trailer.Info
        if info is None:
            return True
        # Look at the values: the keys are PDF names ('/Title', ...)
        # and are never empty, so testing them would always fail
        for key, value in info.iteritems():
            if value not in self._BLANK_VALUES:
                return False
        return True

    def get_meta(self):
        '''
            Return a dict with all the metadata of the file; the
            leading '/' of each key and the delimiters around each
            value are stripped
        '''
        metadata = {}
        info = self.trailer.Info
        if info is None:
            return metadata
        for key, value in info.iteritems():
            metadata[key[1:]] = value[1:-1]
        return metadata
class ViewInfo(object):
    ''' Instantiate ViewInfo with a uri, and it will parse out
        the filename, page, and viewrect into object attributes.

        The uri has the form  docname#page=N&viewrect=a,b,c,d
        where the docname and each option may be left out.  Options
        may also be given (or overridden) as keyword arguments; only
        the attributes declared on the class are accepted.

        Raises AssertionError for a page/viewrect with the wrong
        number of values or for an unknown keyword argument, and
        ValueError for an option that is not of the form key=value.
        An unknown option inside the uri is logged and ignored.
    '''
    doc = None
    docname = None
    page = None
    viewrect = None

    def __init__(self, pageinfo='', **kw):
        pageinfo = pageinfo.split('#', 1)
        if len(pageinfo) == 2:
            # Options may be separated by either '&' or '#'
            pageinfo[1:] = pageinfo[1].replace('&', '#').split('#')

        # The first element is the document name unless it already
        # looks like one of the known options.
        for key in 'page viewrect'.split():
            if pageinfo[0].startswith(key + '='):
                break
        else:
            self.docname = pageinfo.pop(0)

        for item in pageinfo:
            key, value = item.split('=')
            key = key.strip()
            value = value.replace(',', ' ').split()
            if key == 'page':
                assert len(value) == 1
                setattr(self, key, int(value[0]))
            elif key == 'viewrect':
                assert len(value) == 4
                setattr(self, key, [float(x) for x in value])
            else:
                # This module never defined the 'log' name it used
                # here, so reporting an unknown option raised a
                # NameError.  Use the standard logging module instead.
                import logging
                logging.getLogger(__name__).error('Unknown option: %s', key)

        for key, value in kw.items():
            assert hasattr(self, key), key
            setattr(self, key, value)
def pagexobj(page, viewinfo=ViewInfo(), allow_compressed=True):
    ''' Return the Form XObject for a view of page, going through
        _cache_xobj so that the same view is only built once.

        viewinfo supplies the view rectangle to getrects (the
        default ViewInfo has none).  The contents must have a
        /Length matching the stream; when allow_compressed is false
        the contents must additionally carry exactly one entry
        (i.e. no filter left).
    '''
    inherited = page.inheritable
    page_resources = inherited.Resources
    boxes = getrects(inherited, viewinfo)
    media_box = boxes[0]
    clip_box = boxes[1]
    stream_dict = page.Contents

    # The declared length has to agree with the data we hold
    declared = int(stream_dict.Length)
    assert declared == len(stream_dict.stream)

    # With compression disallowed, nothing but the length may remain
    if not allow_compressed:
        assert len(list(stream_dict.iteritems())) == 1

    return _cache_xobj(stream_dict, page_resources, media_box, clip_box)
class CacheXObj(object):
    ''' Remembers every PdfReader it has opened, keyed by document
        name, so that asking for several Form XObjects out of the
        same file parses that file only once.
    '''
    def __init__(self, decompress=False):
        ''' decompress is handed to each PdfReader created by
            load(); its negation is passed to docxobj as
            allow_compressed.
        '''
        self.decompress = decompress
        self.cached_pdfs = {}

    def load(self, sourcename):
        ''' Return the Form XObject described by the uri sourcename
            (see ViewInfo for the syntax).
        '''
        view = ViewInfo(sourcename)
        name = view.docname
        reader = self.cached_pdfs.get(name)
        if reader is None:
            # First request for this document: parse and remember it
            reader = PdfReader(name, decompress=self.decompress)
            self.cached_pdfs[name] = reader
        return docxobj(view, reader, allow_compressed=not self.decompress)
def uncompress(mylist, warnings=set()):
    ''' Decompress, in place, the flate-encoded stream objects
        found in mylist.

        Objects without a /Filter are left alone.  Objects using any
        other filter, or flate with /DecodeParms, are not touched; a
        message is printed for them instead.  Messages already in
        warnings are not printed again -- note that the default set
        is created once and therefore shared by all the calls that
        do not pass their own.
    '''
    flate_name = PdfName.FlateDecode
    for obj in streamobjects(mylist):
        filt = obj.Filter
        if filt is None:
            continue
        # todo: multiple filters
        if isinstance(filt, list) and len(filt) == 1:
            filt = filt[0]
        decode_parms = obj.DecodeParms

        if filt == flate_name and decode_parms is None:
            obj.stream = zlib.decompress(obj.stream)
            obj.Filter = None
            continue

        msg = 'Not decompressing: cannot use filter %s with parameters %s' % (repr(filt), repr(decode_parms))
        if msg in warnings:
            continue
        warnings.add(msg)
        print(msg)
class PdfString(str):
    ''' A PDF string, kept in its encoded form: either a literal
        string '(...)' or a hexadecimal string '<...>', delimiters
        included.  decode() returns the text it stands for, and the
        encode() class method builds a literal string from text.
    '''
    indirect = False

    # Replacement text for each escape sequence.  A backslash followed
    # by an end of line is a line continuation, and a lone backslash
    # is dropped (which is what turns '\(' into '(').
    unescape_dict = {'\\b':'\b', '\\f':'\f', '\\n':'\n',
                     '\\r':'\r', '\\t':'\t',
                     '\\\r\n': '', '\\\r':'', '\\\n':'',
                     '\\\\':'\\', '\\':'',
                    }

    # The escaped backslash has to be tried first: without it, the
    # final alternative split '\\' into two lone backslashes which were
    # both dropped (or the second one was read as the start of another
    # escape).  Octal escapes are one to three octal digits.
    unescape_pattern = r'(\\\\|\\b|\\f|\\n|\\r|\\t|\\\r\n|\\\r|\\\n|\\[0-7]{1,3}|\\)'
    unescape_func = re.compile(unescape_pattern).split

    hex_pattern = '([a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])'
    hex_func = re.compile(hex_pattern).split

    hex_pattern2 = '([a-fA-F0-9][a-fA-F0-9][a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])'
    hex_func2 = re.compile(hex_pattern2).split

    # Indexed by the 'twobytes' flag of decode_hex()
    hex_funcs = hex_func, hex_func2

    def decode_regular(self, remap=chr):
        ''' Decode a literal string '(...)'.  remap converts the
            value of an octal escape into a character.
        '''
        assert self[0] == '(' and self[-1] == ')'
        # Splitting on a pattern with a group yields the plain text
        # and the escape sequences alternately
        mylist = self.unescape_func(self[1:-1])
        result = []
        unescape = self.unescape_dict.get
        for chunk in mylist:
            chunk = unescape(chunk, chunk)
            # Only octal escapes are still longer than one character
            # and start with a backslash at this point
            if chunk.startswith('\\') and len(chunk) > 1:
                value = int(chunk[1:], 8)
                # FIXME: TODO: Handle unicode here
                if value > 127:
                    value = 127
                chunk = remap(value)
            if chunk:
                result.append(chunk)
        return ''.join(result)

    def decode_hex(self, remap=chr, twobytes=False):
        ''' Decode a hexadecimal string '<...>'.  Whitespace is
            ignored.  Digits are taken two at a time, or four at a
            time when twobytes is true, and passed to remap.
        '''
        data = ''.join(self.split())
        data = self.hex_funcs[twobytes](data)
        chars = data[1::2]
        other = data[0::2]
        # Once the digits are taken out, only the delimiters may remain
        assert other[0] == '<' and other[-1] == '>' and ''.join(other) == '<>', self
        return ''.join([remap(int(x, 16)) for x in chars])

    def decode(self, remap=chr, twobytes=False):
        ''' Decode the string, whichever of the two forms it has.
        '''
        if self.startswith('('):
            return self.decode_regular(remap)

        else:
            return self.decode_hex(remap, twobytes)

    def encode(cls, source, usehex=False):
        ''' Return source as a literal PdfString, with backslashes
            and parentheses escaped.  Unicode text is encoded as
            UTF-8 first.  Hexadecimal output is not supported.
        '''
        assert not usehex, "Not supported yet"
        if isinstance(source, unicode):
            source = source.encode('utf-8')
        else:
            source = str(source)
        # Backslashes first, so that the ones added below are kept
        source = source.replace('\\', '\\\\')
        source = source.replace('(', '\\(')
        source = source.replace(')', '\\)')
        return cls('(' +source + ')')
    encode = classmethod(encode)
def readindirect(self, objnum, gennum):
    ''' Read an indirect object.  If it has already
        been read, return it from the cache.

        objnum and gennum are the object and generation numbers;
        they are converted with int(), so tokens are accepted.
        Raises KeyError when the object is not in
        self.indirect_objects, and AssertionError when the header
        found at the recorded offset does not match.
    '''

    def setobj(obj):
        # Store the new object in the dictionary
        # once we have its value
        record[1] = obj

    def ordinary(source, setobj, obj):
        # Deal with an ordinary (non-array, non-dict) object
        setobj(obj)
        return obj

    fdata, objnum, gennum = self.fdata, int(objnum), int(gennum)
    # Each record is a two-item list: [offset, object or unresolved]
    record = self.indirect_objects[fdata, objnum, gennum]
    if record[1] is not self.unresolved:
        return record[1]

    # Read the object header and validate it
    source = PdfTokens(fdata, record[0])
    objid = source.multiple(3)
    assert int(objid[0]) == objnum, objid
    assert int(objid[1]) == gennum, objid
    assert objid[2] == 'obj', objid

    # Read the object, and call special code if it starts
    # an array or dictionary
    obj = source.next()
    # The reader is handed setobj so that it can record the object
    # before reading what it contains: a reference back to this same
    # object then finds it in the cache above.
    obj = self.special.get(obj, ordinary)(source, setobj, obj)
    self.readstream(obj, source)
    obj.indirect = True
    return obj
def readxref(fdata):
    ''' Find the last 'startxref' of fdata and check what follows
        it.  Return its position, together with a tokenizer placed
        at the offset it gives.
    '''
    startloc = fdata.rindex('startxref')
    # Comments are kept (third argument), as '%%EOF' is one of them
    tail = list(PdfTokens(fdata, startloc, False))
    assert len(tail) == 3, tail
    keyword, offset, marker = tail
    assert keyword == 'startxref', keyword
    assert offset.isdigit(), offset
    assert marker.rstrip() == '%%EOF', repr(marker)
    return startloc, PdfTokens(fdata, int(offset))
readxref = staticmethod(readxref)
def __init__(self, fname=None, fdata=None, decompress=True):
    ''' Read and parse a whole PDF file.

        fname      -- a file name, or an object with a read() method
        fdata      -- the contents of the file, when already read
                      (fname and fdata are mutually exclusive)
        decompress -- when true, self.uncompress() is called once
                      the pages have been collected

        The trailer dictionary is loaded into the object itself.
        Everything set through self.private becomes a plain
        attribute, and is not a part of the PDF dictionary.
    '''

    if fname is not None:
        assert fdata is None
        # Allow reading preexisting streams like pyPdf
        if hasattr(fname, 'read'):
            fdata = fname.read()
        else:
            f = open(fname, 'rb')
            try:
                fdata = f.read()
            finally:
                # Do not leak the handle when read() fails
                f.close()

    assert fdata is not None
    fdata = fdata.rstrip('\00')
    self.private.fdata = fdata

    self.private.indirect_objects = {}
    self.private.special = {'<<': self.readdict, '[': self.readarray}

    startloc, source = self.readxref(fdata)
    self.parsexref(source)

    # The tokens are read outside of the assert statements: those
    # are removed by "python -O", and the tokens would then have
    # been left in the stream for readdict to choke on.
    tok = source.next()
    assert tok == '<<', tok
    self.update(self.readdict(source))
    tok = source.next()
    assert tok == 'startxref' and source.floc > startloc, tok

    self.private.pages = self.readpages(self.Root.Pages)
    if decompress:
        self.uncompress()

    # For compatibility with pyPdf
    self.private.numPages = len(self.pages)
def coalesce(self, result):
    ''' This function coalesces tokens together up until
        the next delimiter or whitespace.
        All of the coalesced tokens will either be non-matches,
        or will be a matched backslash.  We distinguish the
        non-matches by the fact that next() will have left
        a following match inside self.tokens for the actual match.

        result is a list, which already holds the start of the
        token; the pieces are appended to it and nothing is
        returned.  The delimiter that ends the token is left in
        self.tokens for the next call of next(), unless it is
        whitespace, in which case it is dropped.
    '''
    tokens = self.tokens
    whitespace = self.whitespaceset

    # Optimized path for usual case -- regular data (not a name string),
    # with no escape character, and followed by whitespace.

    if tokens:
        # The caller got a non-match: what is queued is the match
        # that follows it
        token = tokens.pop()
        if token != '\\':
            if token[0] not in whitespace:
                # A delimiter: put it back for whoever comes next
                tokens.append(token)
            return
        # A backslash does not end the token, keep going
        result.append(token)

    # Non-optimized path.  Either start of a name string received,
    # or we just had one escape.

    for token in self:
        if tokens:
            # Again a non-match, with the following match queued:
            # keep the former, and examine the latter
            result.append(token)
            token = tokens.pop()
        if token != '\\':
            if token[0] not in whitespace:
                tokens.append(token)
            return
        result.append(token)
def multiple(self, count):
    ''' Read count tokens with self.next() and return them,
        in order, as a list.
    '''
    fetched = []
    for _ in range(count):
        fetched.append(self.next())
    return fetched
python + +# A part of pdfrw (pdfrw.googlecode.com) +# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas +# MIT license -- See LICENSE.txt for details + +''' +The PdfWriter class writes an entire PDF file out to disk. + +The writing process is not at all optimized or organized. + +An instance of the PdfWriter class has two methods: + addpage(page) +and + write(fname) + +addpage() assumes that the pages are part of a valid +tree/forest of PDF objects. +''' + +try: + set +except NameError: + from sets import Set as set + +from pdfobjects import PdfName, PdfArray, PdfDict, IndirectPdfDict, PdfObject, PdfString +from pdfcompress import compress + +debug = False + +class FormatObjects(object): + ''' FormatObjects performs the actual formatting and disk write. + ''' + + def add(self, obj, visited): + ''' Add an object to our list, if it's an indirect + object. Just format it if not. + ''' + # Can't hash dicts, so just hash the object ID + objid = id(obj) + + # Automatically set stream objects to indirect + if isinstance(obj, PdfDict): + indirect = obj.indirect or (obj.stream is not None) + else: + indirect = getattr(obj, 'indirect', False) + + if not indirect: + assert objid not in visited, \ + 'Circular reference encountered in non-indirect object %s' % repr(obj) + visited.add(objid) + result = self.format_obj(obj, visited) + visited.remove(objid) + return result + + objnum = self.indirect_dict.get(objid) + + # If we haven't seen the object yet, we need to + # add it to the indirect object list. 
def format_array(myarray, formatter):
    ''' Join the already formatted items of myarray and insert the
        result into formatter (a '%s' template).  Everything goes on
        one line if the items add up to 70 characters or less;
        otherwise a new line is started each time the current one
        would grow past 70.
    '''
    if sum([len(item) for item in myarray]) <= 70:
        return formatter % ' '.join(myarray)

    lines = []
    pending = []
    used = 0
    for item in myarray:
        # Never flush an empty line: the first item always fits
        if pending and len(item) + used > 70:
            lines.append(' '.join(pending))
            pending = []
            used = 0
        pending.append(item)
        # One more than the item, for the space that separates it
        used += len(item) + 1
    lines.append(' '.join(pending))
    return formatter % '\n '.join(lines)
format_array = staticmethod(format_array)
+ dictkeys = [str(x) for x in obj.iterkeys()] + dictkeys.sort() + for key in dictkeys: + myarray.append(key) + myarray.append(self.add(obj[key], visited)) + result = self.format_array(myarray, '<<%s>>') + stream = obj.stream + if stream is not None: + result = '%s\nstream\n%s\nendstream' % (result, stream) + return result + elif isinstance(obj, basestring) and not hasattr(obj, 'indirect'): + return PdfString.encode(obj) + else: + return str(obj) + + def dump(cls, f, trailer, version='1.3', compress=True): + self = cls() + self.compress = compress + self.indirect_dict = {} + self.objlist = [] + + # The first format of trailer gets all the information, + # but we throw away the actual trailer formatting. + self.format_obj(trailer) + # Now we know the size, so we update the trailer dict + # and get the formatted data. + trailer.Size = PdfObject(len(self.objlist) + 1) + trailer = self.format_obj(trailer) + + # Now we have all the pieces to write out to the file. + # Keep careful track of the counts while we do it so + # we can correctly build the cross-reference. 
+ + header = '%%PDF-%s\n%%\xe2\xe3\xcf\xd3\n' % version + f.write(header) + offset = len(header) + offsets = [(0, 65535, 'f')] + + for i, x in enumerate(self.objlist): + objstr = '%s 0 obj\n%s\nendobj\n' % (i + 1, x) + offsets.append((offset, 0, 'n')) + offset += len(objstr) + f.write(objstr) + + f.write('xref\n0 %s\n' % len(offsets)) + for x in offsets: + f.write('%010d %05d %s\r\n' % x) + f.write('trailer\n\n%s\nstartxref\n%s\n%%%%EOF\n' % (trailer, offset)) + dump = classmethod(dump) + +class PdfWriter(object): + + _trailer = None + + def __init__(self, version='1.3', compress=True): + self.pagearray = PdfArray() + self.compress = compress + self.version = version + + def addpage(self, page): + self._trailer = None + assert page.Type == PdfName.Page + inheritable = page.inheritable # searches for resources + self.pagearray.append( + IndirectPdfDict( + page, + Resources = inheritable.Resources, + MediaBox = inheritable.MediaBox, + CropBox = inheritable.CropBox, + Rotate = inheritable.Rotate, + ) + ) + return self + + addPage = addpage # for compatibility with pyPdf + + def addpages(self, pagelist): + for page in pagelist: + self.addpage(page) + return self + + def _get_trailer(self): + trailer = self._trailer + if trailer is not None: + return trailer + + # Create the basic object structure of the PDF file + trailer = PdfDict( + Root = IndirectPdfDict( + Type = PdfName.Catalog, + Pages = IndirectPdfDict( + Type = PdfName.Pages, + Count = PdfObject(len(self.pagearray)), + Kids = self.pagearray + ) + ) + ) + # Make all the pages point back to the page dictionary + pagedict = trailer.Root.Pages + for page in pagedict.Kids: + page.Parent = pagedict + self._trailer = trailer + return trailer + + def _set_trailer(self, trailer): + self._trailer = trailer + + trailer = property(_get_trailer, _set_trailer) + + def write(self, fname, trailer=None): + trailer = trailer or self.trailer + + # Dump the data. We either have a filename or a preexisting + # file object. 
+ preexisting = hasattr(fname, 'write') + f = preexisting and fname or open(fname, 'wb') + FormatObjects.dump(f, trailer, self.version, self.compress) + if not preexisting: + f.close() + +if __name__ == '__main__': + debug = True + import pdfreader + x = pdfreader.PdfReader('source.pdf') + y = PdfWriter() + for i, page in enumerate(x.pages): + print ' Adding page', i+1, '\r', + y.addpage(page) + print + y.write('result.pdf') + print diff --git a/lib/pdfrw/toreportlab.py b/lib/pdfrw/toreportlab.py new file mode 100644 index 0000000..00ad324 --- /dev/null +++ b/lib/pdfrw/toreportlab.py @@ -0,0 +1,139 @@ +# A part of pdfrw (pdfrw.googlecode.com) +# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas +# MIT license -- See LICENSE.txt for details + +''' +Converts pdfrw objects into reportlab objects. + +Designed for and tested with rl 2.3. + +Knows too much about reportlab internals. +What can you do? + +The interface to this function is through the makerl() function. + +Parameters: + canv - a reportlab "canvas" (also accepts a "document") + pdfobj - a pdfrw PDF object + +Returns: + A corresponding reportlab object, or if the + object is a PDF Form XObject, the name to + use with reportlab for the object. + + Will recursively convert all necessary objects. + Be careful when converting a page -- if /Parent is set, + will recursively convert all pages! + +Notes: + 1) Original objects are annotated with a + derived_rl_obj attribute which points to the + reportlab object. This keeps multiple reportlab + objects from being generated for the same pdfobj + via repeated calls to makerl. This is great for + not putting too many objects into the + new PDF, but not so good if you are modifying + objects for different pages. Then you + need to do your own deep copying (of circular + structures). You're on your own. + + 2) ReportLab seems weird about FormXObjects. + They pass around a partial name instead of the + object or a reference to it. 
from reportlab.pdfbase import pdfdoc as rldocmodule
from pdfobjects import PdfDict, PdfArray, PdfName

RLStream = rldocmodule.PDFStream
RLDict = rldocmodule.PDFDictionary
RLArray = rldocmodule.PDFArray


def _makedict(rldoc, pdfobj):
    ''' Convert a PdfDict without a stream.

        An indirect pdfobj is flagged __RefOnly__ and wrapped in
        a document Reference; a direct one comes back as the bare
        reportlab dictionary.
    '''
    converted = RLDict()
    result = converted
    if pdfobj.indirect:
        converted.__RefOnly__ = 1
        result = rldoc.Reference(converted)
    # Record the result before converting the members, so that
    # makerl_recurse finds it if a member leads back to pdfobj.
    pdfobj.derived_rl_obj[rldoc] = (result, None)

    for name, member in pdfobj.iteritems():
        # name[1:] drops the first character of the key
        # (presumably the leading '/' of a PDF name).
        converted[name[1:]] = makerl_recurse(rldoc, member)

    return result

def _makestream(rldoc, pdfobj, xobjtype=PdfName.XObject):
    ''' Convert a PdfDict that carries a stream.

        The result is always a document Reference.  When
        pdfobj.Type equals xobjtype, a short name is generated,
        turned into the Reference name via getXObjectName, and
        stored next to the Reference for makerl() to return.
    '''
    header = RLDict()
    stream = RLStream(header, pdfobj.stream)

    shortname = None
    fullname = None
    if pdfobj.Type == xobjtype:
        shortname = 'pdfrw_%s' % (rldoc.objectcounter + 1)
        fullname = rldoc.getXObjectName(shortname)
    reference = rldoc.Reference(stream, fullname)
    # Recorded before the members are converted; see _makedict.
    pdfobj.derived_rl_obj[rldoc] = (reference, shortname)

    for name, member in pdfobj.iteritems():
        header[name[1:]] = makerl_recurse(rldoc, member)

    return reference

def _makearray(rldoc, pdfobj):
    ''' Convert a PdfArray, member by member.

        Indirect arrays are flagged __RefOnly__ and wrapped in a
        document Reference, exactly as _makedict does.
    '''
    converted = RLArray([])
    result = converted
    if pdfobj.indirect:
        converted.__RefOnly__ = 1
        result = rldoc.Reference(converted)
    pdfobj.derived_rl_obj[rldoc] = (result, None)

    # Members are appended one at a time while recursing.
    append = converted.sequence.append
    for member in pdfobj:
        append(makerl_recurse(rldoc, member))

    return result

def _makestr(rldoc, pdfobj):
    ''' Pass a float, int or str through unchanged. '''
    scalar_types = (float, int, str)
    assert isinstance(pdfobj, scalar_types), repr(pdfobj)
    return pdfobj

def makerl_recurse(rldoc, pdfobj):
    ''' Return the reportlab object for pdfobj within rldoc.

        A result already recorded for this rldoc in
        pdfobj.derived_rl_obj is reused.  Otherwise the object is
        handed to the converter for its kind: PdfDict with a
        stream, PdfDict without one, PdfArray, or anything else.
    '''
    cache = getattr(pdfobj, 'derived_rl_obj', None)
    if cache is not None:
        entry = cache.get(rldoc)
        if entry is not None:
            return entry[0]

    if isinstance(pdfobj, PdfDict):
        has_stream = pdfobj.stream is not None
        if cache is None:
            # Dictionaries keep the cache under .private.
            pdfobj.private.derived_rl_obj = {}
        if has_stream:
            return _makestream(rldoc, pdfobj)
        return _makedict(rldoc, pdfobj)

    if isinstance(pdfobj, PdfArray):
        if cache is None:
            pdfobj.derived_rl_obj = {}
        return _makearray(rldoc, pdfobj)

    return _makestr(rldoc, pdfobj)

def makerl(canv, pdfobj):
    ''' Convert pdfobj for use with a reportlab canvas or document.

        canv   -- a reportlab canvas (its _doc attribute is used)
                  or a document, which is used directly
        pdfobj -- the pdfrw object to convert

        Returns the name recorded for the object when there is
        one (Form XObject streams), otherwise the converted
        reportlab object.
    '''
    rldoc = getattr(canv, '_doc', canv)
    rlobj = makerl_recurse(rldoc, pdfobj)
    try:
        name = pdfobj.derived_rl_obj[rldoc][1]
    except AttributeError:
        # Plain values carry no derived_rl_obj attribute.
        return rlobj
    if name:
        return name
    return rlobj