diff options
| author | jvoisin | 2011-06-21 20:41:18 +0200 |
|---|---|---|
| committer | jvoisin | 2011-06-21 20:41:18 +0200 |
| commit | 9e69adbe1b065707f8be4f146cc3c05660cef711 (patch) | |
| tree | d60509a4982d7699204059184c4343352fef52de /lib | |
| parent | f0c9c5b56e3909ba36cc84ff82b05fab9a180911 (diff) | |
Add pdfrw, and many files that I had forgotten, sorry!
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/archive.py | 6 | ||||
| -rw-r--r-- | lib/audio.py | 8 | ||||
| -rw-r--r-- | lib/mat.py | 2 | ||||
| -rw-r--r-- | lib/misc.py | 44 | ||||
| -rw-r--r-- | lib/pdfrw/__init__.py | 13 | ||||
| -rw-r--r-- | lib/pdfrw/buildxobj.py | 191 | ||||
| -rw-r--r-- | lib/pdfrw/pdfcompress.py | 57 | ||||
| -rw-r--r-- | lib/pdfrw/pdfobjects.py | 183 | ||||
| -rw-r--r-- | lib/pdfrw/pdfreader.py | 213 | ||||
| -rw-r--r-- | lib/pdfrw/pdftokens.py | 249 | ||||
| -rwxr-xr-x | lib/pdfrw/pdfwriter.py | 234 | ||||
| -rw-r--r-- | lib/pdfrw/toreportlab.py | 139 | ||||
| -rw-r--r-- | lib/sounds.py | 1 |
13 files changed, 1340 insertions, 0 deletions
diff --git a/lib/archive.py b/lib/archive.py new file mode 100644 index 0000000..6378cab --- /dev/null +++ b/lib/archive.py | |||
| @@ -0,0 +1,6 @@ | |||
| 1 | import parser | ||
| 2 | |||
| 3 | class TarStripper(parser.Generic_parser): | ||
| 4 | def remove_all(self): | ||
| 5 | for file in self.editor.array("file"): | ||
| 6 | print file.name | ||
diff --git a/lib/audio.py b/lib/audio.py new file mode 100644 index 0000000..6d653bc --- /dev/null +++ b/lib/audio.py | |||
| @@ -0,0 +1,8 @@ | |||
| 1 | import parser | ||
| 2 | |||
class MpegAudioStripper(parser.Generic_parser):
    '''
        Stripper for MPEG audio files
    '''
    def _should_remove(self, field):
        '''
            Return True when the field is named "id3v1" or "id3v2",
            False otherwise
        '''
        return field.name in ("id3v1", "id3v2")
| @@ -14,6 +14,7 @@ import hachoir_editor | |||
| 14 | import images | 14 | import images |
| 15 | import audio | 15 | import audio |
| 16 | import misc | 16 | import misc |
| 17 | import archive | ||
| 17 | 18 | ||
| 18 | __version__ = "0.1" | 19 | __version__ = "0.1" |
| 19 | __author__ = "jvoisin" | 20 | __author__ = "jvoisin" |
| @@ -23,6 +24,7 @@ strippers = { | |||
| 23 | hachoir_parser.image.PngFile: images.PngStripper, | 24 | hachoir_parser.image.PngFile: images.PngStripper, |
| 24 | hachoir_parser.audio.MpegAudioFile: audio.MpegAudioStripper, | 25 | hachoir_parser.audio.MpegAudioFile: audio.MpegAudioStripper, |
| 25 | hachoir_parser.misc.PDFDocument: misc.PdfStripper, | 26 | hachoir_parser.misc.PDFDocument: misc.PdfStripper, |
| 27 | hachoir_parser.archive.TarFile: archive.TarStripper, | ||
| 26 | } | 28 | } |
| 27 | 29 | ||
| 28 | def create_class_file(name): | 30 | def create_class_file(name): |
diff --git a/lib/misc.py b/lib/misc.py new file mode 100644 index 0000000..56c2274 --- /dev/null +++ b/lib/misc.py | |||
| @@ -0,0 +1,44 @@ | |||
| 1 | import parser | ||
| 2 | import pdfrw | ||
| 3 | |||
class PdfStripper(parser.Generic_parser):
    '''
        Represent a pdf file, with the help of pdfrw
    '''
    def __init__(self, filename):
        '''
            Parse filename with pdfrw.PdfReader, and prepare a
            pdfrw.PdfWriter for the cleaned output
        '''
        self.filename = filename
        self.trailer = pdfrw.PdfReader(self.filename)
        self.writer = pdfrw.PdfWriter()

    def remove_all(self):
        '''
            Blank the Title, Author, Producer, Creator, CreationDate
            and ModDate entries of the Info dictionary, then write the
            document to filename + parser.POSTFIX
        '''
        info = self.trailer.Info
        # A trailer without /Info yields None: nothing to blank then,
        # but the output file is still written.
        if info is not None:
            for field in ('Title', 'Author', 'Producer', 'Creator',
                          'CreationDate', 'ModDate'):
                setattr(info, field, '')

        self.writer.trailer = self.trailer
        self.writer.write(self.filename + parser.POSTFIX)

    def is_clean(self):
        '''
            Check if the file is clean from harmful metadata:
            return True when every value of the Info dictionary is
            the empty string (or when there is no Info dictionary)
        '''
        info = self.trailer.Info
        if info is None:
            return True
        # Look at the values: iterating the dictionary itself yields
        # its keys ("/Title", ...), which are never empty.
        for value in info.values():
            if value != '':
                return False
        return True

    def get_meta(self):
        '''
            Return a dict with all the metadata of the file.
            The first character of every key, and the first and last
            characters of every value, are stripped.
        '''
        metadata = {}
        info = self.trailer.Info
        if info is None:
            return metadata
        for key, value in info.iteritems():
            metadata[key[1:]] = value[1:-1]
        return metadata
| 44 | |||
diff --git a/lib/pdfrw/__init__.py b/lib/pdfrw/__init__.py new file mode 100644 index 0000000..964972f --- /dev/null +++ b/lib/pdfrw/__init__.py | |||
| @@ -0,0 +1,13 @@ | |||
| 1 | # A part of pdfrw (pdfrw.googlecode.com) | ||
| 2 | # Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas | ||
| 3 | # MIT license -- See LICENSE.txt for details | ||
| 4 | |||
| 5 | from pdfwriter import PdfWriter | ||
| 6 | from pdfreader import PdfReader | ||
| 7 | from pdfobjects import PdfObject, PdfName, PdfArray, PdfDict, IndirectPdfDict, PdfString | ||
| 8 | from pdftokens import PdfTokens | ||
| 9 | |||
| 10 | # Add a tiny bit of compatibility to pyPdf | ||
| 11 | |||
| 12 | PdfFileReader = PdfReader | ||
| 13 | PdfFileWriter = PdfWriter | ||
diff --git a/lib/pdfrw/buildxobj.py b/lib/pdfrw/buildxobj.py new file mode 100644 index 0000000..203dd8c --- /dev/null +++ b/lib/pdfrw/buildxobj.py | |||
| @@ -0,0 +1,191 @@ | |||
| 1 | # A part of pdfrw (pdfrw.googlecode.com) | ||
| 2 | # Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas | ||
| 3 | # MIT license -- See LICENSE.txt for details | ||
| 4 | |||
| 5 | ''' | ||
| 6 | |||
| 7 | This module contains code to build PDF "Form XObjects". | ||
| 8 | |||
| 9 | A Form XObject allows a fragment from one PDF file to be cleanly | ||
| 10 | included in another PDF file. | ||
| 11 | |||
| 12 | Reference for syntax: "Parameters for opening PDF files" from SDK 8.1 | ||
| 13 | |||
| 14 | http://www.adobe.com/devnet/acrobat/pdfs/pdf_open_parameters.pdf | ||
| 15 | |||
| 16 | supported 'page=xxx', 'viewrect=<left>,<top>,<width>,<height>' | ||
| 17 | |||
| 18 | Units are in points | ||
| 19 | |||
| 20 | Reference for content: Adobe PDF reference, sixth edition, version 1.7 | ||
| 21 | |||
| 22 | http://www.adobe.com/devnet/acrobat/pdfs/pdf_reference_1-7.pdf | ||
| 23 | |||
| 24 | Form xobjects discussed chapter 4.9, page 355 | ||
| 25 | ''' | ||
| 26 | |||
| 27 | from pdfobjects import PdfDict, PdfArray, PdfName | ||
| 28 | from pdfreader import PdfReader | ||
| 29 | |||
class ViewInfo(object):
    ''' Instantiate ViewInfo with a uri, and it will parse out
        the filename, page, and viewrect into object attributes.

        The uri looks like 'name#page=3&viewrect=10,20,100,50';
        the name part is optional.  Keyword arguments override
        the attributes declared below (doc, docname, page, viewrect).
    '''
    doc = None
    docname = None
    page = None
    viewrect = None

    def __init__(self, pageinfo='', **kw):
        ''' Parse pageinfo, then apply the keyword overrides.

            Raises AssertionError when 'page' does not carry exactly
            one value, when 'viewrect' does not carry exactly four,
            or when a keyword is not a known attribute.  Unknown
            options in the uri are logged and ignored.
        '''
        # Imported here because this module has no logger of its own;
        # the unknown-option branch used to fail with a NameError.
        import logging

        pageinfo = pageinfo.split('#', 1)
        if len(pageinfo) == 2:
            # Options may be separated by '&' or '#'
            pageinfo[1:] = pageinfo[1].replace('&', '#').split('#')
        for key in 'page viewrect'.split():
            if pageinfo[0].startswith(key + '='):
                break
        else:
            # The first item is not an option, so it is the document name
            self.docname = pageinfo.pop(0)
        for item in pageinfo:
            key, value = item.split('=')
            key = key.strip()
            value = value.replace(',', ' ').split()
            if key == 'page':
                assert len(value) == 1
                setattr(self, key, int(value[0]))
            elif key == 'viewrect':
                assert len(value) == 4
                setattr(self, key, [float(x) for x in value])
            else:
                logging.getLogger(__name__).error('Unknown option: %s', key)
        for key, value in kw.items():
            assert hasattr(self, key), key
            setattr(self, key, value)
| 63 | |||
def getrects(inheritable, pageinfo):
    ''' Return (media box, clip box) for a page, both as tuples of
        floats.  Without a viewrect the clip box is the page's
        CropBox (or the media box when there is no CropBox);
        with a viewrect (x, y, width, height measured from the top
        left of the media box) it is that rectangle clamped to the
        media box.
    '''
    mbox = tuple(map(float, inheritable.MediaBox))
    vrect = pageinfo.viewrect
    if vrect is not None:
        mleft, mbot, mright, mtop = mbox
        x, y, w, h = vrect
        left = mleft + x
        top = mtop - y
        right = left + w
        bottom = top - h
        cbox = (max(mleft, left), max(mbot, bottom),
                min(mright, right), min(mtop, top))
        return mbox, cbox
    cbox = tuple(map(float, inheritable.CropBox or mbox))
    return mbox, cbox
| 82 | |||
def _cache_xobj(contents, resources, mbox, bbox):
    ''' Look up the Form XObject for bbox in the cache kept on
        contents; build it, store it and return it when missing.
    '''
    cache = contents.xobj_cachedict
    if cache is None:
        # First use: attach an empty cache to the contents object
        cache = contents.private.xobj_cachedict = {}
    xobj = cache.get(bbox)
    if xobj is not None:
        return xobj
    # A clip box equal to the media box means the whole page
    if mbox != bbox:
        builder = _get_subpage
    else:
        builder = _get_fullpage
    xobj = PdfDict(
        builder(contents, resources, mbox, bbox),
        Type = PdfName.XObject,
        Subtype = PdfName.Form,
        FormType = 1,
        BBox = PdfArray(bbox),
    )
    cache[bbox] = xobj
    return xobj
| 101 | |||
def _get_fullpage(contents, resources, mbox, bbox):
    ''' Full page: copy the contents into a new PdfDict and attach
        the resources.  _cache_xobj adds the remaining entries.
    '''
    fullpage = PdfDict(contents, Resources=resources)
    return fullpage
| 108 | |||
def _get_subpage(contents, resources, mbox, bbox):
    ''' Sub page: rather than copying the contents, build a tiny
        stream that draws the (cached) full-page Form XObject,
        referenced as /FullPage in its own resources.  This way
        several views of one page share a single full-page object.
    '''
    fullpage = _cache_xobj(contents, resources, mbox, mbox)
    xobjects = PdfDict(FullPage = fullpage)
    subresources = PdfDict(XObject = xobjects)
    return PdfDict(
        stream = '/FullPage Do\n',
        Resources = subresources
    )
| 124 | |||
def pagexobj(page, viewinfo=ViewInfo(), allow_compressed=True):
    ''' Build (or fetch from the cache) the Form XObject showing
        the given view of a page.  The default view is the entire
        page.
    '''
    inherited = page.inheritable
    page_resources = inherited.Resources
    media_box, clip_box = getrects(inherited, viewinfo)
    stream_dict = page.Contents
    # The declared length must match the stream actually held
    assert int(stream_dict.Length) == len(stream_dict.stream)
    if not allow_compressed:
        # Make sure the only attribute is length
        # All the filters must have been executed
        assert len(list(stream_dict.iteritems())) == 1

    return _cache_xobj(stream_dict, page_resources, media_box, clip_box)
| 140 | |||
| 141 | |||
def docxobj(pageinfo, doc=None, allow_compressed=True):
    ''' Create and return a Form XObject for the page described by
        pageinfo (a ViewInfo, or a uri from which one is built).
        Works standalone, or together with the CacheXObj class.
    '''
    if not isinstance(pageinfo, ViewInfo):
        pageinfo = ViewInfo(pageinfo)

    if doc is None:
        # No explicit document: use the one carried by pageinfo,
        # or read it in from the file name.
        doc = pageinfo.doc
        if doc is None:
            doc = pageinfo.doc = PdfReader(pageinfo.docname, decompress = not allow_compressed)
    else:
        # An explicit document must not come with an implicit one.
        assert pageinfo.doc is None
        pageinfo.doc = doc
    assert isinstance(doc, PdfReader)

    # Page numbers in the uri start at 1; no page means the first one
    pageindex = (pageinfo.page or 1) - 1
    return pagexobj(doc.pages[pageindex], pageinfo, allow_compressed)
| 165 | |||
| 166 | |||
class CacheXObj(object):
    ''' Remembers the PdfReader opened for each file name, so a
        file is parsed only once and its objects are not
        replicated needlessly in the output.
    '''
    def __init__(self, decompress=False):
        ''' decompress is handed to every PdfReader created by
            load(); set it true if you need the Form XObjects
            to be decompressed.
        '''
        self.decompress = decompress
        self.cached_pdfs = {}

    def load(self, sourcename):
        ''' Return the Form XObject described by a uri, reading
            the underlying file only the first time it is seen.
        '''
        view = ViewInfo(sourcename)
        name = view.docname
        reader = self.cached_pdfs.get(name)
        if reader is None:
            reader = PdfReader(name, decompress=self.decompress)
            self.cached_pdfs[name] = reader
        return docxobj(view, reader, allow_compressed=not self.decompress)
diff --git a/lib/pdfrw/pdfcompress.py b/lib/pdfrw/pdfcompress.py new file mode 100644 index 0000000..1c11970 --- /dev/null +++ b/lib/pdfrw/pdfcompress.py | |||
| @@ -0,0 +1,57 @@ | |||
| 1 | # A part of pdfrw (pdfrw.googlecode.com) | ||
| 2 | # Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas | ||
| 3 | # MIT license -- See LICENSE.txt for details | ||
| 4 | |||
| 5 | ''' | ||
| 6 | Currently, this sad little file only knows how to decompress | ||
| 7 | using the flate (zlib) algorithm. Maybe more later, but it's | ||
| 8 | not a priority for me... | ||
| 9 | ''' | ||
| 10 | |||
| 11 | from __future__ import generators | ||
| 12 | |||
| 13 | try: | ||
| 14 | set | ||
| 15 | except NameError: | ||
| 16 | from sets import Set as set | ||
| 17 | |||
| 18 | import zlib | ||
| 19 | from pdfobjects import PdfDict, PdfName | ||
| 20 | |||
| 21 | |||
def streamobjects(mylist):
    ''' Yield the items of mylist that are PdfDict instances
        carrying a stream.
    '''
    for candidate in mylist:
        if not isinstance(candidate, PdfDict):
            continue
        if candidate.stream is None:
            continue
        yield candidate
| 26 | |||
| 27 | def uncompress(mylist, warnings=set()): | ||
| 28 | flate = PdfName.FlateDecode | ||
| 29 | for obj in streamobjects(mylist): | ||
| 30 | ftype = obj.Filter | ||
| 31 | if ftype is None: | ||
| 32 | continue | ||
| 33 | if isinstance(ftype, list) and len(ftype) == 1: | ||
| 34 | # todo: multiple filters | ||
| 35 | ftype = ftype[0] | ||
| 36 | parms = obj.DecodeParms | ||
| 37 | if ftype != flate or parms is not None: | ||
| 38 | msg = 'Not decompressing: cannot use filter %s with parameters %s' % (repr(ftype), repr(parms)) | ||
| 39 | if msg not in warnings: | ||
| 40 | warnings.add(msg) | ||
| 41 | print msg | ||
| 42 | else: | ||
| 43 | obj.stream = zlib.decompress(obj.stream) | ||
| 44 | obj.Filter = None | ||
| 45 | |||
def compress(mylist):
    ''' Deflate, in place, every unfiltered stream object of mylist,
        unless the result is at least 30 bytes longer than the
        original stream.
    '''
    flate = PdfName.FlateDecode
    for obj in streamobjects(mylist):
        if obj.Filter is not None:
            # Already filtered: leave it alone
            continue
        raw = obj.stream
        packed = zlib.compress(raw)
        if len(packed) >= len(raw) + 30:
            continue
        obj.stream = packed
        obj.Filter = flate
        obj.DecodeParms = None
diff --git a/lib/pdfrw/pdfobjects.py b/lib/pdfrw/pdfobjects.py new file mode 100644 index 0000000..08ad825 --- /dev/null +++ b/lib/pdfrw/pdfobjects.py | |||
| @@ -0,0 +1,183 @@ | |||
| 1 | # A part of pdfrw (pdfrw.googlecode.com) | ||
| 2 | # Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas | ||
| 3 | # MIT license -- See LICENSE.txt for details | ||
| 4 | |||
| 5 | ''' | ||
| 6 | Objects that can occur in PDF files. The most important | ||
| 7 | objects are arrays and dicts. Either of these can be | ||
| 8 | indirect or not, and dicts could have an associated | ||
| 9 | stream. | ||
| 10 | ''' | ||
| 11 | from __future__ import generators | ||
| 12 | |||
| 13 | try: | ||
| 14 | set | ||
| 15 | except NameError: | ||
| 16 | from sets import Set as set | ||
| 17 | |||
| 18 | import re | ||
| 19 | |||
class PdfObject(str):
    ''' A str subclass, so that instances can carry the
        'indirect' attribute (False unless set on the instance).
    '''
    indirect = False
| 22 | |||
class PdfArray(list):
    ''' A list subclass, so that instances can carry the
        'indirect' attribute (False unless set on the instance).
    '''
    indirect = False
| 25 | |||
class PdfName(object):
    ''' Builds name objects: both PdfName.Foo and PdfName('Foo')
        return PdfObject('/Foo').
    '''
    def __getattr__(self, name):
        return self(name)
    def __call__(self, name):
        return PdfObject('/' + name)

# The class is replaced by an instance of itself, so that the
# attribute and call syntaxes work directly on the name 'PdfName'.
PdfName = PdfName()
| 33 | |||
class PdfString(str):
    ''' A PDF string token, delimiters included: either a literal
        string '(...)' or a hexadecimal string '<...>'.
        decode() returns the content, encode() builds a literal
        string from a Python string.
    '''
    indirect = False

    # Escape sequences of literal strings and their replacements.
    # A backslash followed by an end of line continues the line; a
    # backslash before any other character is simply dropped.
    unescape_dict = {'\\b':'\b', '\\f':'\f', '\\n':'\n',
                     '\\r':'\r', '\\t':'\t',
                     '\\\r\n': '', '\\\r':'', '\\\n':'',
                     '\\\\':'\\', '\\':'',
                    }
    # The escaped backslash must be tried first: otherwise the second
    # backslash of '\\b' would be read as the start of a '\b' escape.
    # Octal escapes are one to three octal digits.
    unescape_pattern = r'(\\\\|\\b|\\f|\\n|\\r|\\t|\\\r\n|\\\r|\\\n|\\[0-7]{1,3}|\\)'
    unescape_func = re.compile(unescape_pattern).split

    # One byte per character: pairs of hex digits (a lone trailing
    # digit is accepted on its own)
    hex_pattern = '([a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])'
    hex_func = re.compile(hex_pattern).split

    # Two bytes per character: groups of four hex digits first
    hex_pattern2 = '([a-fA-F0-9][a-fA-F0-9][a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])'
    hex_func2 = re.compile(hex_pattern2).split

    # Indexed by the 'twobytes' flag of decode_hex
    hex_funcs = hex_func, hex_func2

    def decode_regular(self, remap=chr):
        ''' Decode a literal string '(...)'.
            remap converts the value of an octal escape into a
            character.  Raises AssertionError when the string is
            not enclosed in parentheses.
        '''
        assert self[0] == '(' and self[-1] == ')'
        mylist = self.unescape_func(self[1:-1])
        result = []
        unescape = self.unescape_dict.get
        for chunk in mylist:
            chunk = unescape(chunk, chunk)
            # Only octal escapes still start with a backslash and
            # are longer than one character at this point
            if chunk.startswith('\\') and len(chunk) > 1:
                value = int(chunk[1:], 8)
                # FIXME: TODO: Handle unicode here
                if value > 127:
                    value = 127
                chunk = remap(value)
            if chunk:
                result.append(chunk)
        return ''.join(result)

    def decode_hex(self, remap=chr, twobytes=False):
        ''' Decode a hexadecimal string '<...>'; whitespace between
            digits is ignored.  With twobytes, each character is read
            from four hex digits instead of two.  Raises
            AssertionError when anything but hex digits is found
            between the angle brackets.
        '''
        data = ''.join(self.split())
        data = self.hex_funcs[twobytes](data)
        # The split alternates non-matches (even) and matches (odd)
        chars = data[1::2]
        other = data[0::2]
        assert other[0] == '<' and other[-1] == '>' and ''.join(other) == '<>', self
        return ''.join([remap(int(x, 16)) for x in chars])

    def decode(self, remap=chr, twobytes=False):
        ''' Decode the string, as a literal string when it starts
            with '(' and as a hexadecimal string otherwise.
        '''
        if self.startswith('('):
            return self.decode_regular(remap)

        else:
            return self.decode_hex(remap, twobytes)

    def encode(cls, source, usehex=False):
        ''' Return a literal string '(...)' holding source, with
            backslashes and parentheses escaped.  Unicode input is
            encoded as UTF-8, anything else goes through str().
            usehex is not supported yet and must be false.
        '''
        assert not usehex, "Not supported yet"
        if isinstance(source, unicode):
            source = source.encode('utf-8')
        else:
            source = str(source)
        source = source.replace('\\', '\\\\')
        source = source.replace('(', '\\(')
        source = source.replace(')', '\\)')
        return cls('(' +source + ')')
    encode = classmethod(encode)
| 97 | |||
class PdfDict(dict):
    ''' A dict whose keys are PDF names ('/Foo'), also reachable as
        attributes: d.Foo reads d['/Foo'] (None when missing) and
        d.Foo = x stores it.  Storing None removes the entry.
        'indirect', 'stream' and '_stream' are kept as real instance
        attributes instead (see _special).
    '''
    indirect = False
    stream = None

    # Attribute name -> (name used in the instance __dict__,
    # whether setting it also updates the Length entry)
    _special = dict(indirect = ('indirect', False),
                    stream = ('stream', True),
                    _stream = ('stream', False),
                   )

    def __setitem__(self, name, value):
        ''' Store value under name, which must start with '/'.
            A None value deletes the entry if it exists.
        '''
        assert name.startswith('/'), name
        if value is not None:
            dict.__setitem__(self, name, value)
        elif name in self:
            del self[name]

    def __init__(self, *args, **kw):
        ''' Positional arguments are handed to dict.update (a single
            argument is unwrapped first); when that argument is a
            PdfDict its indirect flag and stream are copied as well,
            the stream through '_stream' so Length is not recomputed.
            Keyword arguments are then applied as attributes.
        '''
        if args:
            if len(args) == 1:
                args = args[0]
            self.update(args)
            if isinstance(args, PdfDict):
                self.indirect = args.indirect
                self._stream = args.stream
        for key, value in kw.iteritems():
            setattr(self, key, value)

    def __getattr__(self, name):
        ''' Called only when normal attribute lookup fails:
            return the entry '/name', or None when it is absent.
        '''
        return self.get(PdfName(name))

    def __setattr__(self, name, value):
        ''' Names listed in _special go to the instance __dict__;
            any other name is stored as the dictionary entry '/name'.
        '''
        info = self._special.get(name)
        if info is None:
            self[PdfName(name)] = value
        else:
            name, setlen = info
            self.__dict__[name] = value
            if setlen:
                # Keep Length in step with the stream; a None stream
                # removes the Length entry.
                notnone = value is not None
                self.Length = notnone and PdfObject(len(value)) or None

    def iteritems(self):
        ''' Yield the (key, value) pairs whose value is not None,
            asserting that each key starts with '/'.
        '''
        for key, value in dict.iteritems(self):
            if value is not None:
                assert key.startswith('/'), (key, value)
                yield key, value

    def inheritable(self):
        ''' Search through ancestors as needed for inheritable
            dictionary items
        '''
        class Search(object):
            # Looks a name up on basedict, then on its Parent, and so
            # on; returns None when no ancestor defines it.  Visiting
            # the same dictionary twice fails the assertion.
            def __init__(self, basedict):
                self.basedict = basedict
            def __getattr__(self, name):
                return self[name]
            def __getitem__(self, name):
                visited = set()
                mydict = self.basedict
                while 1:
                    value = getattr(mydict, name)
                    if value is not None:
                        return value
                    myid = id(mydict)
                    assert myid not in visited
                    visited.add(myid)
                    mydict = mydict.Parent
                    if mydict is None:
                        return
        return Search(self)
    inheritable = property(inheritable)

    def private(self):
        ''' Allows setting private metadata for use in
            processing (not sent to PDF file)
        '''
        class Private(object):
            pass

        # The returned object shares this instance's __dict__, so
        # attributes set on it bypass __setattr__ above and become
        # plain instance attributes.
        result = Private()
        result.__dict__ = self.__dict__
        return result
    private = property(private)
| 181 | |||
class IndirectPdfDict(PdfDict):
    ''' A PdfDict whose 'indirect' attribute defaults to True. '''
    indirect = True
diff --git a/lib/pdfrw/pdfreader.py b/lib/pdfrw/pdfreader.py new file mode 100644 index 0000000..6f57bea --- /dev/null +++ b/lib/pdfrw/pdfreader.py | |||
| @@ -0,0 +1,213 @@ | |||
| 1 | # A part of pdfrw (pdfrw.googlecode.com) | ||
| 2 | # Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas | ||
| 3 | # MIT license -- See LICENSE.txt for details | ||
| 4 | |||
| 5 | ''' | ||
| 6 | The PdfReader class reads an entire PDF file into memory and | ||
| 7 | parses the top-level container objects. (It does not parse | ||
| 8 | into streams.) The object subclasses PdfDict, and the | ||
| 9 | document pages are stored in a list in the pages attribute | ||
| 10 | of the object. | ||
| 11 | ''' | ||
| 12 | |||
| 13 | from pdftokens import PdfTokens | ||
| 14 | from pdfobjects import PdfDict, PdfArray, PdfName | ||
| 15 | from pdfcompress import uncompress | ||
| 16 | |||
class PdfReader(PdfDict):
    ''' Reads a whole PDF file into memory and parses it.
        The entries of the trailer dictionary are loaded into this
        object (a PdfDict); the working attributes (fdata,
        indirect_objects, special, pages, numPages) are set through
        self.private, so they are not dictionary entries.
    '''

    class unresolved:
        # Used as a placeholder until we have an object.
        pass

    def readindirect(self, objnum, gennum):
        ''' Read an indirect object.  If it has already
            been read, return it from the cache.
        '''

        def setobj(obj):
            # Store the new object in the dictionary
            # once we have its value
            record[1] = obj

        def ordinary(source, setobj, obj):
            # Deal with an ordinary (non-array, non-dict) object
            setobj(obj)
            return obj

        fdata, objnum, gennum = self.fdata, int(objnum), int(gennum)
        # record is [file offset, object or the 'unresolved' marker]
        record = self.indirect_objects[fdata, objnum, gennum]
        if record[1] is not self.unresolved:
            return record[1]

        # Read the object header and validate it
        source = PdfTokens(fdata, record[0])
        objid = source.multiple(3)
        assert int(objid[0]) == objnum, objid
        assert int(objid[1]) == gennum, objid
        assert objid[2] == 'obj', objid

        # Read the object, and call special code if it starts
        # an array or dictionary
        obj = source.next()
        obj = self.special.get(obj, ordinary)(source, setobj, obj)
        self.readstream(obj, source)
        obj.indirect = True
        return obj

    def readstream(obj, source):
        ''' Read optional stream following a dictionary
            object.
        '''
        tok = source.next()
        if tok == 'endobj':
            return  # No stream

        assert isinstance(obj, PdfDict)
        assert tok == 'stream', tok
        fdata = source.fdata
        # Find the byte just after the 'stream' keyword, searching
        # backwards from the tokenizer position
        floc = fdata.rindex(tok, 0, source.floc) + len(tok)
        ch = fdata[floc]
        # The keyword is followed by LF, optionally preceded by CR
        if ch == '\r':
            floc += 1
            ch = fdata[floc]
        assert ch == '\n'
        startstream = floc + 1
        endstream = startstream + int(obj.Length)
        # '_stream' stores the data without touching the Length entry
        obj._stream = fdata[startstream:endstream]
        source = PdfTokens(fdata, endstream)
        endit = source.multiple(2)
        if endit != 'endstream endobj'.split():
            # /Length attribute is broken, try to read stream
            # anyway disregarding the specified value
            # TODO: issue warning here once we have some kind of
            # logging
            endstream = fdata.index('endstream', startstream)
            # Leave out the end of line preceding 'endstream'
            if fdata[endstream-2:endstream] == '\r\n':
                endstream -= 2
            elif fdata[endstream-1] in ['\n', '\r']:
                endstream -= 1
            source = PdfTokens(fdata, endstream)
            endit = source.multiple(2)
            assert endit == 'endstream endobj'.split()
            obj.Length = str(endstream-startstream)
            obj._stream = fdata[startstream:endstream]
    readstream = staticmethod(readstream)

    def readarray(self, source, setobj=lambda x:None, original=None):
        ''' Read tokens up to the closing ']' and return them as a
            PdfArray.  Nested arrays and dictionaries are parsed, and
            'num gen R' references are replaced by the object read
            with readindirect.  setobj receives the (still empty)
            array before its items are read.
        '''
        special = self.special
        result = PdfArray()
        setobj(result)

        for value in source:
            if value == ']':
                break
            if value in special:
                value = special[value](source)
            elif value == 'R':
                # The two previous items were the object and
                # generation numbers of the reference
                generation = result.pop()
                value = self.readindirect(result.pop(), generation)
            result.append(value)
        return result

    def readdict(self, source, setobj=lambda x:None, original=None):
        ''' Read key/value tokens up to the closing '>>' and return
            them as a PdfDict.  Nested arrays and dictionaries are
            parsed, and 'num gen R' references are replaced by the
            object read with readindirect.  setobj receives the
            (still empty) dictionary before its entries are read.
        '''
        special = self.special
        result = PdfDict()
        setobj(result)

        tok = source.next()
        while tok != '>>':
            assert tok.startswith('/'), (tok, source.multiple(10))
            key = tok
            value = source.next()
            if value in special:
                value = special[value](source)
                tok = source.next()
            else:
                tok = source.next()
                # Two numbers in a row start an indirect reference
                if value.isdigit() and tok.isdigit():
                    # Consume the 'R' outside of the assert, so the
                    # token is still read when asserts are disabled
                    reftok = source.next()
                    assert reftok == 'R', reftok
                    value = self.readindirect(value, tok)
                    tok = source.next()
            result[key] = value

        return result

    def readxref(fdata):
        ''' Locate the last 'startxref' of the file and return
            (its position, a tokenizer placed at the offset given
            after it).
        '''
        startloc = fdata.rindex('startxref')
        xrefinfo = list(PdfTokens(fdata, startloc, False))
        assert len(xrefinfo) == 3, xrefinfo
        assert xrefinfo[0] == 'startxref', xrefinfo[0]
        assert xrefinfo[1].isdigit(), xrefinfo[1]
        assert xrefinfo[2].rstrip() == '%%EOF', repr(xrefinfo[2])
        return startloc, PdfTokens(fdata, int(xrefinfo[1]))
    readxref = staticmethod(readxref)

    def parsexref(self, source):
        ''' Read the cross-reference table up to the 'trailer'
            keyword, recording the offset of every entry marked 'n'
            in self.indirect_objects.  Entries already recorded are
            left untouched.
        '''
        tok = source.next()
        assert tok == 'xref', tok
        while 1:
            tok = source.next()
            if tok == 'trailer':
                break
            # Subsection header: first object number, then the count
            startobj = int(tok)
            for objnum in range(startobj, startobj + int(source.next())):
                offset = int(source.next())
                generation = int(source.next())
                if source.next() == 'n':
                    objid = self.fdata, objnum, generation
                    objval = [offset, self.unresolved]
                    self.indirect_objects.setdefault(objid, objval)

    pagename = PdfName.Page
    pagesname = PdfName.Pages

    def readpages(self, node):
        ''' Return the flat list of the /Page dictionaries found
            under node.
        '''
        # PDFs can have arbitrarily nested Pages/Page
        # dictionary structures.
        if node.Type == self.pagename:
            return [node]
        assert node.Type == self.pagesname, node.Type
        result = []
        for node in node.Kids:
            result.extend(self.readpages(node))
        return result

    def __init__(self, fname=None, fdata=None, decompress=True):
        ''' Parse a PDF given either as fname (a file name, or an
            object with a read() method) or as fdata (the file
            content); exactly one of them must be given.
            With decompress, the streams are uncompressed after
            parsing.
        '''

        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                f = open(fname, 'rb')
                # Close the file even when reading fails
                try:
                    fdata = f.read()
                finally:
                    f.close()

        assert fdata is not None
        fdata = fdata.rstrip('\00')
        self.private.fdata = fdata

        self.private.indirect_objects = {}
        self.private.special = {'<<': self.readdict, '[': self.readarray}

        startloc, source = self.readxref(fdata)
        self.parsexref(source)
        # Tokens are consumed outside of the asserts, so parsing is
        # the same when asserts are disabled
        tok = source.next()
        assert tok == '<<', tok
        self.update(self.readdict(source))
        tok = source.next()
        assert tok == 'startxref' and source.floc > startloc
        self.private.pages = self.readpages(self.Root.Pages)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        self.private.numPages = len(self.pages)


    # For compatibility with pyPdf
    def getPage(self, pagenum):
        ''' Return the page at index pagenum of self.pages. '''
        return self.pages[pagenum]

    def uncompress(self):
        ''' Run the module-level uncompress on every object recorded
            in self.indirect_objects.
        '''
        uncompress([x[1] for x in self.indirect_objects.itervalues()])
diff --git a/lib/pdfrw/pdftokens.py b/lib/pdfrw/pdftokens.py new file mode 100644 index 0000000..04bd559 --- /dev/null +++ b/lib/pdfrw/pdftokens.py | |||
| @@ -0,0 +1,249 @@ | |||
| 1 | # A part of pdfrw (pdfrw.googlecode.com) | ||
| 2 | # Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas | ||
| 3 | # MIT license -- See LICENSE.txt for details | ||
| 4 | |||
| 5 | ''' | ||
| 6 | A tokenizer for PDF streams. | ||
| 7 | |||
| 8 | In general, documentation used was "PDF reference", | ||
| 9 | sixth edition, for PDF version 1.7, dated November 2006. | ||
| 10 | |||
| 11 | ''' | ||
| 12 | |||
| 13 | from __future__ import generators | ||
| 14 | |||
| 15 | try: | ||
| 16 | set | ||
| 17 | except NameError: | ||
| 18 | from sets import Set as set | ||
| 19 | |||
| 20 | import re | ||
| 21 | from pdfobjects import PdfString, PdfObject | ||
| 22 | |||
| 23 | class _PrimitiveTokens(object): | ||
| 24 | |||
| 25 | # Table 3.1, page 50 of reference, defines whitespace | ||
| 26 | whitespaceset = set('\x00\t\n\f\r ') | ||
| 27 | |||
| 28 | |||
| 29 | # Text on page 50 defines delimiter characters | ||
| 30 | delimiterset = set('()<>{}[]/%') | ||
| 31 | |||
| 32 | # Coalesce contiguous whitespace into a single token | ||
| 33 | whitespace_pattern = '[%s]+' % ''.join(whitespaceset) | ||
| 34 | |||
| 35 | # In addition to the delimiters, we also use '\', which | ||
| 36 | # is special in some contexts in PDF. | ||
| 37 | delimiter_pattern = '\\\\|\\' + '|\\'.join(delimiterset) | ||
| 38 | |||
| 39 | # Dictionary delimiters are '<<' and '>>'. Look for | ||
| 40 | # these before the single variety. | ||
| 41 | dictdelim_pattern = r'\<\<|\>\>' | ||
| 42 | |||
| 43 | pattern = '(%s|%s|%s)' % (whitespace_pattern, | ||
| 44 | dictdelim_pattern, delimiter_pattern) | ||
| 45 | re_func = re.compile(pattern).finditer | ||
| 46 | del whitespace_pattern, dictdelim_pattern | ||
| 47 | del delimiter_pattern, pattern | ||
| 48 | |||
| 49 | def __init__(self, fdata): | ||
| 50 | |||
| 51 | class MyIterator(object): | ||
| 52 | def next(): | ||
| 53 | if not tokens: | ||
| 54 | startloc = self.startloc | ||
| 55 | for match in next_match[0]: | ||
| 56 | start = match.start() | ||
| 57 | end = match.end() | ||
| 58 | tappend(fdata[start:end]) | ||
| 59 | if start > startloc: | ||
| 60 | tappend(fdata[startloc:start]) | ||
| 61 | self.startloc = end | ||
| 62 | break | ||
| 63 | else: | ||
| 64 | s = fdata[startloc:] | ||
| 65 | self.startloc = len(fdata) | ||
| 66 | if s: | ||
| 67 | tappend(s) | ||
| 68 | if not tokens: | ||
| 69 | raise StopIteration | ||
| 70 | return tpop() | ||
| 71 | next = staticmethod(next) | ||
| 72 | |||
| 73 | self.fdata = fdata | ||
| 74 | self.tokens = tokens = [] | ||
| 75 | self.iterator = iterator = MyIterator() | ||
| 76 | self.next = iterator.next | ||
| 77 | self.next_match = next_match = [None] | ||
| 78 | tappend = tokens.append | ||
| 79 | tpop = tokens.pop | ||
| 80 | |||
| 81 | def setstart(self, startloc): | ||
| 82 | self.startloc = startloc | ||
| 83 | self.next_match[0] = self.re_func(self.fdata, startloc) | ||
| 84 | |||
| 85 | def __iter__(self): | ||
| 86 | return self.iterator | ||
| 87 | |||
| 88 | def coalesce(self, result): | ||
| 89 | ''' This function coalesces tokens together up until | ||
| 90 | the next delimiter or whitespace. | ||
| 91 | All of the coalesced tokens will either be non-matches, | ||
| 92 | or will be a matched backslash. We distinguish the | ||
| 93 | non-matches by the fact that next() will have left | ||
| 94 | a following match inside self.tokens for the actual match. | ||
| 95 | ''' | ||
| 96 | tokens = self.tokens | ||
| 97 | whitespace = self.whitespaceset | ||
| 98 | |||
| 99 | # Optimized path for usual case -- regular data (not a name string), | ||
| 100 | # with no escape character, and followed by whitespace. | ||
| 101 | |||
| 102 | if tokens: | ||
| 103 | token = tokens.pop() | ||
| 104 | if token != '\\': | ||
| 105 | if token[0] not in whitespace: | ||
| 106 | tokens.append(token) | ||
| 107 | return | ||
| 108 | result.append(token) | ||
| 109 | |||
| 110 | # Non-optimized path. Either start of a name string received, | ||
| 111 | # or we just had one escape. | ||
| 112 | |||
| 113 | for token in self: | ||
| 114 | if tokens: | ||
| 115 | result.append(token) | ||
| 116 | token = tokens.pop() | ||
| 117 | if token != '\\': | ||
| 118 | if token[0] not in whitespace: | ||
| 119 | tokens.append(token) | ||
| 120 | return | ||
| 121 | result.append(token) | ||
| 122 | |||
| 123 | |||
| 124 | def floc(self): | ||
| 125 | return self.startloc - sum([len(x) for x in self.tokens]) | ||
| 126 | |||
class PdfTokens(object):
    ''' Tokenizer that assembles primitive tokens into PDF-level tokens.

        Iterating yields PdfString objects for literal and hex strings,
        PdfObject objects for names and other regular data, and plain
        strings for the structural delimiters ('[', ']', '{', '}',
        '<<', '>>').  Whitespace is dropped, and so are comments unless
        strip_comments is false.
    '''

    def __init__(self, fdata, startloc=0, strip_comments=True):
        ''' fdata          -- the string to tokenize
            startloc       -- offset in fdata at which to start
            strip_comments -- when true, comments are discarded;
                              otherwise each comment is returned as
                              one string.
        '''

        def comment(token):
            # A comment runs up to and including the first whitespace
            # token that contains a line ending.
            tokens = [token]
            for token in primitive:
                tokens.append(token)
                if token[0] in whitespaceset and ('\n' in token or '\r' in token):
                    break
            # Evaluates to False (token is skipped) when stripping.
            return not strip_comments and ''.join(tokens)

        def single(token):
            return token

        def regular_string(token):
            def escaped():
                # True when the token just appended is preceded by an
                # odd number of backslash tokens.
                escaped = False
                i = -2
                while tokens[i] == '\\':
                    escaped = not escaped
                    i -= 1
                return escaped

            tokens = [token]
            nestlevel = 1
            for token in primitive:
                tokens.append(token)
                if token in '()' and not escaped():
                    # +1 for '(' (True), -1 for ')'.
                    nestlevel += token == '(' or -1
                    if not nestlevel:
                        break
            else:
                assert 0, "Unexpected end of token stream"
            return PdfString(''.join(tokens))

        def hex_string(token):
            parts = [token]
            for token in primitive:
                parts.append(token)
                if token == '>':
                    break
            # The primitive tokenizer looks for '>>' before '>', so a
            # hex string directly followed by a dictionary end, e.g.
            # "<abc>>>", arrives as '>>' and then '>'.  The string ends
            # at the first '>'; every '>>' seen just before the final
            # '>' belongs to the enclosing dictionaries.  Queue those on
            # our own token stack so they are returned after the string
            # instead of being glued onto it.  The length check keeps a
            # truncated string from raising IndexError.
            while len(parts) >= 2 and parts[-2] == '>>':
                tokens.append(parts.pop(-2))
            return PdfString(''.join(parts))

        def normal_data(token):
            # We can get here with whitespace or with regular character
            # data.  Regular data normally still has the separator that
            # followed it queued up in the primitive object; whitespace
            # never does.  The last token of the data (and a stray
            # backslash) has nothing queued behind it either, so also
            # test the token itself rather than silently dropping it.
            # Whitespace falls through and returns None, which the
            # iterator skips.
            if primitive_tokens or token[0] not in whitespaceset:
                tokens = [token]
                primitive.coalesce(tokens)
                return PdfObject(''.join(tokens))

        def name_string(token):
            tokens = [token]
            primitive.coalesce(tokens)
            token = ''.join(tokens)
            if '#' in token:
                # Replace each '#xx' escape by the character whose
                # hexadecimal code is xx.
                substrs = token.split('#')
                substrs.reverse()
                tokens = [substrs.pop()]
                while substrs:
                    s = substrs.pop()
                    tokens.append(chr(int(s[:2], 16)))
                    tokens.append(s[2:])
                token = ''.join(tokens)
            return PdfObject(token)

        def broken(token):
            # A closing delimiter without a matching opener.
            assert 0, token

        dispatch = {
            '(': regular_string,
            ')': broken,
            '<': hex_string,
            '>': broken,
            '[': single,
            ']': single,
            '{': single,
            '}': single,
            '/': name_string,
            '%': comment,
            '<<': single,
            '>>': single,
        }.get

        class MyIterator(object):
            def next():
                # Tokens queued on self.tokens take priority over
                # reading further primitive tokens.
                while not tokens:
                    token = primitive_next()
                    token = dispatch(token, normal_data)(token)
                    if token:
                        return token
                return tokens.pop()
            next = staticmethod(next)
            # Python 3 spells the iterator method __next__; the alias
            # is harmless on Python 2.
            __next__ = next

        self.primitive = primitive = _PrimitiveTokens(fdata)
        self.setstart = primitive.setstart
        primitive.setstart(startloc)
        self.fdata = fdata
        self.strip_comments = strip_comments
        self.tokens = tokens = []
        self.iterator = iterator = MyIterator()
        self.next = iterator.next
        primitive_next = primitive.next
        primitive_tokens = primitive.tokens
        whitespaceset = _PrimitiveTokens.whitespaceset

    def floc(self):
        ''' Offset in fdata of the next token that will be returned. '''
        return self.primitive.floc() - sum([len(x) for x in self.tokens])
    floc = property(floc)

    def __iter__(self):
        return self.iterator

    def multiple(self, count):
        ''' Return a list holding the next 'count' tokens. '''
        next = self.next
        return [next() for i in range(count)]
diff --git a/lib/pdfrw/pdfwriter.py b/lib/pdfrw/pdfwriter.py new file mode 100755 index 0000000..c193843 --- /dev/null +++ b/lib/pdfrw/pdfwriter.py | |||
| @@ -0,0 +1,234 @@ | |||
| 1 | #!/usr/bin/env python | ||
| 2 | |||
| 3 | # A part of pdfrw (pdfrw.googlecode.com) | ||
| 4 | # Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas | ||
| 5 | # MIT license -- See LICENSE.txt for details | ||
| 6 | |||
| 7 | ''' | ||
| 8 | The PdfWriter class writes an entire PDF file out to disk. | ||
| 9 | |||
| 10 | The writing process is not at all optimized or organized. | ||
| 11 | |||
| 12 | An instance of the PdfWriter class has two methods: | ||
| 13 | addpage(page) | ||
| 14 | and | ||
| 15 | write(fname) | ||
| 16 | |||
| 17 | addpage() assumes that the pages are part of a valid | ||
| 18 | tree/forest of PDF objects. | ||
| 19 | ''' | ||
| 20 | |||
| 21 | try: | ||
| 22 | set | ||
| 23 | except NameError: | ||
| 24 | from sets import Set as set | ||
| 25 | |||
| 26 | from pdfobjects import PdfName, PdfArray, PdfDict, IndirectPdfDict, PdfObject, PdfString | ||
| 27 | from pdfcompress import compress | ||
| 28 | |||
| 29 | debug = False | ||
| 30 | |||
| 31 | class FormatObjects(object): | ||
| 32 | ''' FormatObjects performs the actual formatting and disk write. | ||
| 33 | ''' | ||
| 34 | |||
| 35 | def add(self, obj, visited): | ||
| 36 | ''' Add an object to our list, if it's an indirect | ||
| 37 | object. Just format it if not. | ||
| 38 | ''' | ||
| 39 | # Can't hash dicts, so just hash the object ID | ||
| 40 | objid = id(obj) | ||
| 41 | |||
| 42 | # Automatically set stream objects to indirect | ||
| 43 | if isinstance(obj, PdfDict): | ||
| 44 | indirect = obj.indirect or (obj.stream is not None) | ||
| 45 | else: | ||
| 46 | indirect = getattr(obj, 'indirect', False) | ||
| 47 | |||
| 48 | if not indirect: | ||
| 49 | assert objid not in visited, \ | ||
| 50 | 'Circular reference encountered in non-indirect object %s' % repr(obj) | ||
| 51 | visited.add(objid) | ||
| 52 | result = self.format_obj(obj, visited) | ||
| 53 | visited.remove(objid) | ||
| 54 | return result | ||
| 55 | |||
| 56 | objnum = self.indirect_dict.get(objid) | ||
| 57 | |||
| 58 | # If we haven't seen the object yet, we need to | ||
| 59 | # add it to the indirect object list. | ||
| 60 | if objnum is None: | ||
| 61 | objlist = self.objlist | ||
| 62 | objnum = len(objlist) + 1 | ||
| 63 | if debug: | ||
| 64 | print ' Object', objnum, '\r', | ||
| 65 | objlist.append(None) | ||
| 66 | self.indirect_dict[objid] = objnum | ||
| 67 | objlist[objnum-1] = self.format_obj(obj) | ||
| 68 | return '%s 0 R' % objnum | ||
| 69 | |||
| 70 | def format_array(myarray, formatter): | ||
| 71 | # Format array data into semi-readable ASCII | ||
| 72 | if sum([len(x) for x in myarray]) <= 70: | ||
| 73 | return formatter % ' '.join(myarray) | ||
| 74 | bigarray = [] | ||
| 75 | count = 1000000 | ||
| 76 | for x in myarray: | ||
| 77 | lenx = len(x) | ||
| 78 | if lenx + count > 70: | ||
| 79 | subarray = [] | ||
| 80 | bigarray.append(subarray) | ||
| 81 | count = 0 | ||
| 82 | count += lenx + 1 | ||
| 83 | subarray.append(x) | ||
| 84 | return formatter % '\n '.join([' '.join(x) for x in bigarray]) | ||
| 85 | format_array = staticmethod(format_array) | ||
| 86 | |||
| 87 | def format_obj(self, obj, visited=None): | ||
| 88 | ''' format PDF object data into semi-readable ASCII. | ||
| 89 | May mutually recurse with add() -- add() will | ||
| 90 | return references for indirect objects, and add | ||
| 91 | the indirect object to the list. | ||
| 92 | ''' | ||
| 93 | if visited is None: | ||
| 94 | visited = set() | ||
| 95 | if isinstance(obj, PdfArray): | ||
| 96 | myarray = [self.add(x, visited) for x in obj] | ||
| 97 | return self.format_array(myarray, '[%s]') | ||
| 98 | elif isinstance(obj, PdfDict): | ||
| 99 | if self.compress and obj.stream: | ||
| 100 | compress([obj]) | ||
| 101 | myarray = [] | ||
| 102 | # Jython 2.2.1 has a bug which segfaults when | ||
| 103 | # sorting subclassed strings, so we un-subclass them. | ||
| 104 | dictkeys = [str(x) for x in obj.iterkeys()] | ||
| 105 | dictkeys.sort() | ||
| 106 | for key in dictkeys: | ||
| 107 | myarray.append(key) | ||
| 108 | myarray.append(self.add(obj[key], visited)) | ||
| 109 | result = self.format_array(myarray, '<<%s>>') | ||
| 110 | stream = obj.stream | ||
| 111 | if stream is not None: | ||
| 112 | result = '%s\nstream\n%s\nendstream' % (result, stream) | ||
| 113 | return result | ||
| 114 | elif isinstance(obj, basestring) and not hasattr(obj, 'indirect'): | ||
| 115 | return PdfString.encode(obj) | ||
| 116 | else: | ||
| 117 | return str(obj) | ||
| 118 | |||
| 119 | def dump(cls, f, trailer, version='1.3', compress=True): | ||
| 120 | self = cls() | ||
| 121 | self.compress = compress | ||
| 122 | self.indirect_dict = {} | ||
| 123 | self.objlist = [] | ||
| 124 | |||
| 125 | # The first format of trailer gets all the information, | ||
| 126 | # but we throw away the actual trailer formatting. | ||
| 127 | self.format_obj(trailer) | ||
| 128 | # Now we know the size, so we update the trailer dict | ||
| 129 | # and get the formatted data. | ||
| 130 | trailer.Size = PdfObject(len(self.objlist) + 1) | ||
| 131 | trailer = self.format_obj(trailer) | ||
| 132 | |||
| 133 | # Now we have all the pieces to write out to the file. | ||
| 134 | # Keep careful track of the counts while we do it so | ||
| 135 | # we can correctly build the cross-reference. | ||
| 136 | |||
| 137 | header = '%%PDF-%s\n%%\xe2\xe3\xcf\xd3\n' % version | ||
| 138 | f.write(header) | ||
| 139 | offset = len(header) | ||
| 140 | offsets = [(0, 65535, 'f')] | ||
| 141 | |||
| 142 | for i, x in enumerate(self.objlist): | ||
| 143 | objstr = '%s 0 obj\n%s\nendobj\n' % (i + 1, x) | ||
| 144 | offsets.append((offset, 0, 'n')) | ||
| 145 | offset += len(objstr) | ||
| 146 | f.write(objstr) | ||
| 147 | |||
| 148 | f.write('xref\n0 %s\n' % len(offsets)) | ||
| 149 | for x in offsets: | ||
| 150 | f.write('%010d %05d %s\r\n' % x) | ||
| 151 | f.write('trailer\n\n%s\nstartxref\n%s\n%%%%EOF\n' % (trailer, offset)) | ||
| 152 | dump = classmethod(dump) | ||
| 153 | |||
class PdfWriter(object):
    ''' Collects pages and writes them out as a new PDF file.

        Pages are added with addpage() / addpages(); write() builds
        the trailer (unless one was supplied) and dumps the file.
    '''

    # Cached trailer; reset to None whenever a page is added.
    _trailer = None

    def __init__(self, version='1.3', compress=True):
        ''' version  -- version string written into the file header
            compress -- passed on to FormatObjects.dump()
        '''
        self.pagearray = PdfArray()
        self.compress = compress
        self.version = version

    def addpage(self, page):
        ''' Append a copy of page to the output and return self.

            The copy carries the Resources, MediaBox, CropBox and
            Rotate values found through page.inheritable.
        '''
        self._trailer = None
        assert page.Type == PdfName.Page
        inheritable = page.inheritable  # searches for resources
        self.pagearray.append(
            IndirectPdfDict(
                page,
                Resources = inheritable.Resources,
                MediaBox = inheritable.MediaBox,
                CropBox = inheritable.CropBox,
                Rotate = inheritable.Rotate,
            )
        )
        return self

    addPage = addpage  # for compatibility with pyPdf

    def addpages(self, pagelist):
        ''' Call addpage() for every page in pagelist; return self. '''
        for page in pagelist:
            self.addpage(page)
        return self

    def _get_trailer(self):
        trailer = self._trailer
        if trailer is not None:
            return trailer

        # Create the basic object structure of the PDF file
        trailer = PdfDict(
            Root = IndirectPdfDict(
                Type = PdfName.Catalog,
                Pages = IndirectPdfDict(
                    Type = PdfName.Pages,
                    Count = PdfObject(len(self.pagearray)),
                    Kids = self.pagearray
                )
            )
        )
        # Make all the pages point back to the page dictionary
        pagedict = trailer.Root.Pages
        for page in pagedict.Kids:
            page.Parent = pagedict
        self._trailer = trailer
        return trailer

    def _set_trailer(self, trailer):
        self._trailer = trailer

    trailer = property(_get_trailer, _set_trailer)

    def write(self, fname, trailer=None):
        ''' Write the PDF file.

            fname   -- a file name, or an object with a write() method
            trailer -- trailer to dump; defaults to self.trailer

            A file opened here is always closed again, even when
            dumping fails.  A file object passed in is left open.
        '''
        trailer = trailer or self.trailer

        # Dump the data.  We either have a filename or a preexisting
        # file object.  An explicit if/else is used so that a file
        # object which happens to be falsy is never passed to open().
        preexisting = hasattr(fname, 'write')
        if preexisting:
            f = fname
        else:
            f = open(fname, 'wb')
        try:
            FormatObjects.dump(f, trailer, self.version, self.compress)
        finally:
            if not preexisting:
                f.close()
| 223 | |||
| 224 | if __name__ == '__main__': | ||
| 225 | debug = True | ||
| 226 | import pdfreader | ||
| 227 | x = pdfreader.PdfReader('source.pdf') | ||
| 228 | y = PdfWriter() | ||
| 229 | for i, page in enumerate(x.pages): | ||
| 230 | print ' Adding page', i+1, '\r', | ||
| 231 | y.addpage(page) | ||
| 232 | |||
| 233 | y.write('result.pdf') | ||
| 234 | |||
diff --git a/lib/pdfrw/toreportlab.py b/lib/pdfrw/toreportlab.py new file mode 100644 index 0000000..00ad324 --- /dev/null +++ b/lib/pdfrw/toreportlab.py | |||
| @@ -0,0 +1,139 @@ | |||
| 1 | # A part of pdfrw (pdfrw.googlecode.com) | ||
| 2 | # Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas | ||
| 3 | # MIT license -- See LICENSE.txt for details | ||
| 4 | |||
| 5 | ''' | ||
| 6 | Converts pdfrw objects into reportlab objects. | ||
| 7 | |||
| 8 | Designed for and tested with rl 2.3. | ||
| 9 | |||
| 10 | Knows too much about reportlab internals. | ||
| 11 | What can you do? | ||
| 12 | |||
| 13 | The interface to this function is through the makerl() function. | ||
| 14 | |||
| 15 | Parameters: | ||
| 16 | canv - a reportlab "canvas" (also accepts a "document") | ||
| 17 | pdfobj - a pdfrw PDF object | ||
| 18 | |||
| 19 | Returns: | ||
| 20 | A corresponding reportlab object, or if the | ||
| 21 | object is a PDF Form XObject, the name to | ||
| 22 | use with reportlab for the object. | ||
| 23 | |||
| 24 | Will recursively convert all necessary objects. | ||
| 25 | Be careful when converting a page -- if /Parent is set, | ||
| 26 | will recursively convert all pages! | ||
| 27 | |||
| 28 | Notes: | ||
| 29 | 1) Original objects are annotated with a | ||
| 30 | derived_rl_obj attribute which points to the | ||
| 31 | reportlab object. This keeps multiple reportlab | ||
| 32 | objects from being generated for the same pdfobj | ||
| 33 | via repeated calls to makerl. This is great for | ||
| 34 | not putting too many objects into the | ||
| 35 | new PDF, but not so good if you are modifying | ||
| 36 | objects for different pages. Then you | ||
| 37 | need to do your own deep copying (of circular | ||
| 38 | structures). You're on your own. | ||
| 39 | |||
| 40 | 2) ReportLab seems weird about FormXObjects. | ||
| 41 | They pass around a partial name instead of the | ||
| 42 | object or a reference to it. So we have to | ||
| 43 | reach into reportlab and get a number for | ||
| 44 | a unique name. I guess this is to make it | ||
| 45 | where you can combine page streams with | ||
| 46 | impunity, but that's just a guess. | ||
| 47 | |||
| 48 | 3) Updated 1/23/2010 to handle multipass documents | ||
| 49 | (e.g. with a table of contents). These have | ||
| 50 | a different doc object on every pass. | ||
| 51 | |||
| 52 | ''' | ||
| 53 | |||
| 54 | from reportlab.pdfbase import pdfdoc as rldocmodule | ||
| 55 | from pdfobjects import PdfDict, PdfArray, PdfName | ||
| 56 | |||
| 57 | RLStream = rldocmodule.PDFStream | ||
| 58 | RLDict = rldocmodule.PDFDictionary | ||
| 59 | RLArray = rldocmodule.PDFArray | ||
| 60 | |||
| 61 | |||
def _makedict(rldoc, pdfobj):
    ''' Build the reportlab dictionary for pdfobj and return it, or
        return a reference to it when pdfobj is indirect.
    '''
    rldict = RLDict()
    if pdfobj.indirect:
        rldict.__RefOnly__ = 1
        result = rldoc.Reference(rldict)
    else:
        result = rldict
    # Record the result before recursing, so that objects pointing
    # back at pdfobj find it instead of converting it again.
    pdfobj.derived_rl_obj[rldoc] = result, None

    for key, value in pdfobj.iteritems():
        # key[1:] drops the first character of the key
        rldict[key[1:]] = makerl_recurse(rldoc, value)

    return result
| 73 | |||
def _makestream(rldoc, pdfobj, xobjtype=PdfName.XObject):
    ''' Build the reportlab stream for pdfobj and return a reference
        to it.  When pdfobj.Type equals xobjtype, the reference is
        registered under a name obtained from rldoc.getXObjectName().
    '''
    rldict = RLDict()
    rlstream = RLStream(rldict, pdfobj.stream)

    shortname = fullname = None
    if pdfobj.Type == xobjtype:
        shortname = 'pdfrw_%s' % (rldoc.objectcounter+1)
        fullname = rldoc.getXObjectName(shortname)
    reference = rldoc.Reference(rlstream, fullname)
    # Record the result before recursing, so that objects pointing
    # back at pdfobj find it instead of converting it again.
    pdfobj.derived_rl_obj[rldoc] = reference, shortname

    for key, value in pdfobj.iteritems():
        # key[1:] drops the first character of the key
        rldict[key[1:]] = makerl_recurse(rldoc, value)

    return reference
| 90 | |||
def _makearray(rldoc, pdfobj):
    ''' Build the reportlab array for pdfobj and return it, or
        return a reference to it when pdfobj is indirect.
    '''
    rlarray = RLArray([])
    if pdfobj.indirect:
        rlarray.__RefOnly__ = 1
        result = rldoc.Reference(rlarray)
    else:
        result = rlarray
    # Record the result before recursing, so that objects pointing
    # back at pdfobj find it instead of converting it again.
    pdfobj.derived_rl_obj[rldoc] = result, None

    converted = rlarray.sequence
    for element in pdfobj:
        converted.append(makerl_recurse(rldoc, element))

    return result
| 103 | |||
| 104 | def _makestr(rldoc, pdfobj): | ||
| 105 | assert isinstance(pdfobj, (float, int, str)), repr(pdfobj) | ||
| 106 | return pdfobj | ||
| 107 | |||
def makerl_recurse(rldoc, pdfobj):
    ''' Convert pdfobj for the document rldoc and return the result.

        A conversion already recorded for rldoc in
        pdfobj.derived_rl_obj is reused.  Otherwise the object is
        handed to _makestream, _makedict, _makearray or _makestr
        according to its type.
    '''
    cache = getattr(pdfobj, 'derived_rl_obj', None)
    uncached = cache is None
    if not uncached:
        hit = cache.get(rldoc)
        if hit is not None:
            return hit[0]

    if isinstance(pdfobj, PdfDict):
        has_stream = pdfobj.stream is not None
        if uncached:
            pdfobj.private.derived_rl_obj = {}
        if has_stream:
            return _makestream(rldoc, pdfobj)
        return _makedict(rldoc, pdfobj)

    if isinstance(pdfobj, PdfArray):
        if uncached:
            pdfobj.derived_rl_obj = {}
        return _makearray(rldoc, pdfobj)

    return _makestr(rldoc, pdfobj)
| 128 | |||
def makerl(canv, pdfobj):
    ''' Convert pdfobj for use with canv.

        canv may be an object with a _doc attribute, or the document
        itself.  Returns the name recorded for the converted object
        when there is one, otherwise the converted object.
    '''
    # Fall back to canv itself when it has no _doc attribute
    rldoc = getattr(canv, '_doc', canv)
    rlobj = makerl_recurse(rldoc, pdfobj)
    try:
        name = pdfobj.derived_rl_obj[rldoc][1]
    except AttributeError:
        # Nothing was recorded on this object
        return rlobj
    return name or rlobj
diff --git a/lib/sounds.py b/lib/sounds.py new file mode 100644 index 0000000..a4bf5b6 --- /dev/null +++ b/lib/sounds.py | |||
| @@ -0,0 +1 @@ | |||
| import parser | |||
