summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorjvoisin2011-06-21 20:41:18 +0200
committerjvoisin2011-06-21 20:41:18 +0200
commit9e69adbe1b065707f8be4f146cc3c05660cef711 (patch)
treed60509a4982d7699204059184c4343352fef52de /lib
parentf0c9c5b56e3909ba36cc84ff82b05fab9a180911 (diff)
Add pdfrw, and many files that I had forgotten, sorry!
Diffstat (limited to 'lib')
-rw-r--r--lib/archive.py6
-rw-r--r--lib/audio.py8
-rw-r--r--lib/mat.py2
-rw-r--r--lib/misc.py44
-rw-r--r--lib/pdfrw/__init__.py13
-rw-r--r--lib/pdfrw/buildxobj.py191
-rw-r--r--lib/pdfrw/pdfcompress.py57
-rw-r--r--lib/pdfrw/pdfobjects.py183
-rw-r--r--lib/pdfrw/pdfreader.py213
-rw-r--r--lib/pdfrw/pdftokens.py249
-rwxr-xr-xlib/pdfrw/pdfwriter.py234
-rw-r--r--lib/pdfrw/toreportlab.py139
-rw-r--r--lib/sounds.py1
13 files changed, 1340 insertions, 0 deletions
diff --git a/lib/archive.py b/lib/archive.py
new file mode 100644
index 0000000..6378cab
--- /dev/null
+++ b/lib/archive.py
@@ -0,0 +1,6 @@
1import parser
2
class TarStripper(parser.Generic_parser):
    '''
        Stripper registered for tar archives.
    '''
    def remove_all(self):
        '''
            Print the name of every "file" entry exposed by the editor.
            Nothing is removed from the archive by this method.
        '''
        for member in self.editor.array("file"):
            # A parenthesised single value prints the same text with
            # the Python 2 print statement.
            print(member.name)
diff --git a/lib/audio.py b/lib/audio.py
new file mode 100644
index 0000000..6d653bc
--- /dev/null
+++ b/lib/audio.py
@@ -0,0 +1,8 @@
1import parser
2
class MpegAudioStripper(parser.Generic_parser):
    '''
        Stripper registered for MPEG audio files.
    '''
    def _should_remove(self, field):
        '''
            Return True when the field is one of the ID3 tag
            containers ("id3v1" or "id3v2"), False otherwise.
        '''
        return field.name in ("id3v1", "id3v2")
diff --git a/lib/mat.py b/lib/mat.py
index 3cbd81b..a9b8e17 100644
--- a/lib/mat.py
+++ b/lib/mat.py
@@ -14,6 +14,7 @@ import hachoir_editor
14import images 14import images
15import audio 15import audio
16import misc 16import misc
17import archive
17 18
18__version__ = "0.1" 19__version__ = "0.1"
19__author__ = "jvoisin" 20__author__ = "jvoisin"
@@ -23,6 +24,7 @@ strippers = {
23 hachoir_parser.image.PngFile: images.PngStripper, 24 hachoir_parser.image.PngFile: images.PngStripper,
24 hachoir_parser.audio.MpegAudioFile: audio.MpegAudioStripper, 25 hachoir_parser.audio.MpegAudioFile: audio.MpegAudioStripper,
25 hachoir_parser.misc.PDFDocument: misc.PdfStripper, 26 hachoir_parser.misc.PDFDocument: misc.PdfStripper,
27 hachoir_parser.archive.TarFile: archive.TarStripper,
26} 28}
27 29
28def create_class_file(name): 30def create_class_file(name):
diff --git a/lib/misc.py b/lib/misc.py
new file mode 100644
index 0000000..56c2274
--- /dev/null
+++ b/lib/misc.py
@@ -0,0 +1,44 @@
1import parser
2import pdfrw
3
class PdfStripper(parser.Generic_parser):
    '''
        Represent a pdf file, with the help of pdfrw
    '''
    def __init__(self, filename):
        '''
            Parse `filename` with pdfrw and prepare a writer
            for the cleaned copy.
        '''
        self.filename = filename
        self.trailer = pdfrw.PdfReader(self.filename)
        self.writer = pdfrw.PdfWriter()

    def remove_all(self):
        '''
            Blank the usual fields of the document information
            dictionary and write the result to
            filename + parser.POSTFIX
        '''
        info = self.trailer.Info
        # /Info is optional in a PDF trailer: the attribute lookup
        # returns None when the entry is absent.
        if info is not None:
            for name in ('Title', 'Author', 'Producer', 'Creator',
                         'CreationDate', 'ModDate'):
                setattr(info, name, '')

        self.writer.trailer = self.trailer
        self.writer.write(self.filename + parser.POSTFIX)

    def is_clean(self):
        '''
            Check if the file is clean from harmful metadata.
            Return True when every value of the information
            dictionary is empty (or when there is no such dictionary).
        '''
        info = self.trailer.Info
        if info is None:
            return True
        # Compare the *values*: iterating the dict itself yields the
        # keys ('/Title', ...), which are never empty.
        for key, value in info.iteritems():
            if value != '':
                return False
        return True

    def get_meta(self):
        '''
            Return a dict with all the metadata of the file.
            The leading '/' of each key and the first and last
            character of each value are stripped.
        '''
        metadata = {}
        info = self.trailer.Info
        if info is None:
            return metadata
        for key, value in info.iteritems():
            metadata[key[1:]] = value[1:-1]
        return metadata
44
diff --git a/lib/pdfrw/__init__.py b/lib/pdfrw/__init__.py
new file mode 100644
index 0000000..964972f
--- /dev/null
+++ b/lib/pdfrw/__init__.py
@@ -0,0 +1,13 @@
1# A part of pdfrw (pdfrw.googlecode.com)
2# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
3# MIT license -- See LICENSE.txt for details
4
5from pdfwriter import PdfWriter
6from pdfreader import PdfReader
7from pdfobjects import PdfObject, PdfName, PdfArray, PdfDict, IndirectPdfDict, PdfString
8from pdftokens import PdfTokens
9
10# Add a tiny bit of compatibility to pyPdf
11
12PdfFileReader = PdfReader
13PdfFileWriter = PdfWriter
diff --git a/lib/pdfrw/buildxobj.py b/lib/pdfrw/buildxobj.py
new file mode 100644
index 0000000..203dd8c
--- /dev/null
+++ b/lib/pdfrw/buildxobj.py
@@ -0,0 +1,191 @@
1# A part of pdfrw (pdfrw.googlecode.com)
2# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
3# MIT license -- See LICENSE.txt for details
4
5'''
6
7This module contains code to build PDF "Form XObjects".
8
9A Form XObject allows a fragment from one PDF file to be cleanly
10included in another PDF file.
11
12Reference for syntax: "Parameters for opening PDF files" from SDK 8.1
13
14 http://www.adobe.com/devnet/acrobat/pdfs/pdf_open_parameters.pdf
15
16 supported 'page=xxx', 'viewrect=<left>,<top>,<width>,<height>'
17
18 Units are in points
19
20Reference for content: Adobe PDF reference, sixth edition, version 1.7
21
22 http://www.adobe.com/devnet/acrobat/pdfs/pdf_reference_1-7.pdf
23
24 Form xobjects discussed chapter 4.9, page 355
25'''
26
27from pdfobjects import PdfDict, PdfArray, PdfName
28from pdfreader import PdfReader
29
class ViewInfo(object):
    ''' Instantiate ViewInfo with a uri, and it will parse out
        the filename, page, and viewrect into object attributes.

        The uri looks like "name.pdf#page=3&viewrect=10,20,30,40";
        the options after '#' may be separated by '&' or '#'.
        Keyword arguments override any of the class-level attributes
        (doc, docname, page, viewrect).
    '''
    doc = None
    docname = None
    page = None
    viewrect = None

    def __init__(self, pageinfo='', **kw):
        pageinfo = pageinfo.split('#', 1)
        if len(pageinfo) == 2:
            # Normalise '&' separators so a single split handles both.
            pageinfo[1:] = pageinfo[1].replace('&', '#').split('#')
        for key in 'page viewrect'.split():
            if pageinfo[0].startswith(key + '='):
                # First element is already an option: no document name.
                break
        else:
            self.docname = pageinfo.pop(0)
        for item in pageinfo:
            key, value = item.split('=')
            key = key.strip()
            value = value.replace(',', ' ').split()
            if key == 'page':
                assert len(value) == 1
                setattr(self, key, int(value[0]))
            elif key == 'viewrect':
                assert len(value) == 4
                setattr(self, key, [float(x) for x in value])
            else:
                # The module never defined a 'log' object, so use the
                # standard logging package to report the bad option.
                import logging
                logging.getLogger(__name__).error('Unknown option: %s', key)
        for key, value in kw.items():
            # Only known attributes may be overridden.
            assert hasattr(self, key), key
            setattr(self, key, value)
63
def getrects(inheritable, pageinfo):
    ''' Compute the boxes used to build a Form XObject.

        Returns a (mediabox, clipbox) pair of 4-tuples of floats.
        Without a viewrect the clip box is the page's CropBox
        (falling back to the MediaBox); with one, the rectangle
        is measured from the top-left corner of the MediaBox and
        clamped to it.
    '''
    mbox = tuple(map(float, inheritable.MediaBox))
    viewrect = pageinfo.viewrect
    if viewrect is not None:
        mleft, mbot, mright, mtop = mbox
        xoff, yoff, width, height = viewrect
        # viewrect is (left, top, width, height), y growing downwards.
        cleft = mleft + xoff
        ctop = mtop - yoff
        cright = cleft + width
        cbot = ctop - height
        clipped = (max(mleft, cleft), max(mbot, cbot),
                   min(mright, cright), min(mtop, ctop))
        return mbox, clipped
    cropsource = inheritable.CropBox or mbox
    return mbox, tuple(map(float, cropsource))
82
def _cache_xobj(contents, resources, mbox, bbox):
    ''' Look up the Form XObject for bbox in the per-contents cache,
        building and storing it on a miss.

        The cache is a plain dict kept as private (non-PDF) data
        on the contents object, keyed by bounding box.
    '''
    cache = contents.xobj_cachedict
    if cache is None:
        cache = contents.private.xobj_cachedict = {}
    xobj = cache.get(bbox)
    if xobj is not None:
        return xobj

    # A view covering the whole media box is the page itself;
    # anything else is a sub-page referencing the full page.
    if mbox != bbox:
        builder = _get_subpage
    else:
        builder = _get_fullpage
    xobj = PdfDict(
        builder(contents, resources, mbox, bbox),
        Type = PdfName.XObject,
        Subtype = PdfName.Form,
        FormType = 1,
        BBox = PdfArray(bbox),
    )
    cache[bbox] = xobj
    return xobj
101
def _get_fullpage(contents, resources, mbox, bbox):
    ''' Build the dictionary for a whole-page XObject: a copy of
        the contents with the resources attached. _cache_xobj
        adds the remaining Form XObject entries.
        (mbox and bbox are accepted for signature parity with
        _get_subpage and are not used.)
    '''
    fullpage = PdfDict(contents, Resources=resources)
    return fullpage
108
def _get_subpage(contents, resources, mbox, bbox):
    ''' Build the dictionary for a partial-page XObject.

        Rather than copying the page, the result is a tiny stream
        that draws a (cached) full-page Form XObject, so several
        views of one page can share it.
    '''
    fullpage = _cache_xobj(contents, resources, mbox, mbox)
    xobjects = PdfDict(FullPage = fullpage)
    subresources = PdfDict(XObject = xobjects)
    return PdfDict(
        stream = '/FullPage Do\n',
        Resources = subresources,
    )
124
def pagexobj(page, viewinfo=ViewInfo(), allow_compressed=True):
    ''' Return a Form XObject for the requested view of a page
        (the entire page when no view is given).

        With allow_compressed false, the contents dictionary must
        hold exactly one entry, i.e. no filter may remain.
    '''
    inherited = page.inheritable
    resources = inherited.Resources
    mbox, bbox = getrects(inherited, viewinfo)
    contents = page.Contents
    # The declared length must agree with the stream we hold.
    assert int(contents.Length) == len(contents.stream)
    if not allow_compressed:
        # Only the length entry may remain once every filter ran.
        assert len(list(contents.iteritems())) == 1

    return _cache_xobj(contents, resources, mbox, bbox)
140
141
def docxobj(pageinfo, doc=None, allow_compressed=True):
    ''' Create and return a Form XObject for a page of a document.
        Usable on its own or through the CacheXObj class.

        pageinfo is either a ViewInfo or a uri to build one from.
        The document comes from the doc argument, from pageinfo.doc,
        or is read from pageinfo.docname, in that order; passing
        doc while pageinfo already carries one is an error.
    '''
    if not isinstance(pageinfo, ViewInfo):
        pageinfo = ViewInfo(pageinfo)

    if doc is None:
        doc = pageinfo.doc
        if doc is None:
            # Neither explicit nor implicit document: read the file.
            doc = PdfReader(pageinfo.docname,
                            decompress = not allow_compressed)
            pageinfo.doc = doc
    else:
        # An explicit document must not clash with an implicit one.
        assert pageinfo.doc is None
        pageinfo.doc = doc
    assert isinstance(doc, PdfReader)

    # Page numbers in the uri are 1-based; default to the first page.
    pagenum = pageinfo.page or 1
    return pagexobj(doc.pages[pagenum - 1], pageinfo, allow_compressed)
165
166
class CacheXObj(object):
    ''' Remembers the documents it has parsed, so that a file is
        read only once however many Form XObjects are taken from
        it, and so that the same document object is reused for them.
    '''
    def __init__(self, decompress=False):
        ''' decompress: pass a true value when the Form XObjects
            must be decompressed; it is forwarded to PdfReader.
        '''
        self.decompress = decompress
        self.cached_pdfs = {}

    def load(self, sourcename):
        ''' Return the Form XObject described by a uri, parsing the
            underlying file on first use only.
        '''
        viewinfo = ViewInfo(sourcename)
        docname = viewinfo.docname
        known = self.cached_pdfs
        reader = known.get(docname)
        if reader is None:
            reader = PdfReader(docname, decompress=self.decompress)
            known[docname] = reader
        return docxobj(viewinfo, reader,
                       allow_compressed=not self.decompress)
diff --git a/lib/pdfrw/pdfcompress.py b/lib/pdfrw/pdfcompress.py
new file mode 100644
index 0000000..1c11970
--- /dev/null
+++ b/lib/pdfrw/pdfcompress.py
@@ -0,0 +1,57 @@
1# A part of pdfrw (pdfrw.googlecode.com)
2# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
3# MIT license -- See LICENSE.txt for details
4
5'''
6Currently, this sad little file only knows how to decompress
7using the flate (zlib) algorithm. Maybe more later, but it's
8not a priority for me...
9'''
10
11from __future__ import generators
12
13try:
14 set
15except NameError:
16 from sets import Set as set
17
18import zlib
19from pdfobjects import PdfDict, PdfName
20
21
def streamobjects(mylist):
    ''' Yield, in order, the items of mylist that are PdfDict
        instances carrying a stream.
    '''
    for candidate in mylist:
        if not isinstance(candidate, PdfDict):
            continue
        if candidate.stream is None:
            continue
        yield candidate
26
def uncompress(mylist, warnings=set()):
    ''' Inflate, in place, every stream object of mylist that uses
        only the FlateDecode filter without decode parameters.

        Other filtered streams are left untouched and reported on
        standard output. The default warnings set is shared between
        calls, so each distinct message is printed once.
    '''
    flate = PdfName.FlateDecode
    for obj in streamobjects(mylist):
        ftype = obj.Filter
        if ftype is None:
            # Not filtered: nothing to do.
            continue
        if isinstance(ftype, list) and len(ftype) == 1:
            # todo: multiple filters
            ftype = ftype[0]
        parms = obj.DecodeParms
        if ftype == flate and parms is None:
            obj.stream = zlib.decompress(obj.stream)
            obj.Filter = None
            continue
        msg = 'Not decompressing: cannot use filter %s with parameters %s' % (repr(ftype), repr(parms))
        if msg not in warnings:
            warnings.add(msg)
            # Same output as the Python 2 print statement.
            print(msg)
45
def compress(mylist):
    ''' Deflate, in place, the unfiltered stream objects of mylist.

        The compressed form replaces the stream only when it is
        shorter than the original length plus 30 bytes.
    '''
    flate = PdfName.FlateDecode
    for obj in streamobjects(mylist):
        if obj.Filter is not None:
            # Already filtered: leave it alone.
            continue
        raw = obj.stream
        packed = zlib.compress(raw)
        if len(packed) >= len(raw) + 30:
            continue
        obj.stream = packed
        obj.Filter = flate
        obj.DecodeParms = None
diff --git a/lib/pdfrw/pdfobjects.py b/lib/pdfrw/pdfobjects.py
new file mode 100644
index 0000000..08ad825
--- /dev/null
+++ b/lib/pdfrw/pdfobjects.py
@@ -0,0 +1,183 @@
1# A part of pdfrw (pdfrw.googlecode.com)
2# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
3# MIT license -- See LICENSE.txt for details
4
5'''
6Objects that can occur in PDF files. The most important
7objects are arrays and dicts. Either of these can be
8indirect or not, and dicts could have an associated
9stream.
10'''
11from __future__ import generators
12
13try:
14 set
15except NameError:
16 from sets import Set as set
17
18import re
19
class PdfObject(str):
    ''' A string subclass used for PDF tokens; instances can
        carry an `indirect` flag (False unless set).
    '''
    indirect = False
22
class PdfArray(list):
    ''' A list subclass used for PDF arrays; instances can
        carry an `indirect` flag (False unless set).
    '''
    indirect = False
25
class PdfName(object):
    ''' Factory for PDF names: both PdfName.Foo and PdfName('Foo')
        give the PdfObject '/Foo'.
    '''
    def __call__(self, name):
        return PdfObject('/' + name)

    def __getattr__(self, name):
        # Attribute access is just another spelling of the call.
        return self.__call__(name)

# The class is replaced by its single instance.
PdfName = PdfName()
33
class PdfString(str):
    ''' A PDF string token, kept in its encoded form: either a
        literal string "(...)" or a hexadecimal string "<...>".
        decode() returns the text, encode() builds a literal string.
    '''
    indirect = False

    # Replacement text for each escape sequence of a literal string.
    # A backslash followed by an end-of-line is a line continuation,
    # and a lone backslash (unknown escape) is simply dropped.
    unescape_dict = {'\\b':'\b', '\\f':'\f', '\\n':'\n',
                     '\\r':'\r', '\\t':'\t',
                     '\\\r\n': '', '\\\r':'', '\\\n':'',
                     '\\\\':'\\', '\\':'',
                    }
    # The escaped backslash must be tried first, otherwise its two
    # characters are consumed one at a time by the final alternative
    # (and "\\n" would decode to a newline). Octal escapes are one
    # to three digits in the range 0-7.
    unescape_pattern = r'(\\\\|\\b|\\f|\\n|\\r|\\t|\\\r\n|\\\r|\\\n|\\[0-7]{1,3}|\\)'
    unescape_func = re.compile(unescape_pattern).split

    hex_pattern = '([a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])'
    hex_func = re.compile(hex_pattern).split

    hex_pattern2 = '([a-fA-F0-9][a-fA-F0-9][a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])'
    hex_func2 = re.compile(hex_pattern2).split

    # Indexed by the `twobytes` flag of decode_hex.
    hex_funcs = hex_func, hex_func2

    def decode_regular(self, remap=chr):
        ''' Decode a literal "(...)" string. Octal escapes are
            converted with remap; values above 127 are clamped to 127.
        '''
        assert self[0] == '(' and self[-1] == ')'
        mylist = self.unescape_func(self[1:-1])
        result = []
        unescape = self.unescape_dict.get
        for chunk in mylist:
            chunk = unescape(chunk, chunk)
            # Anything still starting with a backslash and longer
            # than one character is an octal escape.
            if chunk.startswith('\\') and len(chunk) > 1:
                value = int(chunk[1:], 8)
                # FIXME: TODO: Handle unicode here
                if value > 127:
                    value = 127
                chunk = remap(value)
            if chunk:
                result.append(chunk)
        return ''.join(result)

    def decode_hex(self, remap=chr, twobytes=False):
        ''' Decode a hexadecimal "<...>" string, one character per
            two hex digits (four when twobytes is true).
        '''
        data = ''.join(self.split())
        data = self.hex_funcs[twobytes](data)
        # re.split with a group alternates separators and matches.
        chars = data[1::2]
        other = data[0::2]
        assert other[0] == '<' and other[-1] == '>' and ''.join(other) == '<>', self
        return ''.join([remap(int(x, 16)) for x in chars])

    def decode(self, remap=chr, twobytes=False):
        ''' Decode the string according to its leading delimiter. '''
        if self.startswith('('):
            return self.decode_regular(remap)

        else:
            return self.decode_hex(remap, twobytes)

    def encode(cls, source, usehex=False):
        ''' Build a literal PdfString from source, escaping
            backslashes and parentheses. Unicode input is encoded
            as UTF-8; hexadecimal output is not supported.
        '''
        assert not usehex, "Not supported yet"
        if isinstance(source, unicode):
            source = source.encode('utf-8')
        else:
            source = str(source)
        # Backslashes first, so the ones added below are not doubled.
        source = source.replace('\\', '\\\\')
        source = source.replace('(', '\\(')
        source = source.replace(')', '\\)')
        return cls('(' +source + ')')
    encode = classmethod(encode)
97
class PdfDict(dict):
    ''' A PDF dictionary, optionally with a stream.

        Keys are PDF names ('/Foo'). Entries are also reachable as
        attributes without the slash (d.Foo); a missing entry reads
        as None and assigning None deletes it. 'indirect', 'stream'
        and '_stream' are real attributes rather than entries;
        assigning 'stream' also updates the Length entry, while
        '_stream' sets the stream without touching Length.
    '''
    indirect = False
    stream = None

    # attribute name -> (name used in __dict__, update Length?)
    _special = {
        'indirect': ('indirect', False),
        'stream': ('stream', True),
        '_stream': ('stream', False),
    }

    def __setitem__(self, name, value):
        assert name.startswith('/'), name
        if value is None:
            # Storing None means removing the entry.
            if name in self:
                del self[name]
        else:
            dict.__setitem__(self, name, value)

    def __init__(self, *args, **kw):
        if args:
            if len(args) == 1:
                args = args[0]
            self.update(args)
            if isinstance(args, PdfDict):
                # Copying a PdfDict carries its stream and flag too.
                self.indirect = args.indirect
                self._stream = args.stream
        for attrname, attrvalue in kw.iteritems():
            setattr(self, attrname, attrvalue)

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails.
        return self.get(PdfName(name))

    def __setattr__(self, name, value):
        special = self._special.get(name)
        if special is None:
            self[PdfName(name)] = value
            return
        realname, setlen = special
        self.__dict__[realname] = value
        if not setlen:
            return
        if value is None:
            self.Length = None
        else:
            self.Length = PdfObject(len(value))

    def iteritems(self):
        ''' Iterate over (key, value) pairs, skipping None values. '''
        for key, value in dict.iteritems(self):
            if value is None:
                continue
            assert key.startswith('/'), (key, value)
            yield key, value

    def inheritable(self):
        ''' Return an object whose attributes (or items) are looked
            up on this dictionary and then, while missing, on each
            Parent in turn. The result is None if no ancestor
            has the entry.
        '''
        class Search(object):
            def __init__(self, basedict):
                self.basedict = basedict
            def __getattr__(self, name):
                return self[name]
            def __getitem__(self, name):
                seen = set()
                current = self.basedict
                while 1:
                    found = getattr(current, name)
                    if found is not None:
                        return found
                    # Guard against a cycle in the Parent chain.
                    marker = id(current)
                    assert marker not in seen
                    seen.add(marker)
                    current = current.Parent
                    if current is None:
                        return
        return Search(self)
    inheritable = property(inheritable)

    def private(self):
        ''' Return an object sharing this dictionary's instance
            attributes. Attributes set on it are stored on the
            instance instead of becoming PDF entries.
        '''
        class Private(object):
            pass

        holder = Private()
        holder.__dict__ = self.__dict__
        return holder
    private = property(private)
181
class IndirectPdfDict(PdfDict):
    ''' A PdfDict whose `indirect` flag defaults to True. '''
    indirect = True
diff --git a/lib/pdfrw/pdfreader.py b/lib/pdfrw/pdfreader.py
new file mode 100644
index 0000000..6f57bea
--- /dev/null
+++ b/lib/pdfrw/pdfreader.py
@@ -0,0 +1,213 @@
1# A part of pdfrw (pdfrw.googlecode.com)
2# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
3# MIT license -- See LICENSE.txt for details
4
5'''
6The PdfReader class reads an entire PDF file into memory and
7parses the top-level container objects. (It does not parse
8into streams.) The object subclasses PdfDict, and the
9document pages are stored in a list in the pages attribute
10of the object.
11'''
12
13from pdftokens import PdfTokens
14from pdfobjects import PdfDict, PdfArray, PdfName
15from pdfcompress import uncompress
16
class PdfReader(PdfDict):
    ''' Reads a whole PDF file and exposes its trailer dictionary.

        The instance itself holds the trailer entries; the pages are
        available as the list `pages`. Parsing state (file data,
        indirect object table, ...) is kept as private attributes,
        not as PDF entries.
    '''

    class unresolved:
        # Used as a placeholder until we have an object.
        pass

    def readindirect(self, objnum, gennum):
        ''' Read an indirect object.  If it has already
            been read, return it from the cache.
        '''

        def setobj(obj):
            # Store the new object in the dictionary
            # once we have its value
            record[1] = obj

        def ordinary(source, setobj, obj):
            # Deal with an ordinary (non-array, non-dict) object
            setobj(obj)
            return obj

        fdata, objnum, gennum = self.fdata, int(objnum), int(gennum)
        record = self.indirect_objects[fdata, objnum, gennum]
        if record[1] is not self.unresolved:
            return record[1]

        # Read the object header and validate it
        source = PdfTokens(fdata, record[0])
        objid = source.multiple(3)
        assert int(objid[0]) == objnum, objid
        assert int(objid[1]) == gennum, objid
        assert objid[2] == 'obj', objid

        # Read the object, and call special code if it starts
        # an array or dictionary
        obj = source.next()
        obj = self.special.get(obj, ordinary)(source, setobj, obj)
        self.readstream(obj, source)
        obj.indirect = True
        return obj

    def readstream(obj, source):
        ''' Read optional stream following a dictionary
            object.
        '''
        tok = source.next()
        if tok == 'endobj':
            return  # No stream

        assert isinstance(obj, PdfDict)
        assert tok == 'stream', tok
        fdata = source.fdata
        floc = fdata.rindex(tok, 0, source.floc) + len(tok)
        # The keyword is followed by LF or CR LF.
        ch = fdata[floc]
        if ch == '\r':
            floc += 1
            ch = fdata[floc]
        assert ch == '\n'
        startstream = floc + 1
        endstream = startstream + int(obj.Length)
        obj._stream = fdata[startstream:endstream]
        source = PdfTokens(fdata, endstream)
        endit = source.multiple(2)
        if endit != 'endstream endobj'.split():
            # /Length attribute is broken, try to read stream
            # anyway disregarding the specified value
            # TODO: issue warning here once we have some kind of
            # logging
            endstream = fdata.index('endstream', startstream)
            if fdata[endstream-2:endstream] == '\r\n':
                endstream -= 2
            elif fdata[endstream-1] in ['\n', '\r']:
                endstream -= 1
            source = PdfTokens(fdata, endstream)
            endit = source.multiple(2)
            assert endit == 'endstream endobj'.split()
            obj.Length = str(endstream-startstream)
            obj._stream = fdata[startstream:endstream]
    readstream = staticmethod(readstream)

    def readarray(self, source, setobj=lambda x:None, original=None):
        ''' Read an array whose opening '[' has been consumed. '''
        special = self.special
        result = PdfArray()
        setobj(result)

        for value in source:
            if value == ']':
                break
            if value in special:
                value = special[value](source)
            elif value == 'R':
                # "num gen R": the two previous items are the reference
                generation = result.pop()
                value = self.readindirect(result.pop(), generation)
            result.append(value)
        return result

    def readdict(self, source, setobj=lambda x:None, original=None):
        ''' Read a dictionary whose opening '<<' has been consumed. '''
        special = self.special
        result = PdfDict()
        setobj(result)

        tok = source.next()
        while tok != '>>':
            assert tok.startswith('/'), (tok, source.multiple(10))
            key = tok
            value = source.next()
            if value in special:
                value = special[value](source)
                tok = source.next()
            else:
                tok = source.next()
                if value.isdigit() and tok.isdigit():
                    # Consume the token outside the assert, so that
                    # running with -O does not skip it.
                    reftok = source.next()
                    assert reftok == 'R', reftok
                    value = self.readindirect(value, tok)
                    tok = source.next()
            result[key] = value

        return result

    def readxref(fdata):
        ''' Locate the last 'startxref' and return its position
            together with a tokenizer placed at the xref table.
        '''
        startloc = fdata.rindex('startxref')
        xrefinfo = list(PdfTokens(fdata, startloc, False))
        assert len(xrefinfo) == 3, xrefinfo
        assert xrefinfo[0] == 'startxref', xrefinfo[0]
        assert xrefinfo[1].isdigit(), xrefinfo[1]
        assert xrefinfo[2].rstrip() == '%%EOF', repr(xrefinfo[2])
        return startloc, PdfTokens(fdata, int(xrefinfo[1]))
    readxref = staticmethod(readxref)

    def parsexref(self, source):
        ''' Read the xref table up to the 'trailer' keyword and
            record the offset of every in-use object.
        '''
        tok = source.next()
        assert tok == 'xref', tok
        while 1:
            tok = source.next()
            if tok == 'trailer':
                break
            startobj = int(tok)
            for objnum in range(startobj, startobj + int(source.next())):
                offset = int(source.next())
                generation = int(source.next())
                if source.next() == 'n':
                    objid = self.fdata, objnum, generation
                    objval = [offset, self.unresolved]
                    self.indirect_objects.setdefault(objid, objval)

    pagename = PdfName.Page
    pagesname = PdfName.Pages

    def readpages(self, node):
        ''' Flatten the page tree below node into a list of pages. '''
        # PDFs can have arbitrarily nested Pages/Page
        # dictionary structures.
        if node.Type == self.pagename:
            return [node]
        assert node.Type == self.pagesname, node.Type
        result = []
        for node in node.Kids:
            result.extend(self.readpages(node))
        return result

    def __init__(self, fname=None, fdata=None, decompress=True):
        ''' Parse a PDF given as a file name, as an object with a
            read() method, or directly as data (fdata). Streams
            are uncompressed afterwards unless decompress is false.
        '''

        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                f = open(fname, 'rb')
                try:
                    fdata = f.read()
                finally:
                    # Close the file even if the read fails.
                    f.close()

        assert fdata is not None
        fdata = fdata.rstrip('\00')
        self.private.fdata = fdata

        self.private.indirect_objects = {}
        self.private.special = {'<<': self.readdict, '[': self.readarray}

        startloc, source = self.readxref(fdata)
        self.parsexref(source)
        # Tokens are consumed outside the asserts: under -O an
        # assert is not executed, and the parser would lose sync.
        tok = source.next()
        assert tok == '<<', tok
        self.update(self.readdict(source))
        tok = source.next()
        assert tok == 'startxref' and source.floc > startloc
        self.private.pages = self.readpages(self.Root.Pages)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        self.private.numPages = len(self.pages)


    # For compatibility with pyPdf
    def getPage(self, pagenum):
        ''' Return the page at index pagenum of the pages list. '''
        return self.pages[pagenum]

    def uncompress(self):
        ''' Uncompress the streams of the indirect objects. '''
        uncompress([x[1] for x in self.indirect_objects.itervalues()])
diff --git a/lib/pdfrw/pdftokens.py b/lib/pdfrw/pdftokens.py
new file mode 100644
index 0000000..04bd559
--- /dev/null
+++ b/lib/pdfrw/pdftokens.py
@@ -0,0 +1,249 @@
1# A part of pdfrw (pdfrw.googlecode.com)
2# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
3# MIT license -- See LICENSE.txt for details
4
5'''
6A tokenizer for PDF streams.
7
8In general, documentation used was "PDF reference",
9sixth edition, for PDF version 1.7, dated November 2006.
10
11'''
12
13from __future__ import generators
14
15try:
16 set
17except NameError:
18 from sets import Set as set
19
20import re
21from pdfobjects import PdfString, PdfObject
22
23class _PrimitiveTokens(object):
24
25 # Table 3.1, page 50 of reference, defines whitespace
26 whitespaceset = set('\x00\t\n\f\r ')
27
28
29 # Text on page 50 defines delimiter characters
30 delimiterset = set('()<>{}[]/%')
31
32 # Coalesce contiguous whitespace into a single token
33 whitespace_pattern = '[%s]+' % ''.join(whitespaceset)
34
35 # In addition to the delimiters, we also use '\', which
36 # is special in some contexts in PDF.
37 delimiter_pattern = '\\\\|\\' + '|\\'.join(delimiterset)
38
39 # Dictionary delimiters are '<<' and '>>'. Look for
40 # these before the single variety.
41 dictdelim_pattern = r'\<\<|\>\>'
42
43 pattern = '(%s|%s|%s)' % (whitespace_pattern,
44 dictdelim_pattern, delimiter_pattern)
45 re_func = re.compile(pattern).finditer
46 del whitespace_pattern, dictdelim_pattern
47 del delimiter_pattern, pattern
48
49 def __init__(self, fdata):
50
51 class MyIterator(object):
52 def next():
53 if not tokens:
54 startloc = self.startloc
55 for match in next_match[0]:
56 start = match.start()
57 end = match.end()
58 tappend(fdata[start:end])
59 if start > startloc:
60 tappend(fdata[startloc:start])
61 self.startloc = end
62 break
63 else:
64 s = fdata[startloc:]
65 self.startloc = len(fdata)
66 if s:
67 tappend(s)
68 if not tokens:
69 raise StopIteration
70 return tpop()
71 next = staticmethod(next)
72
73 self.fdata = fdata
74 self.tokens = tokens = []
75 self.iterator = iterator = MyIterator()
76 self.next = iterator.next
77 self.next_match = next_match = [None]
78 tappend = tokens.append
79 tpop = tokens.pop
80
81 def setstart(self, startloc):
82 self.startloc = startloc
83 self.next_match[0] = self.re_func(self.fdata, startloc)
84
85 def __iter__(self):
86 return self.iterator
87
88 def coalesce(self, result):
89 ''' This function coalesces tokens together up until
90 the next delimiter or whitespace.
91 All of the coalesced tokens will either be non-matches,
92 or will be a matched backslash. We distinguish the
93 non-matches by the fact that next() will have left
94 a following match inside self.tokens for the actual match.
95 '''
96 tokens = self.tokens
97 whitespace = self.whitespaceset
98
99 # Optimized path for usual case -- regular data (not a name string),
100 # with no escape character, and followed by whitespace.
101
102 if tokens:
103 token = tokens.pop()
104 if token != '\\':
105 if token[0] not in whitespace:
106 tokens.append(token)
107 return
108 result.append(token)
109
110 # Non-optimized path. Either start of a name string received,
111 # or we just had one escape.
112
113 for token in self:
114 if tokens:
115 result.append(token)
116 token = tokens.pop()
117 if token != '\\':
118 if token[0] not in whitespace:
119 tokens.append(token)
120 return
121 result.append(token)
122
123
124 def floc(self):
125 return self.startloc - sum([len(x) for x in self.tokens])
126
class PdfTokens(object):
    ''' Tokenizer for PDF data built on _PrimitiveTokens: strings,
        names and comments are assembled into single tokens and
        whitespace is dropped.
    '''

    def __init__(self, fdata, startloc=0, strip_comments=True):

        def comment(token):
            # Collect everything up to and including the whitespace
            # that holds the end of line.
            pieces = [token]
            for piece in primitive:
                pieces.append(piece)
                if piece[0] in whitespaceset and ('\n' in piece or '\r' in piece):
                    break
            return not strip_comments and ''.join(pieces)

        def single(token):
            return token

        def regular_string(token):
            def escaped():
                # An odd number of backslashes right before the
                # last piece means that piece is escaped.
                flag = False
                idx = -2
                while pieces[idx] == '\\':
                    flag = not flag
                    idx -= 1
                return flag

            pieces = [token]
            depth = 1
            for piece in primitive:
                pieces.append(piece)
                if piece in '()' and not escaped():
                    if piece == '(':
                        depth += 1
                    else:
                        depth -= 1
                    if not depth:
                        break
            else:
                assert 0, "Unexpected end of token stream"
            return PdfString(''.join(pieces))

        def hex_string(token):
            pieces = [token]
            for piece in primitive:
                pieces.append(piece)
                if piece == '>':
                    break
            while pieces[-2] == '>>':
                pieces.append(pieces.pop(-2))
            return PdfString(''.join(pieces))

        def normal_data(token):

            # Obscure optimization -- we can get here with
            # whitespace or regular character data.  If we get
            # here with whitespace, then there won't be an additional
            # token queued up in the primitive object, otherwise there
            # will...
            if primitive_tokens:      #if token[0] not in whitespaceset:
                pieces = [token]
                primitive.coalesce(pieces)
                return PdfObject(''.join(pieces))

        def name_string(token):
            pieces = [token]
            primitive.coalesce(pieces)
            name = ''.join(pieces)
            if '#' in name:
                # Each '#' is followed by two hex digits giving
                # one character.
                parts = name.split('#')
                parts.reverse()
                pieces = [parts.pop()]
                while parts:
                    part = parts.pop()
                    pieces.append(chr(int(part[:2], 16)))
                    pieces.append(part[2:])
                name = ''.join(pieces)
            return PdfObject(name)

        def broken(token):
            assert 0, token

        # Handler for each delimiter; anything else is normal data.
        dispatch = {
            '(': regular_string,
            ')': broken,
            '<': hex_string,
            '>': broken,
            '[': single,
            ']': single,
            '{': single,
            '}': single,
            '/': name_string,
            '%' : comment,
            '<<': single,
            '>>': single,
        }.get

        class MyIterator(object):
            def next():
                while not tokens:
                    token = primitive_next()
                    token = dispatch(token, normal_data)(token)
                    # Whitespace and stripped comments give a false
                    # value and are skipped.
                    if token:
                        return token
                return tokens.pop()
            next = staticmethod(next)

        self.primitive = primitive = _PrimitiveTokens(fdata)
        self.setstart = primitive.setstart
        primitive.setstart(startloc)
        self.fdata = fdata
        self.strip_comments = strip_comments
        self.tokens = tokens = []
        self.iterator = iterator = MyIterator()
        self.next = iterator.next
        # Names below are used by the closures defined above.
        primitive_next = primitive.next
        primitive_tokens = primitive.tokens
        whitespaceset = _PrimitiveTokens.whitespaceset

    def floc(self):
        ''' Offset in the data of the next token to be returned. '''
        queued = sum([len(x) for x in self.tokens])
        return self.primitive.floc() - queued
    floc = property(floc)

    def __iter__(self):
        return self.iterator

    def multiple(self, count):
        ''' Return the next count tokens as a list. '''
        fetch = self.next
        return [fetch() for _ in range(count)]
diff --git a/lib/pdfrw/pdfwriter.py b/lib/pdfrw/pdfwriter.py
new file mode 100755
index 0000000..c193843
--- /dev/null
+++ b/lib/pdfrw/pdfwriter.py
@@ -0,0 +1,234 @@
1#!/usr/bin/env python
2
3# A part of pdfrw (pdfrw.googlecode.com)
4# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
5# MIT license -- See LICENSE.txt for details
6
7'''
8The PdfWriter class writes an entire PDF file out to disk.
9
10The writing process is not at all optimized or organized.
11
12An instance of the PdfWriter class has two methods:
13 addpage(page)
14and
15 write(fname)
16
17addpage() assumes that the pages are part of a valid
18tree/forest of PDF objects.
19'''
20
21try:
22 set
23except NameError:
24 from sets import Set as set
25
26from pdfobjects import PdfName, PdfArray, PdfDict, IndirectPdfDict, PdfObject, PdfString
27from pdfcompress import compress
28
29debug = False
30
class FormatObjects(object):
    ''' Turns a tree of PDF objects into text and writes the
        finished file.  Use the dump() classmethod; it creates
        its own instance to hold the per-write state.
    '''

    def add(self, obj, visited):
        ''' Return the text that stands for obj inside its parent.

            A direct object is formatted in place.  An indirect
            object is formatted once, stored in self.objlist, and
            represented by an 'N 0 R' reference.
        '''
        # Dicts are not hashable, so objects are tracked by id().
        key = id(obj)

        # A dictionary that carries a stream is always treated
        # as indirect.
        if isinstance(obj, PdfDict):
            is_indirect = obj.indirect or (obj.stream is not None)
        else:
            is_indirect = getattr(obj, 'indirect', False)

        if is_indirect:
            number = self.indirect_dict.get(key)
            if number is None:
                # First sighting: claim the next object number and
                # record it before formatting, so that references
                # back to this object resolve instead of recursing.
                pending = self.objlist
                number = len(pending) + 1
                if debug:
                    # Trailing comma: no newline from the Python 2
                    # print statement.
                    print(' Object %s \r' % number),
                pending.append(None)
                self.indirect_dict[key] = number
                pending[number - 1] = self.format_obj(obj)
            return '%s 0 R' % number

        # Direct objects must not contain themselves.
        assert key not in visited, \
            'Circular reference encountered in non-indirect object %s' % repr(obj)
        visited.add(key)
        formatted = self.format_obj(obj, visited)
        visited.remove(key)
        return formatted

    def format_array(myarray, formatter):
        ''' Join already formatted strings with spaces and wrap
            them with 'formatter' (a '%s' template).  If the
            combined length exceeds 70 the items are spread over
            several lines.
        '''
        total = 0
        for item in myarray:
            total += len(item)
        if total <= 70:
            return formatter % ' '.join(myarray)

        rows = []
        # Start "over full" so the first item opens the first row.
        used = 1000000
        for item in myarray:
            width = len(item)
            if used + width > 70:
                row = []
                rows.append(row)
                used = 0
            used += width + 1
            row.append(item)
        return formatter % '\n '.join([' '.join(line) for line in rows])
    format_array = staticmethod(format_array)

    def format_obj(self, obj, visited=None):
        ''' Return obj formatted as semi-readable ASCII.

            Recurses through add() for array elements and
            dictionary values; add() returns references for
            indirect objects and queues them for output.
        '''
        if visited is None:
            visited = set()

        if isinstance(obj, PdfArray):
            elements = [self.add(x, visited) for x in obj]
            return self.format_array(elements, '[%s]')

        if isinstance(obj, PdfDict):
            if self.compress and obj.stream:
                compress([obj])
            # Jython 2.2.1 segfaults when sorting subclassed
            # strings, so sort plain str copies of the keys.
            names = [str(k) for k in obj.iterkeys()]
            names.sort()
            pairs = []
            for name in names:
                pairs.append(name)
                pairs.append(self.add(obj[name], visited))
            text = self.format_array(pairs, '<<%s>>')
            data = obj.stream
            if data is None:
                return text
            return '%s\nstream\n%s\nendstream' % (text, data)

        if isinstance(obj, basestring) and not hasattr(obj, 'indirect'):
            return PdfString.encode(obj)

        return str(obj)

    def dump(cls, f, trailer, version='1.3', compress=True):
        ''' Format everything reachable from trailer and write the
            complete file to the file object f.

            Sets trailer.Size as a side effect.
        '''
        writer = cls()
        writer.compress = compress
        writer.indirect_dict = {}
        writer.objlist = []

        # First pass collects every indirect object; the trailer
        # text it returns is discarded because Size is not known
        # yet.
        writer.format_obj(trailer)
        # Now the object count is known: record it and format the
        # trailer again to get the final text.
        trailer.Size = PdfObject(len(writer.objlist) + 1)
        trailer_text = writer.format_obj(trailer)

        # Write the pieces while tracking how much has been
        # written, since the cross-reference table needs the
        # offset of every object.
        header = '%%PDF-%s\n%%\xe2\xe3\xcf\xd3\n' % version
        f.write(header)
        position = len(header)
        xref = [(0, 65535, 'f')]

        number = 0
        for body in writer.objlist:
            number += 1
            chunk = '%s 0 obj\n%s\nendobj\n' % (number, body)
            xref.append((position, 0, 'n'))
            f.write(chunk)
            position += len(chunk)

        f.write('xref\n0 %s\n' % len(xref))
        for entry in xref:
            f.write('%010d %05d %s\r\n' % entry)
        f.write('trailer\n\n%s\nstartxref\n%s\n%%%%EOF\n' % (trailer_text, position))
    dump = classmethod(dump)
153
class PdfWriter(object):
    ''' Collects pages and writes them out as a PDF file.

        Typical use: create a PdfWriter, call addpage() or
        addpages(), then call write().  The trailer dictionary is
        built on demand from the collected pages and can also be
        replaced through the 'trailer' property.
    '''

    # Cached trailer dictionary.  None means "not built yet";
    # addpage() resets it so the next access rebuilds it.
    _trailer = None

    def __init__(self, version='1.3', compress=True):
        ''' version  -- PDF version string passed on to the dump.
            compress -- passed on to FormatObjects.dump().
        '''
        self.pagearray = PdfArray()
        self.compress = compress
        self.version = version

    def addpage(self, page):
        ''' Append one page and return self so calls can be chained.

            page.Type must be PdfName.Page.  The page is wrapped in
            a new IndirectPdfDict, with Resources, MediaBox, CropBox
            and Rotate taken from page.inheritable.
        '''
        self._trailer = None
        assert page.Type == PdfName.Page
        inheritable = page.inheritable # searches for resources
        self.pagearray.append(
            IndirectPdfDict(
                page,
                Resources = inheritable.Resources,
                MediaBox = inheritable.MediaBox,
                CropBox = inheritable.CropBox,
                Rotate = inheritable.Rotate,
            )
        )
        return self

    addPage = addpage # for compatibility with pyPdf

    def addpages(self, pagelist):
        ''' Append every page in pagelist, in order; return self. '''
        for page in pagelist:
            self.addpage(page)
        return self

    def _get_trailer(self):
        ''' Return the trailer, building and caching it if needed. '''
        trailer = self._trailer
        if trailer is not None:
            return trailer

        # Create the basic object structure of the PDF file
        trailer = PdfDict(
            Root = IndirectPdfDict(
                Type = PdfName.Catalog,
                Pages = IndirectPdfDict(
                    Type = PdfName.Pages,
                    Count = PdfObject(len(self.pagearray)),
                    Kids = self.pagearray
                )
            )
        )
        # Make all the pages point back to the page dictionary
        pagedict = trailer.Root.Pages
        for page in pagedict.Kids:
            page.Parent = pagedict
        self._trailer = trailer
        return trailer

    def _set_trailer(self, trailer):
        ''' Use the given trailer instead of a generated one. '''
        self._trailer = trailer

    trailer = property(_get_trailer, _set_trailer)

    def write(self, fname, trailer=None):
        ''' Write the PDF.

            fname   -- a file name, or any object with a write()
                       method.  Such an object is written to and
                       left open; a file opened here is always
                       closed, even if the dump fails.
            trailer -- optional trailer to dump instead of
                       self.trailer.
        '''
        trailer = trailer or self.trailer

        # An object with write() is used as-is, whatever its truth
        # value, and is left for the caller to close.
        if hasattr(fname, 'write'):
            FormatObjects.dump(fname, trailer, self.version, self.compress)
            return

        f = open(fname, 'wb')
        try:
            FormatObjects.dump(f, trailer, self.version, self.compress)
        finally:
            f.close()
223
if __name__ == '__main__':
    # Demo: copy every page of source.pdf into result.pdf,
    # showing progress on a single line.
    debug = True
    import pdfreader
    source = pdfreader.PdfReader('source.pdf')
    writer = PdfWriter()
    pageno = 0
    for page in source.pages:
        pageno += 1
        # Trailing comma: no newline from the Python 2 print
        # statement.
        print(' Adding page %s \r' % pageno),
        writer.addpage(page)
    print
    writer.write('result.pdf')
    print
diff --git a/lib/pdfrw/toreportlab.py b/lib/pdfrw/toreportlab.py
new file mode 100644
index 0000000..00ad324
--- /dev/null
+++ b/lib/pdfrw/toreportlab.py
@@ -0,0 +1,139 @@
1# A part of pdfrw (pdfrw.googlecode.com)
2# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
3# MIT license -- See LICENSE.txt for details
4
5'''
6Converts pdfrw objects into reportlab objects.
7
8Designed for and tested with rl 2.3.
9
10Knows too much about reportlab internals.
11What can you do?
12
13The interface to this function is through the makerl() function.
14
15Parameters:
16 canv - a reportlab "canvas" (also accepts a "document")
17 pdfobj - a pdfrw PDF object
18
19Returns:
20 A corresponding reportlab object, or if the
21 object is a PDF Form XObject, the name to
22 use with reportlab for the object.
23
24 Will recursively convert all necessary objects.
25 Be careful when converting a page -- if /Parent is set,
26 will recursively convert all pages!
27
28Notes:
29 1) Original objects are annotated with a
30 derived_rl_obj attribute which points to the
31 reportlab object. This keeps multiple reportlab
32 objects from being generated for the same pdfobj
33 via repeated calls to makerl. This is great for
34 not putting too many objects into the
35 new PDF, but not so good if you are modifying
36 objects for different pages. Then you
37 need to do your own deep copying (of circular
38 structures). You're on your own.
39
40 2) ReportLab seems weird about FormXObjects.
41 They pass around a partial name instead of the
42 object or a reference to it. So we have to
43 reach into reportlab and get a number for
44 a unique name. I guess this is to make it
45 where you can combine page streams with
46 impunity, but that's just a guess.
47
48 3) Updated 1/23/2010 to handle multipass documents
49 (e.g. with a table of contents). These have
50 a different doc object on every pass.
51
52'''
53
54from reportlab.pdfbase import pdfdoc as rldocmodule
55from pdfobjects import PdfDict, PdfArray, PdfName
56
57RLStream = rldocmodule.PDFStream
58RLDict = rldocmodule.PDFDictionary
59RLArray = rldocmodule.PDFArray
60
61
def _makedict(rldoc, pdfobj):
    ''' Convert a pdfrw dictionary into a reportlab dictionary.
        Returns a reference to it when pdfobj is indirect,
        otherwise the dictionary itself.
    '''
    target = RLDict()
    if pdfobj.indirect:
        target.__RefOnly__ = 1
        result = rldoc.Reference(target)
    else:
        result = target
    # Record the result before converting the values, so the
    # recursion finds it instead of converting pdfobj again.
    pdfobj.derived_rl_obj[rldoc] = (result, None)

    for name, value in pdfobj.iteritems():
        # name[1:] drops the first character of the pdfrw key.
        target[name[1:]] = makerl_recurse(rldoc, value)

    return result
73
def _makestream(rldoc, pdfobj, xobjtype=PdfName.XObject):
    ''' Convert a pdfrw stream dictionary into a reportlab stream
        and return a reference to it.  When pdfobj.Type equals
        xobjtype, a 'pdfrw_N' short name is also generated and
        stored with the cached result.
    '''
    fields = RLDict()
    stream = RLStream(fields, pdfobj.stream)

    shortname = fullname = None
    if pdfobj.Type == xobjtype:
        shortname = 'pdfrw_%s' % (rldoc.objectcounter+1)
        fullname = rldoc.getXObjectName(shortname)

    reference = rldoc.Reference(stream, fullname)
    # Record the result before converting the values, so the
    # recursion finds it instead of converting pdfobj again.
    pdfobj.derived_rl_obj[rldoc] = (reference, shortname)

    for name, value in pdfobj.iteritems():
        # name[1:] drops the first character of the pdfrw key.
        fields[name[1:]] = makerl_recurse(rldoc, value)

    return reference
90
def _makearray(rldoc, pdfobj):
    ''' Convert a pdfrw array into a reportlab array.  Returns a
        reference to it when pdfobj is indirect, otherwise the
        array itself.
    '''
    target = RLArray([])
    if pdfobj.indirect:
        target.__RefOnly__ = 1
        result = rldoc.Reference(target)
    else:
        result = target
    # Record the result before converting the elements, so the
    # recursion finds it instead of converting pdfobj again.
    pdfobj.derived_rl_obj[rldoc] = (result, None)

    append = target.sequence.append
    for element in pdfobj:
        append(makerl_recurse(rldoc, element))

    return result
103
104def _makestr(rldoc, pdfobj):
105 assert isinstance(pdfobj, (float, int, str)), repr(pdfobj)
106 return pdfobj
107
def makerl_recurse(rldoc, pdfobj):
    ''' Return the reportlab object for pdfobj within rldoc,
        converting it on first use and reusing the cached result
        afterwards.
    '''
    cache = getattr(pdfobj, 'derived_rl_obj', None)
    if cache is not None:
        hit = cache.get(rldoc)
        if hit is not None:
            return hit[0]
    uncached = cache is None

    if isinstance(pdfobj, PdfDict):
        has_stream = pdfobj.stream is not None
        if uncached:
            # For dictionaries the cache is created through the
            # object's 'private' attribute.
            pdfobj.private.derived_rl_obj = {}
        if has_stream:
            return _makestream(rldoc, pdfobj)
        return _makedict(rldoc, pdfobj)

    if isinstance(pdfobj, PdfArray):
        if uncached:
            pdfobj.derived_rl_obj = {}
        return _makearray(rldoc, pdfobj)

    return _makestr(rldoc, pdfobj)
128
def makerl(canv, pdfobj):
    ''' Convert pdfobj for use with canv.

        canv may be an object with a _doc attribute, or the
        document itself.  Returns the short name recorded for
        pdfobj if there is one, otherwise the converted object.
    '''
    # Without a _doc attribute, canv itself is the document.
    rldoc = getattr(canv, '_doc', canv)
    converted = makerl_recurse(rldoc, pdfobj)
    try:
        shortname = pdfobj.derived_rl_obj[rldoc][1]
    except AttributeError:
        shortname = None
    if shortname:
        return shortname
    return converted
diff --git a/lib/sounds.py b/lib/sounds.py
new file mode 100644
index 0000000..a4bf5b6
--- /dev/null
+++ b/lib/sounds.py
@@ -0,0 +1 @@
import parser