setup.py now works !

author: jvoisin 2011-08-16 18:11:24 +0200
committer: jvoisin 2011-08-16 18:11:24 +0200
commit: 4bd3e47da02fde08acfada1795cc55170abdb00a (patch)
tree: f8c7aa5fd5e1b07a28b350c5ded8125ef2467c51 /lib/office.py
parent: baf8e080125614326ba9c96ca8f2404fd12b050e (diff)
1 files changed, 0 insertions, 257 deletions
diff --git a/lib/office.py b/lib/office.py
deleted file mode 100644
index 33af48e..0000000
--- a/lib/office.py
+++ /dev/null
@@ -1,257 +0,0 @@
-'''
-    Care about office's formats
-'''
-import os
-import logging
-import zipfile
-import fileinput
-try:
-    import cairo
-    import poppler
-except ImportError:
-    pass
-import mat
-import parser
-import archive
-import pdfrw
-class OpenDocumentStripper(archive.GenericArchiveStripper):
-    '''
-        Strip the metadata of opendocument files.
-
-        An opendocument file is a zip archive containing xml files.
-        The one that interests us is meta.xml
-    '''
-
-    def get_meta(self):
-        '''
-            Return a dict with the metadata of the file.
-
-            The dict maps self.filename to a marker string when the
-            archive contains a meta.xml member, and is empty otherwise.
-        '''
-        metadata = {}
-        zipin = zipfile.ZipFile(self.filename, 'r')
-        try:
-            # getinfo() raises KeyError when the member is missing; the
-            # content of meta.xml is not used, so it is not read
-            zipin.getinfo('meta.xml')
-            # NOTE(review): 'harful' looks like a typo for 'harmful';
-            # kept as-is since callers may display or compare the value
-            metadata[self.filename] = 'harful meta'
-        except KeyError:  # no meta.xml file found
-            logging.debug('%s has no opendocument metadata' % self.filename)
-        finally:
-            zipin.close()  # also released when meta.xml is missing
-        return metadata
-
-    def _remove_all(self, method):
-        '''
-            Write a copy of the archive, without meta.xml, in self.output,
-            then call self.do_backup().
-
-            method: when equal to 'normal', remove_all() is called on
-            every embedded file; otherwise remove_all_ugly() is called.
-
-            FIXME ?
-            There is a patch implementing the Zipfile.remove()
-            method here : http://bugs.python.org/issue6818
-        '''
-        zipin = zipfile.ZipFile(self.filename, 'r')
-        zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
-        try:
-            for item in zipin.namelist():
-                name = os.path.join(self.tempdir, item)
-                _, ext = os.path.splitext(name)
-                if item.endswith('manifest.xml'):
-                    # contains the list of all files present in the archive
-                    zipin.extract(item, self.tempdir)
-                    try:
-                        # inplace=1 redirects print into the extracted
-                        # file : it is rewritten, stripped, without the
-                        # lines which contain "meta.xml"
-                        for line in fileinput.input(name, inplace=1):
-                            line = line.strip()
-                            if 'meta.xml' not in line:
-                                print(line)
-                    finally:
-                        # restore stdout even if the rewriting failed
-                        fileinput.close()
-                    zipout.write(name, item)
-                elif ext in parser.NOMETA or item == 'mimetype':
-                    # keep NOMETA files, and the "mimetype" file
-                    if item != 'meta.xml':  # contains the metadata
-                        zipin.extract(item, self.tempdir)
-                        zipout.write(name, item)
-                else:
-                    zipin.extract(item, self.tempdir)
-                    if os.path.isfile(name):  # don't care about folders
-                        try:
-                            cfile = mat.create_class_file(name, False,
-                                self.add2archive)
-                            if method == 'normal':
-                                cfile.remove_all()
-                            else:
-                                cfile.remove_all_ugly()
-                            logging.debug('Processing %s from %s' % (item,
-                                self.filename))
-                            zipout.write(name, item)
-                        except Exception:
-                            # any failure is treated as "unsupported", but
-                            # KeyboardInterrupt/SystemExit now propagate
-                            logging.info('%s\' fileformat is not supported'
-                                % item)
-                            if self.add2archive:
-                                zipout.write(name, item)
-            zipout.comment = ''
-            logging.info('%s treated' % self.filename)
-        finally:
-            # release both archives, even when a member raised
-            zipin.close()
-            zipout.close()
-        self.do_backup()
-
-    def is_clean(self):
-        '''
-            Check if the file is clean from harmful metadata.
-
-            Return True only when the archive has no meta.xml member and
-            archive.ZipStripper also reports it as clean.
-        '''
-        zipin = zipfile.ZipFile(self.filename, 'r')
-        try:
-            zipin.getinfo('meta.xml')
-            has_meta = True
-        except KeyError:  # no meta.xml in the file
-            has_meta = False
-        finally:
-            zipin.close()
-        if has_meta:
-            return False
-        czf = archive.ZipStripper(self.filename, self.parser,
-            'application/zip', self.backup, self.add2archive)
-        return bool(czf.is_clean())
-class PdfStripper(parser.GenericParser):
-    '''
-        Represent a pdf file, opened with poppler
-    '''
-
-    def __init__(self, filename, parser, mime, backup, add2archive):
-        '''
-            Forward the arguments to the parent class, then open the
-            document with poppler, without password.
-
-            The "parser" argument shadows the parser module inside
-            this method only.
-        '''
-        super(PdfStripper, self).__init__(filename, parser, mime, backup,
-            add2archive)
-        uri = 'file://' + os.path.abspath(self.filename)
-        self.password = None
-        self.document = poppler.document_new_from_file(uri, self.password)
-        # document properties checked by is_clean() and get_meta()
-        self.meta_list = ('title', 'author', 'subject', 'keywords', 'creator',
-            'producer', 'creation-date', 'mod-date', 'metadata')
-
-    @staticmethod
-    def _property_is_harmful(key, value):
-        '''
-            Tell if the value of the property named key is a metadata.
-            Dates are harmless when set to -1, the other properties
-            when they are None or empty.
-        '''
-        if key in ('creation-date', 'mod-date'):
-            return value != -1
-        return value is not None and value != ''
-
-    def is_clean(self):
-        '''
-            Check if the file is clean from harmful metadata.
-            Return False as soon as one property of self.meta_list
-            holds a value, True otherwise.
-        '''
-        for key in self.meta_list:
-            value = self.document.get_property(key)
-            if self._property_is_harmful(key, value):
-                return False
-        return True
-
-    def remove_all(self):
-        '''
-            Opening the pdf with poppler, then doing a render
-            on a cairo pdfsurface for each pages.
-            The result is written in self.output, then
-            self.do_backup() is called.
-            Thanks to Lunar^for the idea.
-            http://cairographics.org/documentation/pycairo/2/
-            python-poppler is not documented at all : have fun ;)
-        '''
-        # the surface needs a size at creation : use the first page's one
-        page = self.document.get_page(0)
-        page_width, page_height = page.get_size()
-        surface = cairo.PDFSurface(self.output, page_width, page_height)
-        context = cairo.Context(surface)  # context draws on the surface
-        logging.debug('Pdf rendering of %s' % self.filename)
-        for pagenum in xrange(self.document.get_n_pages()):
-            page = self.document.get_page(pagenum)
-            # pages can have different sizes : resize the surface before
-            # drawing, instead of reusing the size of the first page
-            page_width, page_height = page.get_size()
-            surface.set_size(page_width, page_height)
-            page.render(context)  # render the page on context
-            context.show_page()  # draw context on surface
-        surface.finish()
-        #For now, poppler cannot write meta, so we must use pdfrw
-        logging.debug('Removing %s\'s superficial metadata' % self.filename)
-        trailer = pdfrw.PdfReader(self.output)
-        trailer.Info.Producer = trailer.Info.Creator = None
-        writer = pdfrw.PdfWriter()
-        writer.trailer = trailer
-        writer.write(self.output)
-        self.do_backup()
-
-    def get_meta(self):
-        '''
-            Return a dict with all the meta of the file : the
-            properties of self.meta_list which hold a value,
-            mapped to this value.
-        '''
-        metadata = {}
-        for key in self.meta_list:
-            value = self.document.get_property(key)
-            if self._property_is_harmful(key, value):
-                metadata[key] = value
-        return metadata
-class OpenXmlStripper(archive.GenericArchiveStripper):
-    '''
-        Represent an office openxml document, which is like
-        an opendocument format, with some tricky stuff added.
-        It contains mostly xml, but can have media blobs, crap, ...
-        (I don't like this format.)
-        The members under docProps/ are treated as the metadata.
-    '''
-
-    def _remove_all(self, method):
-        '''
-            Write a copy of the archive, without the docProps/ members,
-            in self.output, then call self.do_backup().
-
-            method: when equal to 'normal', remove_all() is called on
-            every embedded file; otherwise remove_all_ugly() is called.
-
-            FIXME ?
-            There is a patch implementing the Zipfile.remove()
-            method here : http://bugs.python.org/issue6818
-        '''
-        zipin = zipfile.ZipFile(self.filename, 'r')
-        zipout = zipfile.ZipFile(self.output, 'w',
-            allowZip64=True)
-        try:
-            for item in zipin.namelist():
-                if item.startswith('docProps/'):  # metadata
-                    continue  # not copied in the output archive
-                name = os.path.join(self.tempdir, item)
-                _, ext = os.path.splitext(name)
-                # NOTE(review): namelist() gives full member paths, so
-                # the comparison with '.rels' presumably never matches
-                # members like "_rels/.rels" - confirm before changing
-                if ext in parser.NOMETA or item == '.rels':
-                    # keep parser.NOMETA files, and the file named ".rels"
-                    zipin.extract(item, self.tempdir)
-                    zipout.write(name, item)
-                    continue
-                zipin.extract(item, self.tempdir)
-                if not os.path.isfile(name):  # don't care about folders
-                    continue
-                try:
-                    cfile = mat.create_class_file(name, False,
-                        self.add2archive)
-                    if method == 'normal':
-                        cfile.remove_all()
-                    else:
-                        cfile.remove_all_ugly()
-                    logging.debug('Processing %s from %s' % (item,
-                        self.filename))
-                    zipout.write(name, item)
-                except Exception:
-                    # any failure is treated as "unsupported", but
-                    # KeyboardInterrupt/SystemExit now propagate
-                    logging.info('%s\' fileformat is not supported' % item)
-                    if self.add2archive:
-                        zipout.write(name, item)
-            zipout.comment = ''
-            logging.info('%s treated' % self.filename)
-        finally:
-            # release both archives, even when a member raised
-            zipin.close()
-            zipout.close()
-        self.do_backup()
-
-    def is_clean(self):
-        '''
-            Check if the file is clean from harmful metadata.
-
-            Return True only when the archive has no member under
-            docProps/ and archive.ZipStripper also reports it as clean.
-        '''
-        zipin = zipfile.ZipFile(self.filename, 'r')
-        try:
-            has_props = any(item.startswith('docProps/')
-                for item in zipin.namelist())
-        finally:
-            zipin.close()  # also released when metadata are found
-        if has_props:
-            return False
-        czf = archive.ZipStripper(self.filename, self.parser,
-                'application/zip', self.backup, self.add2archive)
-        return bool(czf.is_clean())
-
-    def get_meta(self):
-        '''
-            Return a dict with all the meta of the file : every
-            member under docProps/ is mapped to a marker string.
-        '''
-        metadata = {}
-        zipin = zipfile.ZipFile(self.filename, 'r')
-        try:
-            for item in zipin.namelist():
-                if item.startswith('docProps/'):
-                    metadata[item] = 'harmful content'
-        finally:
-            zipin.close()
-        return metadata
author	jvoisin	2011-08-16 18:11:24 +0200
committer	jvoisin	2011-08-16 18:11:24 +0200
commit	4bd3e47da02fde08acfada1795cc55170abdb00a (patch)
tree	f8c7aa5fd5e1b07a28b350c5ded8125ef2467c51 /lib/office.py
parent	baf8e080125614326ba9c96ca8f2404fd12b050e (diff)

diff --git a/lib/office.py b/lib/office.py deleted file mode 100644 index 33af48e..0000000 --- a/lib/office.py +++ /dev/null
@@ -1,257 +0,0 @@
1	'''
2	Care about office's formats
3	'''
4
5	import os
6	import logging
7	import zipfile
8	import fileinput
9
10	try:
11	import cairo
12	import poppler
13	except ImportError:
14	pass
15
16	import mat
17	import parser
18	import archive
19	import pdfrw
20
21
22	class OpenDocumentStripper(archive.GenericArchiveStripper):
23	'''
24	An open document file is a zip, with xml file into.
25	The one that interest us is meta.xml
26	'''
27
28	def get_meta(self):
29	'''
30	Return a dict with all the meta of the file by
31	trying to read the meta.xml file.
32	'''
33	zipin = zipfile.ZipFile(self.filename, 'r')
34	metadata = {}
35	try:
36	content = zipin.read('meta.xml')
37	zipin.close()
38	metadata[self.filename] = 'harful meta'
39	except KeyError: # no meta.xml file found
40	logging.debug('%s has no opendocument metadata' % self.filename)
41	return metadata
42
43	def _remove_all(self, method):
44	'''
45	FIXME ?
46	There is a patch implementing the Zipfile.remove()
47	method here : http://bugs.python.org/issue6818
48	'''
49	zipin = zipfile.ZipFile(self.filename, 'r')
50	zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
51
52	for item in zipin.namelist():
53	name = os.path.join(self.tempdir, item)
54	_, ext = os.path.splitext(name)
55
56	if item.endswith('manifest.xml'):
57	# contain the list of all files present in the archive
58	zipin.extract(item, self.tempdir)
59	for line in fileinput.input(name, inplace=1):
60	#remove the line which contains "meta.xml"
61	line = line.strip()
62	if not 'meta.xml' in line:
63	print line
64	zipout.write(name, item)
65
66	elif ext in parser.NOMETA or item == 'mimetype':
67	#keep NOMETA files, and the "manifest" file
68	if item != 'meta.xml': # contains the metadata
69	zipin.extract(item, self.tempdir)
70	zipout.write(name, item)
71
72	else:
73	zipin.extract(item, self.tempdir)
74	if os.path.isfile(name):
75	try:
76	cfile = mat.create_class_file(name, False,
77	self.add2archive)
78	if method == 'normal':
79	cfile.remove_all()
80	else:
81	cfile.remove_all_ugly()
82	logging.debug('Processing %s from %s' % (item,
83	self.filename))
84	zipout.write(name, item)
85	except:
86	logging.info('%s\' fileformat is not supported' % item)
87	if self.add2archive:
88	zipout.write(name, item)
89	zipout.comment = ''
90	logging.info('%s treated' % self.filename)
91	zipin.close()
92	zipout.close()
93	self.do_backup()
94
95	def is_clean(self):
96	'''
97	Check if the file is clean from harmful metadatas
98	'''
99	zipin = zipfile.ZipFile(self.filename, 'r')
100	try:
101	zipin.getinfo('meta.xml')
102	except KeyError: # no meta.xml in the file
103	czf = archive.ZipStripper(self.filename, self.parser,
104	'application/zip', self.backup, self.add2archive)
105	if czf.is_clean():
106	zipin.close()
107	return True
108	zipin.close()
109	return False
110
111
112	class PdfStripper(parser.GenericParser):
113	'''
114	Represent a pdf file
115	'''
116	def __init__(self, filename, parser, mime, backup, add2archive):
117	super(PdfStripper, self).__init__(filename, parser, mime, backup,
118	add2archive)
119	uri = 'file://' + os.path.abspath(self.filename)
120	self.password = None
121	self.document = poppler.document_new_from_file(uri, self.password)
122	self.meta_list = ('title', 'author', 'subject', 'keywords', 'creator',
123	'producer', 'creation-date', 'mod-date', 'metadata')
124
125	def is_clean(self):
126	'''
127	Check if the file is clean from harmful metadatas
128	'''
129	for key in self.meta_list:
130	if key == 'creation-date' or key == 'mod-date':
131	if self.document.get_property(key) != -1:
132	return False
133	elif self.document.get_property(key) is not None and \
134	self.document.get_property(key) != '':
135	return False
136	return True
137
138	def remove_all(self):
139	'''
140	Opening the pdf with poppler, then doing a render
141	on a cairo pdfsurface for each pages.
142	Thanks to Lunar^for the idea.
143	http://cairographics.org/documentation/pycairo/2/
144	python-poppler is not documented at all : have fun ;)
145	'''
146	page = self.document.get_page(0)
147	page_width, page_height = page.get_size()
148	surface = cairo.PDFSurface(self.output, page_width, page_height)
149	context = cairo.Context(surface) # context draws on the surface
150	logging.debug('Pdf rendering of %s' % self.filename)
151	for pagenum in xrange(self.document.get_n_pages()):
152	page = self.document.get_page(pagenum)
153	context.translate(0, 0)
154	page.render(context) # render the page on context
155	context.show_page() # draw context on surface
156	surface.finish()
157
158	#For now, poppler cannot write meta, so we must use pdfrw
159	logging.debug('Removing %s\'s superficial metadata' % self.filename)
160	trailer = pdfrw.PdfReader(self.output)
161	trailer.Info.Producer = trailer.Info.Creator = None
162	writer = pdfrw.PdfWriter()
163	writer.trailer = trailer
164	writer.write(self.output)
165	self.do_backup()
166
167	def get_meta(self):
168	'''
169	Return a dict with all the meta of the file
170	'''
171	metadata = {}
172	for key in self.meta_list:
173	if key == 'creation-date' or key == 'mod-date':
174	#creation and modification are set to -1
175	if self.document.get_property(key) != -1:
176	metadata[key] = self.document.get_property(key)
177	elif self.document.get_property(key) is not None and \
178	self.document.get_property(key) != '':
179	metadata[key] = self.document.get_property(key)
180	return metadata
181
182
183	class OpenXmlStripper(archive.GenericArchiveStripper):
184	'''
185	Represent an office openxml document, which is like
186	an opendocument format, with some tricky stuff added.
187	It contains mostly xml, but can have media blobs, crap, ...
188	(I don't like this format.)
189	'''
190	def _remove_all(self, method):
191	'''
192	FIXME ?
193	There is a patch implementing the Zipfile.remove()
194	method here : http://bugs.python.org/issue6818
195	'''
196	zipin = zipfile.ZipFile(self.filename, 'r')
197	zipout = zipfile.ZipFile(self.output, 'w',
198	allowZip64=True)
199	for item in zipin.namelist():
200	name = os.path.join(self.tempdir, item)
201	_, ext = os.path.splitext(name)
202	if item.startswith('docProps/'): # metadatas
203	pass
204	elif ext in parser.NOMETA or item == '.rels':
205	#keep parser.NOMETA files, and the file named ".rels"
206	zipin.extract(item, self.tempdir)
207	zipout.write(name, item)
208	else:
209	zipin.extract(item, self.tempdir)
210	if os.path.isfile(name): # don't care about folders
211	try:
212	cfile = mat.create_class_file(name, False,
213	self.add2archive)
214	if method == 'normal':
215	cfile.remove_all()
216	else:
217	cfile.remove_all_ugly()
218	logging.debug('Processing %s from %s' % (item,
219	self.filename))
220	zipout.write(name, item)
221	except:
222	logging.info('%s\' fileformat is not supported' % item)
223	if self.add2archive:
224	zipout.write(name, item)
225	zipout.comment = ''
226	logging.info('%s treated' % self.filename)
227	zipin.close()
228	zipout.close()
229	self.do_backup()
230
231	def is_clean(self):
232	'''
233	Check if the file is clean from harmful metadatas
234	'''
235	zipin = zipfile.ZipFile(self.filename, 'r')
236	for item in zipin.namelist():
237	if item.startswith('docProps/'):
238	return False
239	zipin.close()
240	czf = archive.ZipStripper(self.filename, self.parser,
241	'application/zip', self.backup, self.add2archive)
242	if not czf.is_clean():
243	return False
244	else:
245	return True
246
247	def get_meta(self):
248	'''
249	Return a dict with all the meta of the file
250	'''
251	zipin = zipfile.ZipFile(self.filename, 'r')
252	metadata = {}
253	for item in zipin.namelist():
254	if item.startswith('docProps/'):
255	metadata[item] = 'harmful content'
256	zipin.close()
257	return metadata