Reorganize source tree and files installation location, cleanup setup.py (Closes: #689409)

author: jvoisin 2012-12-08 02:02:25 +0100
committer: jvoisin 2012-12-13 14:24:01 +0100
commit: cbf8a2a65928694202e19b6bcf56ec84bcbf613c (patch)
tree: e106475b0d5c003505336b5ae6416e4508bb768b /MAT/office.py
parent: 67d5c1fa6b9ab6e1e7328ee57b15d8e46526d72a (diff)
1 files changed, 265 insertions, 0 deletions
diff --git a/MAT/office.py b/MAT/office.py
new file mode 100644
index 0000000..d14125b
--- /dev/null
+++ b/MAT/office.py
@@ -0,0 +1,265 @@
+'''
+    Care about office's formats
+'''
+import os
+import logging
+import zipfile
+import fileinput
+import xml.dom.minidom as minidom
+try:
+    import cairo
+    import poppler
+except ImportError:
+    pass
+import mat
+import parser
+import archive
+class OpenDocumentStripper(archive.GenericArchiveStripper):
+    '''
+        An OpenDocument file is a zip archive holding several xml files.
+        The one carrying the document's metadata is meta.xml
+    '''
+    def get_meta(self):
+        '''
+            Return a dict with all the meta of the file by
+            trying to read the meta.xml file.
+            Keys are the tag names without their namespace prefix,
+            values are the concatenated text of each tag.
+            The dict is empty when the archive has no meta.xml or
+            when meta.xml has no office:meta element.
+        '''
+        metadata = {}
+        zipin = zipfile.ZipFile(self.filename, 'r')
+        try:
+            content = zipin.read('meta.xml')
+        except KeyError:  # no meta.xml file found
+            logging.debug('%s has no opendocument metadata' % self.filename)
+            return metadata
+        finally:
+            # release the archive on every path, not only on success
+            zipin.close()
+        dom1 = minidom.parseString(content)
+        elements = dom1.getElementsByTagName('office:meta')
+        if not elements:
+            return metadata
+        for node in elements[0].childNodes:
+            if node.nodeType != node.ELEMENT_NODE:
+                # text/comment nodes between the tags have no tagName
+                continue
+            if node.tagName == 'meta:document-statistic':
+                # its content lives in attributes, which are not reported
+                continue
+            nodename = ''.join(node.nodeName.split(':')[1:])
+            # only character data has a .data attribute worth joining
+            metadata[nodename] = ''.join([child.data
+                for child in node.childNodes
+                if child.nodeType in (child.TEXT_NODE,
+                    child.CDATA_SECTION_NODE)])
+        return metadata
+    def _remove_all(self):
+        '''
+            Write to self.output a copy of the archive without meta.xml:
+            the manifest is rewritten without the lines mentioning
+            meta.xml, NOMETA files and 'mimetype' are copied as they are,
+            and every other file is cleaned by its own stripper first.
+            Files that cannot be cleaned are only copied when
+            self.add2archive is set. Return True.
+            FIXME ?
+            There is a patch implementing the Zipfile.remove()
+            method here : http://bugs.python.org/issue6818
+        '''
+        zipin = zipfile.ZipFile(self.filename, 'r')
+        try:
+            zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
+            try:
+                for item in zipin.namelist():
+                    name = os.path.join(self.tempdir, item)
+                    _, ext = os.path.splitext(name)
+                    if item.endswith('manifest.xml'):
+                        # contain the list of all files present in the archive
+                        zipin.extract(item, self.tempdir)
+                        for line in fileinput.input(name, inplace=1):
+                            # remove the line which contains "meta.xml";
+                            # with inplace=1, print writes back to the file
+                            line = line.strip()
+                            if 'meta.xml' not in line:
+                                print line
+                        zipout.write(name, item)
+                    elif ext in parser.NOMETA or item == 'mimetype':
+                        # keep NOMETA files, and the "mimetype" file
+                        if item != 'meta.xml':  # contains the metadata
+                            zipin.extract(item, self.tempdir)
+                            zipout.write(name, item)
+                    else:
+                        zipin.extract(item, self.tempdir)
+                        if os.path.isfile(name):
+                            try:
+                                cfile = mat.create_class_file(name, False,
+                                    self.add2archive)
+                                cfile.remove_all()
+                                logging.debug('Processing %s from %s' % (item,
+                                    self.filename))
+                                zipout.write(name, item)
+                            except Exception:
+                                # best effort: an unsupported member must not
+                                # abort the whole archive, but Ctrl-C must
+                                # still be able to interrupt the processing
+                                logging.info('%s\' fileformat is not supported' % item)
+                                if self.add2archive:
+                                    zipout.write(name, item)
+                zipout.comment = ''
+            finally:
+                zipout.close()
+        finally:
+            zipin.close()
+        logging.info('%s treated' % self.filename)
+        self.do_backup()
+        return True
+    def is_clean(self):
+        '''
+            Check if the file is clean from harmful metadatas:
+            it must have no meta.xml, and the archive itself must be
+            reported as clean by archive.ZipStripper.
+        '''
+        zipin = zipfile.ZipFile(self.filename, 'r')
+        try:
+            zipin.getinfo('meta.xml')
+        except KeyError:  # no meta.xml in the file
+            pass
+        else:
+            return False
+        finally:
+            zipin.close()
+        czf = archive.ZipStripper(self.filename, self.parser,
+            'application/zip', self.backup, self.add2archive)
+        return bool(czf.is_clean())
+class PdfStripper(parser.GenericParser):
+    '''
+        Represent a PDF file, opened with poppler and cleaned by
+        re-rendering it with cairo.
+    '''
+    def __init__(self, filename, parser, mime, backup, add2archive):
+        super(PdfStripper, self).__init__(filename, parser, mime, backup,
+            add2archive)
+        # NOTE(review): the path is not percent-encoded, filenames with
+        # characters such as '#' or '%' may not open correctly - confirm
+        uri = 'file://' + os.path.abspath(self.filename)
+        self.password = None
+        self.document = poppler.document_new_from_file(uri, self.password)
+        # document properties considered as metadata
+        self.meta_list = frozenset(['title', 'author', 'subject', 'keywords', 'creator',
+            'producer', 'metadata'])
+    def is_clean(self):
+        '''
+            Check if the file is clean from harmful metadatas:
+            True when none of the properties of meta_list is set.
+        '''
+        for key in self.meta_list:
+            if self.document.get_property(key):
+                return False
+        return True
+    def remove_all(self):
+        '''
+            Remove metadata
+        '''
+        return self._remove_meta()
+    def _remove_meta(self):
+        '''
+            Opening the PDF with poppler, then doing a render
+            on a cairo pdfsurface for each pages.
+            The Producer and Creator fields that cairo writes are
+            then removed with pdfrw.
+            Return True on success, False when pdfrw is missing or
+            when the rewrite of the output failed.
+            http://cairographics.org/documentation/pycairo/2/
+            python-poppler is not documented at all : have fun ;)
+        '''
+        page = self.document.get_page(0)
+        # assume that every pages are the same size
+        page_width, page_height = page.get_size()
+        surface = cairo.PDFSurface(self.output, page_width, page_height)
+        context = cairo.Context(surface)  # context draws on the surface
+        logging.debug('PDF rendering of %s' % self.filename)
+        for pagenum in xrange(self.document.get_n_pages()):
+            page = self.document.get_page(pagenum)
+            context.translate(0, 0)
+            page.render_for_printing(context)  # render the page on context
+            context.show_page()  # draw context on surface
+        surface.finish()
+        try:
+            import pdfrw  # For now, poppler cannot write meta, so we must use pdfrw
+        except ImportError:
+            # only a missing module should ask the user to install it
+            print('Unable to remove all metadata from %s, please install '
+                'pdfrw' % self.output)
+            return False
+        logging.debug('Removing %s\'s superficial metadata' % self.filename)
+        try:
+            trailer = pdfrw.PdfReader(self.output)
+            info = trailer.Info
+            if info is not None:  # nothing to blank without an Info dict
+                info.Producer = None
+                info.Creator = None
+            writer = pdfrw.PdfWriter()
+            writer.trailer = trailer
+            writer.write(self.output)
+            self.do_backup()
+        except Exception:
+            # keep the "return False instead of raising" contract, but
+            # report the real error instead of blaming a missing pdfrw
+            logging.exception('Rewriting the metadata of %s failed' %
+                self.output)
+            print('Unable to remove all metadata from %s' % self.output)
+            return False
+        return True
+    def get_meta(self):
+        '''
+            Return a dict with all the meta of the file:
+            one entry for each property of meta_list that is set.
+        '''
+        metadata = {}
+        for key in self.meta_list:
+            value = self.document.get_property(key)
+            if value:
+                metadata[key] = value
+        return metadata
+class OpenXmlStripper(archive.GenericArchiveStripper):
+    '''
+        Represent an office openxml document, which is like
+        an opendocument format, with some tricky stuff added.
+        It contains mostly xml, but can have media blobs, crap, ...
+        (I don't like this format.)
+    '''
+    def _remove_all(self):
+        '''
+            Write to self.output a copy of the archive without the
+            docProps/ entries: NOMETA files and the ".rels" file are
+            copied as they are, and every other file is cleaned by its
+            own stripper first. Files that cannot be cleaned are only
+            copied when self.add2archive is set. Return True.
+            FIXME ?
+            There is a patch implementing the Zipfile.remove()
+            method here : http://bugs.python.org/issue6818
+        '''
+        zipin = zipfile.ZipFile(self.filename, 'r')
+        try:
+            zipout = zipfile.ZipFile(self.output, 'w',
+                allowZip64=True)
+            try:
+                for item in zipin.namelist():
+                    name = os.path.join(self.tempdir, item)
+                    _, ext = os.path.splitext(name)
+                    if item.startswith('docProps/'):  # metadatas
+                        continue
+                    # the member is stored as "_rels/.rels", so the name
+                    # must be compared without its directory; splitext
+                    # gives no extension for a name starting with a dot
+                    if ext in parser.NOMETA or os.path.basename(item) == '.rels':
+                        # keep parser.NOMETA files, and the file named ".rels"
+                        zipin.extract(item, self.tempdir)
+                        zipout.write(name, item)
+                        continue
+                    zipin.extract(item, self.tempdir)
+                    if not os.path.isfile(name):  # don't care about folders
+                        continue
+                    try:
+                        cfile = mat.create_class_file(name, False,
+                            self.add2archive)
+                        cfile.remove_all()
+                        logging.debug('Processing %s from %s' % (item,
+                            self.filename))
+                        zipout.write(name, item)
+                    except Exception:
+                        # best effort: an unsupported member must not abort
+                        # the whole archive, but Ctrl-C must still be able
+                        # to interrupt the processing
+                        logging.info('%s\' fileformat is not supported' % item)
+                        if self.add2archive:
+                            zipout.write(name, item)
+                zipout.comment = ''
+            finally:
+                zipout.close()
+        finally:
+            zipin.close()
+        logging.info('%s treated' % self.filename)
+        self.do_backup()
+        return True
+    def is_clean(self):
+        '''
+            Check if the file is clean from harmful metadatas:
+            it must have no docProps/ entry, and the archive itself
+            must be reported as clean by archive.ZipStripper.
+        '''
+        zipin = zipfile.ZipFile(self.filename, 'r')
+        try:
+            for item in zipin.namelist():
+                if item.startswith('docProps/'):
+                    return False
+        finally:
+            # also runs on the early return above
+            zipin.close()
+        czf = archive.ZipStripper(self.filename, self.parser,
+            'application/zip', self.backup, self.add2archive)
+        return czf.is_clean()
+    def get_meta(self):
+        '''
+            Return a dict with all the meta of the file:
+            every docProps/ entry is listed as 'harmful content',
+            its content is not parsed.
+        '''
+        metadata = {}
+        zipin = zipfile.ZipFile(self.filename, 'r')
+        try:
+            for item in zipin.namelist():
+                if item.startswith('docProps/'):
+                    metadata[item] = 'harmful content'
+        finally:
+            zipin.close()
+        return metadata
author	jvoisin	2012-12-08 02:02:25 +0100
committer	jvoisin	2012-12-13 14:24:01 +0100
commit	cbf8a2a65928694202e19b6bcf56ec84bcbf613c (patch)
tree	e106475b0d5c003505336b5ae6416e4508bb768b /MAT/office.py
parent	67d5c1fa6b9ab6e1e7328ee57b15d8e46526d72a (diff)

diff --git a/MAT/office.py b/MAT/office.py new file mode 100644 index 0000000..d14125b --- /dev/null +++ b/MAT/office.py
@@ -0,0 +1,265 @@
	1	'''
	2	Care about office's formats
	3	'''
	4
	5	import os
	6	import logging
	7	import zipfile
	8	import fileinput
	9	import xml.dom.minidom as minidom
	10
	11	try:
	12	import cairo
	13	import poppler
	14	except ImportError:
	15	pass
	16
	17	import mat
	18	import parser
	19	import archive
	20
	21
	22	class OpenDocumentStripper(archive.GenericArchiveStripper):
	23	'''
	24	An open document file is a zip, with xml file into.
	25	The one that interest us is meta.xml
	26	'''
	27
	28	def get_meta(self):
	29	'''
	30	Return a dict with all the meta of the file by
	31	trying to read the meta.xml file.
	32	'''
	33	zipin = zipfile.ZipFile(self.filename, 'r')
	34	metadata = {}
	35	try:
	36	content = zipin.read('meta.xml')
	37	dom1 = minidom.parseString(content)
	38	elements = dom1.getElementsByTagName('office:meta')
	39	for i in elements[0].childNodes:
	40	if i.tagName != 'meta:document-statistic':
	41	nodename = ''.join([k for k in i.nodeName.split(':')[1:]])
	42	metadata[nodename] = ''.join([j.data for j in i.childNodes])
	43	else:
	44	# thank you w3c for not providing a nice
	45	# method to get all attributes from a node
	46	pass
	47	zipin.close()
	48	except KeyError: # no meta.xml file found
	49	logging.debug('%s has no opendocument metadata' % self.filename)
	50	return metadata
	51
	52	def _remove_all(self):
	53	'''
	54	FIXME ?
	55	There is a patch implementing the Zipfile.remove()
	56	method here : http://bugs.python.org/issue6818
	57	'''
	58	zipin = zipfile.ZipFile(self.filename, 'r')
	59	zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
	60
	61	for item in zipin.namelist():
	62	name = os.path.join(self.tempdir, item)
	63	_, ext = os.path.splitext(name)
	64
	65	if item.endswith('manifest.xml'):
	66	# contain the list of all files present in the archive
	67	zipin.extract(item, self.tempdir)
	68	for line in fileinput.input(name, inplace=1):
	69	#remove the line which contains "meta.xml"
	70	line = line.strip()
	71	if not 'meta.xml' in line:
	72	print line
	73	zipout.write(name, item)
	74
	75	elif ext in parser.NOMETA or item == 'mimetype':
	76	#keep NOMETA files, and the "manifest" file
	77	if item != 'meta.xml': # contains the metadata
	78	zipin.extract(item, self.tempdir)
	79	zipout.write(name, item)
	80
	81	else:
	82	zipin.extract(item, self.tempdir)
	83	if os.path.isfile(name):
	84	try:
	85	cfile = mat.create_class_file(name, False,
	86	self.add2archive)
	87	cfile.remove_all()
	88	logging.debug('Processing %s from %s' % (item,
	89	self.filename))
	90	zipout.write(name, item)
	91	except:
	92	logging.info('%s\' fileformat is not supported' % item)
	93	if self.add2archive:
	94	zipout.write(name, item)
	95	zipout.comment = ''
	96	logging.info('%s treated' % self.filename)
	97	zipin.close()
	98	zipout.close()
	99	self.do_backup()
	100	return True
	101
	102	def is_clean(self):
	103	'''
	104	Check if the file is clean from harmful metadatas
	105	'''
	106	zipin = zipfile.ZipFile(self.filename, 'r')
	107	try:
	108	zipin.getinfo('meta.xml')
	109	except KeyError: # no meta.xml in the file
	110	czf = archive.ZipStripper(self.filename, self.parser,
	111	'application/zip', self.backup, self.add2archive)
	112	if czf.is_clean():
	113	zipin.close()
	114	return True
	115	zipin.close()
	116	return False
	117
	118
	119	class PdfStripper(parser.GenericParser):
	120	'''
	121	Represent a PDF file
	122	'''
	123	def __init__(self, filename, parser, mime, backup, add2archive):
	124	super(PdfStripper, self).__init__(filename, parser, mime, backup,
	125	add2archive)
	126	uri = 'file://' + os.path.abspath(self.filename)
	127	self.password = None
	128	self.document = poppler.document_new_from_file(uri, self.password)
	129	self.meta_list = frozenset(['title', 'author', 'subject', 'keywords', 'creator',
	130	'producer', 'metadata'])
	131
	132	def is_clean(self):
	133	'''
	134	Check if the file is clean from harmful metadatas
	135	'''
	136	for key in self.meta_list:
	137	if self.document.get_property(key):
	138	return False
	139	return True
	140
	141	def remove_all(self):
	142	'''
	143	Remove metadata
	144	'''
	145	return self._remove_meta()
	146
	147	def _remove_meta(self):
	148	'''
	149	Opening the PDF with poppler, then doing a render
	150	on a cairo pdfsurface for each pages.
	151
	152	http://cairographics.org/documentation/pycairo/2/
	153	python-poppler is not documented at all : have fun ;)
	154	'''
	155	page = self.document.get_page(0)
	156	# assume that every pages are the same size
	157	page_width, page_height = page.get_size()
	158	surface = cairo.PDFSurface(self.output, page_width, page_height)
	159	context = cairo.Context(surface) # context draws on the surface
	160	logging.debug('PDF rendering of %s' % self.filename)
	161	for pagenum in xrange(self.document.get_n_pages()):
	162	page = self.document.get_page(pagenum)
	163	context.translate(0, 0)
	164	page.render_for_printing(context) # render the page on context
	165	context.show_page() # draw context on surface
	166	surface.finish()
	167
	168	try:
	169	import pdfrw # For now, poppler cannot write meta, so we must use pdfrw
	170	logging.debug('Removing %s\'s superficial metadata' % self.filename)
	171	trailer = pdfrw.PdfReader(self.output)
	172	trailer.Info.Producer = None
	173	trailer.Info.Creator = None
	174	writer = pdfrw.PdfWriter()
	175	writer.trailer = trailer
	176	writer.write(self.output)
	177	self.do_backup()
	178	return True
	179	except:
	180	print('Unable to remove all metadata from %s, please install\
	181	pdfrw' % self.output)
	182	return False
	183	return True
	184
	185	def get_meta(self):
	186	'''
	187	Return a dict with all the meta of the file
	188	'''
	189	metadata = {}
	190	for key in self.meta_list:
	191	if self.document.get_property(key):
	192	metadata[key] = self.document.get_property(key)
	193	return metadata
	194
	195
	196	class OpenXmlStripper(archive.GenericArchiveStripper):
	197	'''
	198	Represent an office openxml document, which is like
	199	an opendocument format, with some tricky stuff added.
	200	It contains mostly xml, but can have media blobs, crap, ...
	201	(I don't like this format.)
	202	'''
	203	def _remove_all(self):
	204	'''
	205	FIXME ?
	206	There is a patch implementing the Zipfile.remove()
	207	method here : http://bugs.python.org/issue6818
	208	'''
	209	zipin = zipfile.ZipFile(self.filename, 'r')
	210	zipout = zipfile.ZipFile(self.output, 'w',
	211	allowZip64=True)
	212	for item in zipin.namelist():
	213	name = os.path.join(self.tempdir, item)
	214	_, ext = os.path.splitext(name)
	215	if item.startswith('docProps/'): # metadatas
	216	pass
	217	elif ext in parser.NOMETA or item == '.rels':
	218	#keep parser.NOMETA files, and the file named ".rels"
	219	zipin.extract(item, self.tempdir)
	220	zipout.write(name, item)
	221	else:
	222	zipin.extract(item, self.tempdir)
	223	if os.path.isfile(name): # don't care about folders
	224	try:
	225	cfile = mat.create_class_file(name, False,
	226	self.add2archive)
	227	cfile.remove_all()
	228	logging.debug('Processing %s from %s' % (item,
	229	self.filename))
	230	zipout.write(name, item)
	231	except:
	232	logging.info('%s\' fileformat is not supported' % item)
	233	if self.add2archive:
	234	zipout.write(name, item)
	235	zipout.comment = ''
	236	logging.info('%s treated' % self.filename)
	237	zipin.close()
	238	zipout.close()
	239	self.do_backup()
	240	return True
	241
	242	def is_clean(self):
	243	'''
	244	Check if the file is clean from harmful metadatas
	245	'''
	246	zipin = zipfile.ZipFile(self.filename, 'r')
	247	for item in zipin.namelist():
	248	if item.startswith('docProps/'):
	249	return False
	250	zipin.close()
	251	czf = archive.ZipStripper(self.filename, self.parser,
	252	'application/zip', self.backup, self.add2archive)
	253	return czf.is_clean()
	254
	255	def get_meta(self):
	256	'''
	257	Return a dict with all the meta of the file
	258	'''
	259	zipin = zipfile.ZipFile(self.filename, 'r')
	260	metadata = {}
	261	for item in zipin.namelist():
	262	if item.startswith('docProps/'):
	263	metadata[item] = 'harmful content'
	264	zipin.close()
	265	return metadata