1 files changed, 305 insertions, 0 deletions
diff --git a/lib/office.py b/lib/office.py
new file mode 100644
index 0000000..e1d738e
--- /dev/null
+++ b/lib/office.py
@@ -0,0 +1,305 @@
+'''
+    Care about office's formats
+'''
+import os
+import logging
+import zipfile
+import fileinput
+import subprocess
+import xml.dom.minidom as minidom
+try:
+    import cairo
+    import poppler
+except ImportError:
+    pass
+import mat
+import parser
+import archive
+class OpenDocumentStripper(archive.GenericArchiveStripper):
+    '''
+        An open document file is a zip archive containing xml files.
+        The one that interests us is meta.xml, which holds the metadata.
+    '''
+    def get_meta(self):
+        '''
+            Return a dict with all the meta of the file by
+            trying to read the meta.xml file.
+
+            Keys are the names of the child elements of office:meta
+            without their namespace prefix, values are the text those
+            elements contain. The dict is empty when the archive has
+            no meta.xml or no office:meta element.
+        '''
+        metadata = {}
+        zipin = zipfile.ZipFile(self.filename, 'r')
+        try:
+            content = zipin.read('meta.xml')
+        except KeyError:  # no meta.xml file found
+            logging.debug('%s has no opendocument metadata' % self.filename)
+            return metadata
+        finally:
+            # close the archive on every path, not only on success
+            zipin.close()
+        dom1 = minidom.parseString(content)
+        elements = dom1.getElementsByTagName('office:meta')
+        if not elements:
+            return metadata
+        for node in elements[0].childNodes:
+            # text nodes (e.g. indentation between elements) have no
+            # tagName, so only element nodes are inspected
+            if node.nodeType != node.ELEMENT_NODE:
+                continue
+            if node.tagName == 'meta:document-statistic':
+                # statistics are deliberately left out of the result
+                continue
+            nodename = ''.join(node.nodeName.split(':')[1:])
+            # only text-bearing children have a .data attribute
+            metadata[nodename] = ''.join([
+                child.data for child in node.childNodes
+                if child.nodeType in (child.TEXT_NODE,
+                    child.CDATA_SECTION_NODE)])
+        return metadata
+    def _remove_all(self, method):
+        '''
+            Write a copy of the archive to self.output without meta.xml,
+            cleaning every member that is not known to be harmless.
+
+            method is 'normal' to call remove_all() on the members,
+            anything else calls remove_all_strict(). Returns True.
+
+            FIXME ?
+            There is a patch implementing the Zipfile.remove()
+            method here : http://bugs.python.org/issue6818
+        '''
+        import sys  # local import: only needed to rewrite the manifest
+        zipin = zipfile.ZipFile(self.filename, 'r')
+        try:
+            zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
+            try:
+                for item in zipin.namelist():
+                    name = os.path.join(self.tempdir, item)
+                    _, ext = os.path.splitext(name)
+                    if item.endswith('manifest.xml'):
+                        # contain the list of all files present in the archive
+                        zipin.extract(item, self.tempdir)
+                        # with inplace=1, fileinput redirects sys.stdout
+                        # into the file being read, so every line written
+                        # here ends up in the rewritten manifest
+                        for line in fileinput.input(name, inplace=1):
+                            # remove the line which contains "meta.xml"
+                            line = line.strip()
+                            if 'meta.xml' not in line:
+                                sys.stdout.write(line + '\n')
+                        zipout.write(name, item)
+                    elif ext in parser.NOMETA or item == 'mimetype':
+                        # keep NOMETA files, and the "mimetype" file
+                        if item != 'meta.xml':  # contains the metadata
+                            zipin.extract(item, self.tempdir)
+                            zipout.write(name, item)
+                    else:
+                        zipin.extract(item, self.tempdir)
+                        if os.path.isfile(name):  # skip folders
+                            self._clean_member(zipout, name, item, method)
+                zipout.comment = ''
+            finally:
+                zipout.close()
+        finally:
+            zipin.close()
+        logging.info('%s treated' % self.filename)
+        self.do_backup()
+        return True
+    def _clean_member(self, zipout, name, item, method):
+        '''
+            Clean the extracted file name and store it in zipout as item.
+            A member that cannot be cleaned is only stored when
+            self.add2archive is set.
+        '''
+        try:
+            cfile = mat.create_class_file(name, False, self.add2archive)
+            if method == 'normal':
+                cfile.remove_all()
+            else:
+                cfile.remove_all_strict()
+            logging.debug('Processing %s from %s' % (item, self.filename))
+            zipout.write(name, item)
+        except Exception:
+            # Exception rather than a bare except, so that
+            # KeyboardInterrupt and SystemExit still propagate
+            logging.info('%s\' fileformat is not supported' % item)
+            if self.add2archive:
+                zipout.write(name, item)
+    def is_clean(self):
+        '''
+            Check if the file is clean from harmful metadatas:
+            it must have no meta.xml, and must be clean when
+            checked as a plain zip archive.
+        '''
+        zipin = zipfile.ZipFile(self.filename, 'r')
+        try:
+            zipin.getinfo('meta.xml')
+        except KeyError:  # no meta.xml in the file
+            pass
+        else:
+            return False
+        finally:
+            zipin.close()
+        czf = archive.ZipStripper(self.filename, self.parser,
+            'application/zip', self.backup, self.add2archive)
+        return bool(czf.is_clean())
+class PdfStripper(parser.GenericParser):
+    '''
+        Represent a PDF file
+    '''
+    def __init__(self, filename, parser, mime, backup, add2archive):
+        '''
+            Open the document with poppler and record which document
+            properties are treated as metadata.
+        '''
+        super(PdfStripper, self).__init__(filename, parser, mime, backup,
+            add2archive)
+        # NOTE(review): the path is not percent-encoded; presumably
+        # poppler copes with the filenames met in practice - confirm
+        uri = 'file://' + os.path.abspath(self.filename)
+        self.password = None
+        self.document = poppler.document_new_from_file(uri, self.password)
+        self.meta_list = ('title', 'author', 'subject', 'keywords', 'creator',
+            'producer', 'metadata')
+    def is_clean(self):
+        '''
+            Check if the file is clean from harmful metadatas:
+            True when every property of meta_list is None or empty.
+        '''
+        for key in self.meta_list:
+            value = self.document.get_property(key)
+            if value is not None and value != '':
+                return False
+        return True
+    def remove_all(self):
+        '''
+            Remove superficial metadata only
+        '''
+        return self._remove_meta()
+    def remove_all_strict(self):
+        '''
+            Opening the PDF with poppler, then doing a render
+            on a cairo pdfsurface for each pages.
+            Thanks to Lunar^for the idea.
+            http://cairographics.org/documentation/pycairo/2/
+            python-poppler is not documented at all : have fun ;)
+
+            Returns the result of _remove_meta().
+        '''
+        page = self.document.get_page(0)
+        page_width, page_height = page.get_size()
+        surface = cairo.PDFSurface(self.output, page_width, page_height)
+        context = cairo.Context(surface)  # context draws on the surface
+        logging.debug('PDF rendering of %s' % self.filename)
+        for pagenum in range(self.document.get_n_pages()):
+            page = self.document.get_page(pagenum)
+            # give each output page the size of its source page instead
+            # of forcing every page to the size of the first one
+            page_width, page_height = page.get_size()
+            surface.set_size(page_width, page_height)
+            page.render(context)  # render the page on context
+            context.show_page()  # draw context on surface
+        surface.finish()
+        return self._remove_meta()
+    def _remove_meta(self):
+        '''
+            Remove superficial/external metadata
+            from a PDF file, using pdfrw and exiftool.
+            Both tools are tried; returns True when at least
+            one of them ran without raising, False otherwise.
+        '''
+        processed = False
+        try:  # try with pdfrw
+            import pdfrw
+            # For now, poppler cannot write meta, so we must use pdfrw
+            logging.debug('Removing %s\'s superficial metadata' % self.filename)
+            trailer = pdfrw.PdfReader(self.output)
+            info = trailer.Info
+            if info is not None:
+                # Author lives in the Info dictionary, like Producer and
+                # Creator; it was previously set on the trailer itself
+                info.Producer = info.Author = info.Creator = None
+            writer = pdfrw.PdfWriter()
+            writer.trailer = trailer
+            writer.write(self.output)
+            self.do_backup()
+            processed = True
+        except Exception:
+            # best effort: pdfrw may be missing or unable to read the file
+            logging.debug('pdfrw could not process %s' % self.filename)
+        try:  # try with exiftool
+            # NOTE(review): imported but never used in this method;
+            # presumably an availability check - confirm
+            import exiftool
+            # Note: '-All=' must be followed by a known exiftool option.
+            if self.backup:
+                command = ['exiftool', '-m', '-All=',
+                    '-out', self.output, self.filename]
+            else:
+                command = ['exiftool', '-All=', '-overwrite_original',
+                    self.filename]
+            # opened for writing (it receives exiftool's output) and
+            # closed afterwards instead of being leaked
+            devnull = open(os.devnull, 'w')
+            try:
+                # Popen raises OSError when the exiftool binary is missing
+                process = subprocess.Popen(command, stdout=devnull)
+                process.wait()
+            finally:
+                devnull.close()
+            processed = True
+        except Exception:
+            # best effort: exiftool may not be installed
+            logging.debug('exiftool could not process %s' % self.filename)
+        if processed is False:
+            logging.error('Please install either pdfrw, or exiftool to '
+                'fully handle PDF files')
+        return processed
+    def get_meta(self):
+        '''
+            Return a dict with all the meta of the file: one entry
+            for each property of meta_list that is neither None nor empty.
+        '''
+        metadata = {}
+        for key in self.meta_list:
+            value = self.document.get_property(key)
+            if value is not None and value != '':
+                metadata[key] = value
+        return metadata
+class OpenXmlStripper(archive.GenericArchiveStripper):
+    '''
+        Represent an office openxml document, which is like
+        an opendocument format, with some tricky stuff added.
+        It contains mostly xml, but can have media blobs, crap, ...
+        (I don't like this format.)
+    '''
+    def _remove_all(self, method):
+        '''
+            Write a copy of the archive to self.output without the
+            docProps/ members, cleaning every member that is not
+            known to be harmless.
+
+            method is 'normal' to call remove_all() on the members,
+            anything else calls remove_all_strict(). Returns True.
+
+            FIXME ?
+            There is a patch implementing the Zipfile.remove()
+            method here : http://bugs.python.org/issue6818
+        '''
+        zipin = zipfile.ZipFile(self.filename, 'r')
+        try:
+            zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
+            try:
+                for item in zipin.namelist():
+                    if item.startswith('docProps/'):  # metadatas
+                        continue
+                    name = os.path.join(self.tempdir, item)
+                    _, ext = os.path.splitext(name)
+                    # the file named ".rels" is stored inside a folder,
+                    # so compare the last path component, not the whole
+                    # member name
+                    if ext in parser.NOMETA or \
+                        os.path.basename(item) == '.rels':
+                        # keep parser.NOMETA files, and the ".rels" file
+                        zipin.extract(item, self.tempdir)
+                        zipout.write(name, item)
+                    else:
+                        zipin.extract(item, self.tempdir)
+                        if os.path.isfile(name):  # don't care about folders
+                            self._clean_member(zipout, name, item, method)
+                zipout.comment = ''
+            finally:
+                zipout.close()
+        finally:
+            zipin.close()
+        logging.info('%s treated' % self.filename)
+        self.do_backup()
+        return True
+    def _clean_member(self, zipout, name, item, method):
+        '''
+            Clean the extracted file name and store it in zipout as item.
+            A member that cannot be cleaned is only stored when
+            self.add2archive is set.
+        '''
+        try:
+            cfile = mat.create_class_file(name, False, self.add2archive)
+            if method == 'normal':
+                cfile.remove_all()
+            else:
+                cfile.remove_all_strict()
+            logging.debug('Processing %s from %s' % (item, self.filename))
+            zipout.write(name, item)
+        except Exception:
+            # Exception rather than a bare except, so that
+            # KeyboardInterrupt and SystemExit still propagate
+            logging.info('%s\' fileformat is not supported' % item)
+            if self.add2archive:
+                zipout.write(name, item)
+    def is_clean(self):
+        '''
+            Check if the file is clean from harmful metadatas:
+            it must have no docProps/ member, and must be clean
+            when checked as a plain zip archive.
+        '''
+        zipin = zipfile.ZipFile(self.filename, 'r')
+        try:
+            for item in zipin.namelist():
+                if item.startswith('docProps/'):
+                    return False
+        finally:
+            # also runs on the early return above
+            zipin.close()
+        czf = archive.ZipStripper(self.filename, self.parser,
+            'application/zip', self.backup, self.add2archive)
+        return bool(czf.is_clean())
+    def get_meta(self):
+        '''
+            Return a dict with all the meta of the file: every
+            docProps/ member is reported as 'harmful content'.
+        '''
+        zipin = zipfile.ZipFile(self.filename, 'r')
+        try:
+            members = zipin.namelist()
+        finally:
+            zipin.close()
+        metadata = {}
+        for item in members:
+            if item.startswith('docProps/'):
+                metadata[item] = 'harmful content'
+        return metadata

diff --git a/lib/office.py b/lib/office.py new file mode 100644 index 0000000..e1d738e --- /dev/null +++ b/lib/office.py
@@ -0,0 +1,305 @@
	1	'''
	2	Care about office's formats
	3	'''
	4
	5	import os
	6	import logging
	7	import zipfile
	8	import fileinput
	9	import subprocess
	10	import xml.dom.minidom as minidom
	11
	12	try:
	13	import cairo
	14	import poppler
	15	except ImportError:
	16	pass
	17
	18	import mat
	19	import parser
	20	import archive
	21
	22	class OpenDocumentStripper(archive.GenericArchiveStripper):
	23	'''
	24	An open document file is a zip, with xml file into.
	25	The one that interest us is meta.xml
	26	'''
	27
	28	def get_meta(self):
	29	'''
	30	Return a dict with all the meta of the file by
	31	trying to read the meta.xml file.
	32	'''
	33	zipin = zipfile.ZipFile(self.filename, 'r')
	34	metadata = {}
	35	try:
	36	content = zipin.read('meta.xml')
	37	dom1 = minidom.parseString(content)
	38	elements = dom1.getElementsByTagName('office:meta')
	39	for i in elements[0].childNodes:
	40	if i.tagName != 'meta:document-statistic':
	41	nodename = ''.join([k for k in i.nodeName.split(':')[1:]])
	42	metadata[nodename] = ''.join([j.data for j in i.childNodes])
	43	else:
	44	# thank you w3c for not providing a nice
	45	# method to get all attributes from a node
	46	pass
	47	zipin.close()
	48	except KeyError: # no meta.xml file found
	49	logging.debug('%s has no opendocument metadata' % self.filename)
	50	return metadata
	51
	52	def _remove_all(self, method):
	53	'''
	54	FIXME ?
	55	There is a patch implementing the Zipfile.remove()
	56	method here : http://bugs.python.org/issue6818
	57	'''
	58	zipin = zipfile.ZipFile(self.filename, 'r')
	59	zipout = zipfile.ZipFile(self.output, 'w', allowZip64=True)
	60
	61	for item in zipin.namelist():
	62	name = os.path.join(self.tempdir, item)
	63	_, ext = os.path.splitext(name)
	64
	65	if item.endswith('manifest.xml'):
	66	# contain the list of all files present in the archive
	67	zipin.extract(item, self.tempdir)
	68	for line in fileinput.input(name, inplace=1):
	69	#remove the line which contains "meta.xml"
	70	line = line.strip()
	71	if not 'meta.xml' in line:
	72	print line
	73	zipout.write(name, item)
	74
	75	elif ext in parser.NOMETA or item == 'mimetype':
	76	#keep NOMETA files, and the "manifest" file
	77	if item != 'meta.xml': # contains the metadata
	78	zipin.extract(item, self.tempdir)
	79	zipout.write(name, item)
	80
	81	else:
	82	zipin.extract(item, self.tempdir)
	83	if os.path.isfile(name):
	84	try:
	85	cfile = mat.create_class_file(name, False,
	86	self.add2archive)
	87	if method == 'normal':
	88	cfile.remove_all()
	89	else:
	90	cfile.remove_all_strict()
	91	logging.debug('Processing %s from %s' % (item,
	92	self.filename))
	93	zipout.write(name, item)
	94	except:
	95	logging.info('%s\' fileformat is not supported' % item)
	96	if self.add2archive:
	97	zipout.write(name, item)
	98	zipout.comment = ''
	99	logging.info('%s treated' % self.filename)
	100	zipin.close()
	101	zipout.close()
	102	self.do_backup()
	103	return True
	104
	105	def is_clean(self):
	106	'''
	107	Check if the file is clean from harmful metadatas
	108	'''
	109	zipin = zipfile.ZipFile(self.filename, 'r')
	110	try:
	111	zipin.getinfo('meta.xml')
	112	except KeyError: # no meta.xml in the file
	113	czf = archive.ZipStripper(self.filename, self.parser,
	114	'application/zip', self.backup, self.add2archive)
	115	if czf.is_clean():
	116	zipin.close()
	117	return True
	118	zipin.close()
	119	return False
	120
	121
	122	class PdfStripper(parser.GenericParser):
	123	'''
	124	Represent a PDF file
	125	'''
	126	def __init__(self, filename, parser, mime, backup, add2archive):
	127	super(PdfStripper, self).__init__(filename, parser, mime, backup,
	128	add2archive)
	129	uri = 'file://' + os.path.abspath(self.filename)
	130	self.password = None
	131	self.document = poppler.document_new_from_file(uri, self.password)
	132	self.meta_list = ('title', 'author', 'subject', 'keywords', 'creator',
	133	'producer', 'metadata')
	134
	135	def is_clean(self):
	136	'''
	137	Check if the file is clean from harmful metadatas
	138	'''
	139	for key in self.meta_list:
	140	if self.document.get_property(key) is not None and \
	141	self.document.get_property(key) != '':
	142	return False
	143	return True
	144
	145
	146	def remove_all(self):
	147	'''
	148	Remove supperficial
	149	'''
	150	return self._remove_meta()
	151
	152
	153	def remove_all_strict(self):
	154	'''
	155	Opening the PDF with poppler, then doing a render
	156	on a cairo pdfsurface for each pages.
	157	Thanks to Lunar^for the idea.
	158	http://cairographics.org/documentation/pycairo/2/
	159	python-poppler is not documented at all : have fun ;)
	160	'''
	161	page = self.document.get_page(0)
	162	page_width, page_height = page.get_size()
	163	surface = cairo.PDFSurface(self.output, page_width, page_height)
	164	context = cairo.Context(surface) # context draws on the surface
	165	logging.debug('PDF rendering of %s' % self.filename)
	166	for pagenum in xrange(self.document.get_n_pages()):
	167	page = self.document.get_page(pagenum)
	168	context.translate(0, 0)
	169	page.render(context) # render the page on context
	170	context.show_page() # draw context on surface
	171	surface.finish()
	172	return self._remove_meta()
	173
	174	def _remove_meta(self):
	175	'''
	176	Remove superficial/external metadata
	177	from a PDF file, using exiftool,
	178	of pdfrw if exiftool is not installed
	179	'''
	180	processed = False
	181	try:# try with pdfrw
	182	import pdfrw
	183	#For now, poppler cannot write meta, so we must use pdfrw
	184	logging.debug('Removing %s\'s superficial metadata' % self.filename)
	185	trailer = pdfrw.PdfReader(self.output)
	186	trailer.Info.Producer = trailer.Author = trailer.Info.Creator = None
	187	writer = pdfrw.PdfWriter()
	188	writer.trailer = trailer
	189	writer.write(self.output)
	190	self.do_backup()
	191	processed = True
	192	except:
	193	pass
	194
	195	try: # try with exiftool
	196	subprocess.Popen('exiftool', stdout=open('/dev/null'))
	197	import exiftool
	198	# Note: '-All=' must be followed by a known exiftool option.
	199	if self.backup:
	200	process = subprocess.Popen(['exiftool', '-m', '-All=',
	201	'-out', self.output, self.filename], stdout=open('/dev/null'))
	202	process.wait()
	203	else:
	204	# Note: '-All=' must be followed by a known exiftool option.
	205	process = subprocess.Popen(
	206	['exiftool', '-All=', '-overwrite_original', self.filename],
	207	stdout=open('/dev/null'))
	208	process.wait()
	209	processed = True
	210	except:
	211	pass
	212
	213	if processed is False:
	214	logging.error('Please install either pdfrw, or exiftool to\
	215	fully handle PDF files')
	216	return processed
	217
	218	def get_meta(self):
	219	'''
	220	Return a dict with all the meta of the file
	221	'''
	222	metadata = {}
	223	for key in self.meta_list:
	224	if self.document.get_property(key) is not None and \
	225	self.document.get_property(key) != '':
	226	metadata[key] = self.document.get_property(key)
	227	return metadata
	228
	229
	230	class OpenXmlStripper(archive.GenericArchiveStripper):
	231	'''
	232	Represent an office openxml document, which is like
	233	an opendocument format, with some tricky stuff added.
	234	It contains mostly xml, but can have media blobs, crap, ...
	235	(I don't like this format.)
	236	'''
	237	def _remove_all(self, method):
	238	'''
	239	FIXME ?
	240	There is a patch implementing the Zipfile.remove()
	241	method here : http://bugs.python.org/issue6818
	242	'''
	243	zipin = zipfile.ZipFile(self.filename, 'r')
	244	zipout = zipfile.ZipFile(self.output, 'w',
	245	allowZip64=True)
	246	for item in zipin.namelist():
	247	name = os.path.join(self.tempdir, item)
	248	_, ext = os.path.splitext(name)
	249	if item.startswith('docProps/'): # metadatas
	250	pass
	251	elif ext in parser.NOMETA or item == '.rels':
	252	#keep parser.NOMETA files, and the file named ".rels"
	253	zipin.extract(item, self.tempdir)
	254	zipout.write(name, item)
	255	else:
	256	zipin.extract(item, self.tempdir)
	257	if os.path.isfile(name): # don't care about folders
	258	try:
	259	cfile = mat.create_class_file(name, False,
	260	self.add2archive)
	261	if method == 'normal':
	262	cfile.remove_all()
	263	else:
	264	cfile.remove_all_strict()
	265	logging.debug('Processing %s from %s' % (item,
	266	self.filename))
	267	zipout.write(name, item)
	268	except:
	269	logging.info('%s\' fileformat is not supported' % item)
	270	if self.add2archive:
	271	zipout.write(name, item)
	272	zipout.comment = ''
	273	logging.info('%s treated' % self.filename)
	274	zipin.close()
	275	zipout.close()
	276	self.do_backup()
	277	return True
	278
	279	def is_clean(self):
	280	'''
	281	Check if the file is clean from harmful metadatas
	282	'''
	283	zipin = zipfile.ZipFile(self.filename, 'r')
	284	for item in zipin.namelist():
	285	if item.startswith('docProps/'):
	286	return False
	287	zipin.close()
	288	czf = archive.ZipStripper(self.filename, self.parser,
	289	'application/zip', self.backup, self.add2archive)
	290	if not czf.is_clean():
	291	return False
	292	else:
	293	return True
	294
	295	def get_meta(self):
	296	'''
	297	Return a dict with all the meta of the file
	298	'''
	299	zipin = zipfile.ZipFile(self.filename, 'r')
	300	metadata = {}
	301	for item in zipin.namelist():
	302	if item.startswith('docProps/'):
	303	metadata[item] = 'harmful content'
	304	zipin.close()
	305	return metadata