1 files changed, 150 insertions, 0 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
new file mode 100644
index 0000000..749fc7d
--- /dev/null
+++ b/libmat2/office.py
@@ -0,0 +1,150 @@
+import os
+import re
+import shutil
+import tempfile
+import datetime
+import zipfile
+from . import abstract, parser_factory
class ArchiveBasedAbstractParser(abstract.AbstractParser):
    """Shared helpers for parsers whose file format is a zip archive."""

    def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
        """Overwrite the identifying header fields of a zip member.

        The compression type, creating system, comment and timestamp are
        set to fixed values. `zipinfo` is modified in place and returned.
        """
        zipinfo.compress_type = zipfile.ZIP_DEFLATED
        zipinfo.create_system = 3  # Linux
        zipinfo.comment = b''
        zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
        return zipinfo

    def _get_zipinfo_meta(self, zipinfo: zipfile.ZipInfo) -> dict:
        """Return the header fields of `zipinfo` that differ from the
        values written by `_clean_zipinfo`.

        Possible keys are 'create_system', 'comment' and 'date_time'; a
        key is only present when the corresponding field is not "clean".
        """
        metadata = {}

        # A create_system of 3 is what `_clean_zipinfo` writes, so it is
        # not reported.
        # NOTE(review): the ZIP APPNOTE lists 0 as MS-DOS/FAT and 2 as
        # OpenVMS; confirm that 2 is really the value meant for 'Windows'.
        if zipinfo.create_system == 2:
            metadata['create_system'] = 'Windows'
        elif zipinfo.create_system != 3:
            metadata['create_system'] = 'Weird'

        if zipinfo.comment:
            metadata['comment'] = zipinfo.comment

        if zipinfo.date_time != (1980, 1, 1, 0, 0, 0):
            metadata['date_time'] = datetime.datetime(*zipinfo.date_time)

        return metadata

    def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str,
                             zin: zipfile.ZipFile, zout: zipfile.ZipFile):
        """Clean one archive member and append the result to `zout`.

        The member is extracted from `zin` into `temp_folder`, handed to
        the parser returned by `parser_factory.get_parser`, and the
        parser's output file is written to `zout` under the member's
        original name with a cleaned header. When no parser is available
        a message is printed and the member is not written to `zout`.
        """
        # `extract` sanitises the member name (absolute paths, `..`
        # components, ...) and returns the path it really wrote to, so use
        # that instead of joining the raw name onto the folder ourselves.
        extracted_path = zin.extract(member=item, path=temp_folder)
        tmp_parser, mtype = parser_factory.get_parser(extracted_path)
        if not tmp_parser:
            print("%s's format (%s) isn't supported" % (item.filename, mtype))
            return
        tmp_parser.remove_all()

        # Start from a fresh ZipInfo so that nothing from the original
        # header is carried over into the output archive.
        clean_zinfo = self._clean_zipinfo(zipfile.ZipInfo(item.filename))
        with open(tmp_parser.output_filename, 'rb') as f:
            zout.writestr(clean_zinfo, f.read())
class MSOfficeParser(ArchiveBasedAbstractParser):
    """Parser for the Office Open XML mimetypes listed in `mimetypes`."""

    mimetypes = {
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    }
    # Members copied to the output with a cleaned header, without being
    # handed to a sub-parser.
    files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}

    def get_meta(self):
        """Collect the metadata found in the archive at `self.filename`.

        The `docProps/*.xml` members are scanned with a regular expression
        (yes, parsing xml with regexp ain't pretty, be my guest and fix it
        if you want), and the zip header metadata of every member is
        merged in.

        Returns a dict mapping metadata names to their values.
        """
        metadata = {}
        # The context manager closes the archive even if a member cannot
        # be read or decoded.
        with zipfile.ZipFile(self.filename) as zipin:
            for item in zipin.infolist():
                if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):
                    content = zipin.read(item).decode('utf-8')
                    for (key, value) in re.findall(r"<(.+)>(.+)</\1>", content, re.I):
                        metadata[key] = value
                    # NOTE(review): this tests the metadata accumulated so
                    # far, not only this member's, and the key is the
                    # ZipInfo object rather than its filename; kept as-is
                    # to preserve the returned dict.
                    if not metadata:  # better safe than sorry
                        metadata[item] = 'harmful content'
                metadata.update(self._get_zipinfo_meta(item))
        return metadata

    def remove_all(self):
        """Write a cleaned copy of `self.filename` to `self.output_filename`.

        Directory entries and `docProps/` members (except `.rels` ones) are
        dropped, members listed in `files_to_keep` are copied with a
        cleaned header, and every other member goes through
        `_clean_internal_file`.

        Returns True.
        """
        # Context managers make sure both archives are closed and the
        # scratch directory is removed even when cleaning a member raises.
        with zipfile.ZipFile(self.filename, 'r') as zin, \
                zipfile.ZipFile(self.output_filename, 'w') as zout, \
                tempfile.TemporaryDirectory() as temp_folder:
            for item in zin.infolist():
                if item.filename.endswith('/'):
                    continue  # `is_dir` is added in Python3.6
                if (item.filename.startswith('docProps/')
                        and not item.filename.endswith('.rels')):
                    continue  # don't keep metadata files
                if item.filename in self.files_to_keep:
                    # Read before cleaning: `_clean_zipinfo` changes
                    # `compress_type` in place, and reading afterwards
                    # would decode a stored member as if it were deflated.
                    data = zin.read(item)
                    zout.writestr(self._clean_zipinfo(item), data)
                    continue
                self._clean_internal_file(item, temp_folder, zin, zout)
        return True
class LibreOfficeParser(ArchiveBasedAbstractParser):
    """Parser for the OpenDocument mimetypes listed in `mimetypes`."""

    mimetypes = {
        'application/vnd.oasis.opendocument.text',
        'application/vnd.oasis.opendocument.spreadsheet',
        'application/vnd.oasis.opendocument.presentation',
        'application/vnd.oasis.opendocument.graphics',
        'application/vnd.oasis.opendocument.chart',
        'application/vnd.oasis.opendocument.formula',
        'application/vnd.oasis.opendocument.image',
    }

    def get_meta(self):
        """Collect the metadata found in the archive at `self.filename`.

        The `meta.xml` member is scanned with a regular expression (yes,
        parsing xml with regexp ain't pretty, be my guest and fix it if
        you want) for elements whose tag starts with `meta`, `dc` or `cp`,
        and the zip header metadata of every member is merged in.

        Returns a dict mapping metadata names to their values.
        """
        metadata = {}
        # The context manager closes the archive even if a member cannot
        # be read or decoded.
        with zipfile.ZipFile(self.filename) as zipin:
            for item in zipin.infolist():
                if item.filename == 'meta.xml':
                    content = zipin.read(item).decode('utf-8')
                    for (key, value) in re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I):
                        metadata[key] = value
                    # NOTE(review): this tests the metadata accumulated so
                    # far, not only this member's, and the key is the
                    # ZipInfo object rather than its filename; kept as-is
                    # to preserve the returned dict.
                    if not metadata:  # better safe than sorry
                        metadata[item] = 'harmful content'
                metadata.update(self._get_zipinfo_meta(item))
        return metadata

    def remove_all(self):
        """Write a cleaned copy of `self.filename` to `self.output_filename`.

        Directory entries and `meta.xml` are dropped; every other member
        goes through `_clean_internal_file`.

        Returns True.
        """
        # Context managers make sure both archives are closed and the
        # scratch directory is removed even when cleaning a member raises.
        with zipfile.ZipFile(self.filename, 'r') as zin, \
                zipfile.ZipFile(self.output_filename, 'w') as zout, \
                tempfile.TemporaryDirectory() as temp_folder:
            for item in zin.infolist():
                if item.filename.endswith('/'):
                    continue  # `is_dir` is added in Python3.6
                if item.filename == 'meta.xml':
                    continue  # don't keep metadata files
                self._clean_internal_file(item, temp_folder, zin, zout)
        return True

diff --git a/libmat2/office.py b/libmat2/office.py new file mode 100644 index 0000000..749fc7d --- /dev/null +++ b/libmat2/office.py
@@ -0,0 +1,150 @@
	1	import os
	2	import re
	3	import shutil
	4	import tempfile
	5	import datetime
	6	import zipfile
	7
	8	from . import abstract, parser_factory
	9
	10
	11	class ArchiveBasedAbstractParser(abstract.AbstractParser):
	12	def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
	13	zipinfo.compress_type = zipfile.ZIP_DEFLATED
	14	zipinfo.create_system = 3 # Linux
	15	zipinfo.comment = b''
	16	zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
	17	return zipinfo
	18
	19	def _get_zipinfo_meta(self, zipinfo: zipfile.ZipInfo) -> dict:
	20	metadata = {}
	21	if zipinfo.create_system == 3:
	22	#metadata['create_system'] = 'Linux'
	23	pass
	24	elif zipinfo.create_system == 2:
	25	metadata['create_system'] = 'Windows'
	26	else:
	27	metadata['create_system'] = 'Weird'
	28
	29	if zipinfo.comment:
	30	metadata['comment'] = zipinfo.comment
	31
	32	if zipinfo.date_time != (1980, 1, 1, 0, 0, 0):
	33	metadata['date_time'] = datetime.datetime(*zipinfo.date_time)
	34
	35	return metadata
	36
	37
	38	def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str,
	39	zin: zipfile.ZipFile, zout: zipfile.ZipFile):
	40	zin.extract(member=item, path=temp_folder)
	41	tmp_parser, mtype = parser_factory.get_parser(os.path.join(temp_folder, item.filename))
	42	if not tmp_parser:
	43	print("%s's format (%s) isn't supported" % (item.filename, mtype))
	44	return
	45	tmp_parser.remove_all()
	46	zinfo = zipfile.ZipInfo(item.filename)
	47	clean_zinfo = self._clean_zipinfo(zinfo)
	48	with open(tmp_parser.output_filename, 'rb') as f:
	49	zout.writestr(clean_zinfo, f.read())
	50
	51
	52	class MSOfficeParser(ArchiveBasedAbstractParser):
	53	mimetypes = {
	54	'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
	55	'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
	56	'application/vnd.openxmlformats-officedocument.presentationml.presentation'
	57	}
	58	files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}
	59
	60	def get_meta(self):
	61	"""
	62	Yes, I know that parsing xml with regexp ain't pretty,
	63	be my guest and fix it if you want.
	64	"""
	65	metadata = {}
	66	zipin = zipfile.ZipFile(self.filename)
	67	for item in zipin.infolist():
	68	if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):
	69	content = zipin.read(item).decode('utf-8')
	70	for (key, value) in re.findall(r"<(.+)>(.+)</\1>", content, re.I):
	71	metadata[key] = value
	72	if not metadata: # better safe than sorry
	73	metadata[item] = 'harmful content'
	74
	75	metadata = {**metadata, **self._get_zipinfo_meta(item)}
	76	zipin.close()
	77	return metadata
	78
	79
	80	def remove_all(self):
	81	zin = zipfile.ZipFile(self.filename, 'r')
	82	zout = zipfile.ZipFile(self.output_filename, 'w')
	83	temp_folder = tempfile.mkdtemp()
	84
	85	for item in zin.infolist():
	86	if item.filename[-1] == '/':
	87	continue # `is_dir` is added in Python3.6
	88	elif item.filename.startswith('docProps/'):
	89	if not item.filename.endswith('.rels'):
	90	continue # don't keep metadata files
	91	if item.filename in self.files_to_keep:
	92	item = self._clean_zipinfo(item)
	93	zout.writestr(item, zin.read(item))
	94	continue
	95
	96	self._clean_internal_file(item, temp_folder, zin, zout)
	97
	98	shutil.rmtree(temp_folder)
	99	zout.close()
	100	zin.close()
	101	return True
	102
	103
	104
	105	class LibreOfficeParser(ArchiveBasedAbstractParser):
	106	mimetypes = {
	107	'application/vnd.oasis.opendocument.text',
	108	'application/vnd.oasis.opendocument.spreadsheet',
	109	'application/vnd.oasis.opendocument.presentation',
	110	'application/vnd.oasis.opendocument.graphics',
	111	'application/vnd.oasis.opendocument.chart',
	112	'application/vnd.oasis.opendocument.formula',
	113	'application/vnd.oasis.opendocument.image',
	114	}
	115
	116	def get_meta(self):
	117	"""
	118	Yes, I know that parsing xml with regexp ain't pretty,
	119	be my guest and fix it if you want.
	120	"""
	121	metadata = {}
	122	zipin = zipfile.ZipFile(self.filename)
	123	for item in zipin.infolist():
	124	if item.filename == 'meta.xml':
	125	content = zipin.read(item).decode('utf-8')
	126	for (key, value) in re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I):
	127	metadata[key] = value
	128	if not metadata: # better safe than sorry
	129	metadata[item] = 'harmful content'
	130	metadata = {**metadata, **self._get_zipinfo_meta(item)}
	131	zipin.close()
	132	return metadata
	133
	134	def remove_all(self):
	135	zin = zipfile.ZipFile(self.filename, 'r')
	136	zout = zipfile.ZipFile(self.output_filename, 'w')
	137	temp_folder = tempfile.mkdtemp()
	138
	139	for item in zin.infolist():
	140	if item.filename[-1] == '/':
	141	continue # `is_dir` is added in Python3.6
	142	elif item.filename == 'meta.xml':
	143	continue # don't keep metadata files
	144
	145	self._clean_internal_file(item, temp_folder, zin, zout)
	146
	147	shutil.rmtree(temp_folder)
	148	zout.close()
	149	zin.close()
	150	return True