2 files changed, 63 insertions, 69 deletions
diff --git a/libmat2/harmless.py b/libmat2/harmless.py
index 54737a8..2878571 100644
--- a/libmat2/harmless.py
+++ b/libmat2/harmless.py
@@ -4,7 +4,7 @@ from . import abstract
 class HarmlessParser(abstract.AbstractParser):
    """ This is the parser for filetypes that do not contain metadata. """
-    mimetypes = {'application/xml', 'text/plain', 'text/xml', 'application/rdf+xml'}
+    mimetypes = {'text/plain', }
    def __init__(self, filename: str) -> None:
        super().__init__(filename)
diff --git a/libmat2/office.py b/libmat2/office.py
index 0791b07..fd3cdf4 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -4,17 +4,16 @@ import shutil
 import tempfile
 import datetime
 import zipfile
-from typing import Dict, Set
+from typing import Dict, Set, Pattern
 from . import abstract, parser_factory
-assert Set   # make pyflakes happy
 class ArchiveBasedAbstractParser(abstract.AbstractParser):
-    whitelist = set()  # type: Set[str]
+    files_to_keep : Set[str] = set()
+    files_to_omit : Set[Pattern] = set()
    def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
-        zipinfo.compress_type = zipfile.ZIP_DEFLATED
        zipinfo.create_system = 3  # Linux
        zipinfo.comment = b''
        zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
@@ -34,33 +33,51 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
            metadata['comment'] = zipinfo.comment  # type: ignore
        if zipinfo.date_time != (1980, 1, 1, 0, 0, 0):
-            metadata['date_time'] =str(datetime.datetime(*zipinfo.date_time))
+            metadata['date_time'] = str(datetime.datetime(*zipinfo.date_time))
        return metadata
    def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str,
                             zin: zipfile.ZipFile, zout: zipfile.ZipFile) -> bool:
-        output = ''
        zin.extract(member=item, path=temp_folder)
-        if item.filename not in self.whitelist:
+        full_path = os.path.join(temp_folder, item.filename)
-            full_path = os.path.join(temp_folder, item.filename)
+        tmp_parser, mtype = parser_factory.get_parser(full_path)  # type: ignore
-            tmp_parser, mtype = parser_factory.get_parser(full_path)  # type: ignore
+        if not tmp_parser:
-            if not tmp_parser:
+            zout.close()
-                zout.close()
+            os.remove(self.output_filename)
-                os.remove(self.output_filename)
+            print("%s's format (%s) isn't supported" % (item.filename, mtype))
-                print("%s's format (%s) isn't supported" % (item.filename, mtype))
+            return False
-                return False
+        tmp_parser.remove_all()
-            tmp_parser.remove_all()
-            output = tmp_parser.output_filename
-        else:
-            output = os.path.join(temp_folder, item.filename)
        zinfo = zipfile.ZipInfo(item.filename)  # type: ignore
        clean_zinfo = self._clean_zipinfo(zinfo)
-        with open(output, 'rb') as f:
+        with open(tmp_parser.output_filename, 'rb') as f:
            zout.writestr(clean_zinfo, f.read())
        return True
+    def remove_all(self) -> bool:
+        """ Write a cleaned copy of the archive to `self.output_filename`.
+
+        Directory entries and members matching one of `files_to_omit` are
+        dropped, members listed in `files_to_keep` are copied as they are
+        with only their zip header cleaned, and every other member goes
+        through `_clean_internal_file`.
+
+        Returns False as soon as one member can't be cleaned, True otherwise.
+        The temporary folder is removed and both archives are closed on
+        every exit path.
+        """
+        with zipfile.ZipFile(self.filename, 'r') as zin, \
+                zipfile.ZipFile(self.output_filename, 'w') as zout:
+            temp_folder = tempfile.mkdtemp()
+            try:
+                for item in zin.infolist():
+                    name = item.filename
+                    if name.endswith('/'):  # `is_dir` is added in Python3.6
+                        continue  # don't keep empty folders
+                    if name in self.files_to_keep:
+                        # Read the member before touching its header, so the
+                        # read never depends on what `_clean_zipinfo` rewrites.
+                        content = zin.read(item)
+                        zout.writestr(self._clean_zipinfo(item), content)
+                    elif any(regex.search(name) for regex in self.files_to_omit):
+                        continue
+                    elif not self._clean_internal_file(item, temp_folder,
+                                                       zin, zout):
+                        # `_clean_internal_file` closes `zout` itself when it
+                        # fails; closing it again on the way out is harmless.
+                        return False
+            finally:
+                # Also runs on the early `return False` above and on errors.
+                shutil.rmtree(temp_folder)
+        return True
 class MSOfficeParser(ArchiveBasedAbstractParser):
    mimetypes = {
@@ -68,9 +85,20 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    }
-    files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}
+    files_to_keep = {
+            '[Content_Types].xml',
+            '_rels/.rels',
+            'word/_rels/document.xml.rels',
+            'word/document.xml',
+            'word/fontTable.xml',
+            'word/settings.xml',
+            'word/styles.xml',
+    }
+    # Patterns (searched against member names) of the files to drop from the
+    # cleaned archive. Raw strings keep regex escapes out of Python's own
+    # string-escape processing.
+    files_to_omit = set(map(re.compile, {  # type: ignore
+            r'^docProps/',
+    }))
-    def get_meta(self):
+    def get_meta(self) -> Dict[str, str]:
        """
        Yes, I know that parsing xml with regexp ain't pretty,
        be my guest and fix it if you want.
@@ -88,38 +116,12 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
                    pass
                if not metadata:  # better safe than sorry
                    metadata[item] = 'harmful content'
            for key, value in self._get_zipinfo_meta(item).items():
                metadata[key] = value
        zipin.close()
        return metadata
-    def remove_all(self):
-        zin = zipfile.ZipFile(self.filename, 'r')
-        zout = zipfile.ZipFile(self.output_filename, 'w')
-        temp_folder = tempfile.mkdtemp()
-        for item in zin.infolist():
-            if item.filename[-1] == '/':
-                continue  # `is_dir` is added in Python3.6
-            elif item.filename.startswith('docProps/'):
-                continue  # don't keep metadata files
-            if item.filename in self.files_to_keep:
-                item = self._clean_zipinfo(item)
-                zout.writestr(item, zin.read(item))
-                continue
-            if self._clean_internal_file(item, temp_folder, zin, zout) is False:
-                return False
-        shutil.rmtree(temp_folder)
-        zout.close()
-        zin.close()
-        return True
 class LibreOfficeParser(ArchiveBasedAbstractParser):
    mimetypes = {
        'application/vnd.oasis.opendocument.text',
@@ -130,10 +132,20 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
        'application/vnd.oasis.opendocument.formula',
        'application/vnd.oasis.opendocument.image',
    }
-    whitelist = {'mimetype', 'manifest.rdf'}
+    files_to_keep = {
+            'META-INF/manifest.xml',
+            'content.xml',
+            'manifest.rdf',
+            'mimetype',
+            'settings.xml',
+            'styles.xml',
+    }
+    # Patterns (searched against member names) of the files to drop from the
+    # cleaned archive. Raw strings, so that `\.` reaches the regex engine
+    # untouched instead of being an invalid string escape.
+    files_to_omit = set(map(re.compile, {  # type: ignore
+            r'^meta\.xml$',
+            r'^Configurations2/',
+    }))
-    def get_meta(self):
+    def get_meta(self) -> Dict[str, str]:
        """
        Yes, I know that parsing xml with regexp ain't pretty,
        be my guest and fix it if you want.
@@ -156,21 +168,3 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
        zipin.close()
        return metadata
-    def remove_all(self):
-        zin = zipfile.ZipFile(self.filename, 'r')
-        zout = zipfile.ZipFile(self.output_filename, 'w')
-        temp_folder = tempfile.mkdtemp()
-        for item in zin.infolist():
-            if item.filename[-1] == '/':
-                continue  # `is_dir` is added in Python3.6
-            elif item.filename == 'meta.xml':
-                continue  # don't keep metadata files
-            if self._clean_internal_file(item, temp_folder, zin, zout) is False:
-                return False
-        shutil.rmtree(temp_folder)
-        zout.close()
-        zin.close()
-        return True

diff --git a/libmat2/harmless.py b/libmat2/harmless.py index 54737a8..2878571 100644 --- a/libmat2/harmless.py +++ b/libmat2/harmless.py
@@ -4,7 +4,7 @@ from . import abstract
4		4
5	class HarmlessParser(abstract.AbstractParser):	5	class HarmlessParser(abstract.AbstractParser):
6	""" This is the parser for filetypes that do not contain metadata. """	6	""" This is the parser for filetypes that do not contain metadata. """
7	mimetypes = {'application/xml', 'text/plain', 'text/xml', 'application/rdf+xml'}	7	mimetypes = {'text/plain', }
8		8
9	def __init__(self, filename: str) -> None:	9	def __init__(self, filename: str) -> None:
10	super().__init__(filename)	10	super().__init__(filename)


diff --git a/libmat2/office.py b/libmat2/office.py index 0791b07..fd3cdf4 100644 --- a/libmat2/office.py +++ b/libmat2/office.py
@@ -4,17 +4,16 @@ import shutil
4	import tempfile	4	import tempfile
5	import datetime	5	import datetime
6	import zipfile	6	import zipfile
7	from typing import Dict, Set	7	from typing import Dict, Set, Pattern
8		8
9	from . import abstract, parser_factory	9	from . import abstract, parser_factory
10		10
11	assert Set # make pyflakes happy
12		11
13	class ArchiveBasedAbstractParser(abstract.AbstractParser):	12	class ArchiveBasedAbstractParser(abstract.AbstractParser):
14	whitelist = set() # type: Set[str]	13	files_to_keep : Set[str] = set()
		14	files_to_omit : Set[Pattern] = set()
15		15
16	def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:	16	def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
17	zipinfo.compress_type = zipfile.ZIP_DEFLATED
18	zipinfo.create_system = 3 # Linux	17	zipinfo.create_system = 3 # Linux
19	zipinfo.comment = b''	18	zipinfo.comment = b''
20	zipinfo.date_time = (1980, 1, 1, 0, 0, 0)	19	zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
@@ -34,33 +33,51 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
34	metadata['comment'] = zipinfo.comment # type: ignore	33	metadata['comment'] = zipinfo.comment # type: ignore
35		34
36	if zipinfo.date_time != (1980, 1, 1, 0, 0, 0):	35	if zipinfo.date_time != (1980, 1, 1, 0, 0, 0):
37	metadata['date_time'] =str(datetime.datetime(*zipinfo.date_time))	36	metadata['date_time'] = str(datetime.datetime(*zipinfo.date_time))
38		37
39	return metadata	38	return metadata
40		39
41		40
42	def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str,	41	def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str,
43	zin: zipfile.ZipFile, zout: zipfile.ZipFile) -> bool:	42	zin: zipfile.ZipFile, zout: zipfile.ZipFile) -> bool:
44	output = ''
45	zin.extract(member=item, path=temp_folder)	43	zin.extract(member=item, path=temp_folder)
46	if item.filename not in self.whitelist:	44	full_path = os.path.join(temp_folder, item.filename)
47	full_path = os.path.join(temp_folder, item.filename)	45	tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore
48	tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore	46	if not tmp_parser:
49	if not tmp_parser:	47	zout.close()
50	zout.close()	48	os.remove(self.output_filename)
51	os.remove(self.output_filename)	49	print("%s's format (%s) isn't supported" % (item.filename, mtype))
52	print("%s's format (%s) isn't supported" % (item.filename, mtype))	50	return False
53	return False	51	tmp_parser.remove_all()
54	tmp_parser.remove_all()	52
55	output = tmp_parser.output_filename
56	else:
57	output = os.path.join(temp_folder, item.filename)
58	zinfo = zipfile.ZipInfo(item.filename) # type: ignore	53	zinfo = zipfile.ZipInfo(item.filename) # type: ignore
59	clean_zinfo = self._clean_zipinfo(zinfo)	54	clean_zinfo = self._clean_zipinfo(zinfo)
60	with open(output, 'rb') as f:	55	with open(tmp_parser.output_filename, 'rb') as f:
61	zout.writestr(clean_zinfo, f.read())	56	zout.writestr(clean_zinfo, f.read())
62	return True	57	return True
63		58
		59	def remove_all(self) -> bool:
		60	zin = zipfile.ZipFile(self.filename, 'r')
		61	zout = zipfile.ZipFile(self.output_filename, 'w')
		62	temp_folder = tempfile.mkdtemp()
		63
		64	for item in zin.infolist():
		65	if item.filename[-1] == '/': # `is_dir` is added in Python3.6
		66	continue # don't keep empty folders
		67	elif item.filename in self.files_to_keep:
		68	item = self._clean_zipinfo(item)
		69	zout.writestr(item, zin.read(item))
		70	continue
		71	elif any(map(lambda r: r.search(item.filename), self.files_to_omit)):
		72	continue
		73	elif not self._clean_internal_file(item, temp_folder, zin, zout):
		74	return False
		75
		76	shutil.rmtree(temp_folder)
		77	zout.close()
		78	zin.close()
		79	return True
		80
64		81
65	class MSOfficeParser(ArchiveBasedAbstractParser):	82	class MSOfficeParser(ArchiveBasedAbstractParser):
66	mimetypes = {	83	mimetypes = {
@@ -68,9 +85,20 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
68	'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',	85	'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
69	'application/vnd.openxmlformats-officedocument.presentationml.presentation'	86	'application/vnd.openxmlformats-officedocument.presentationml.presentation'
70	}	87	}
71	files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}	88	files_to_keep = {
		89	'[Content_Types].xml',
		90	'_rels/.rels',
		91	'word/_rels/document.xml.rels',
		92	'word/document.xml',
		93	'word/fontTable.xml',
		94	'word/settings.xml',
		95	'word/styles.xml',
		96	}
		97	files_to_omit = set(map(re.compile, { # type: ignore
		98	'^docProps/',
		99	}))
72		100
73	def get_meta(self):	101	def get_meta(self) -> Dict[str, str]:
74	"""	102	"""
75	Yes, I know that parsing xml with regexp ain't pretty,	103	Yes, I know that parsing xml with regexp ain't pretty,
76	be my guest and fix it if you want.	104	be my guest and fix it if you want.
@@ -88,38 +116,12 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
88	pass	116	pass
89	if not metadata: # better safe than sorry	117	if not metadata: # better safe than sorry
90	metadata[item] = 'harmful content'	118	metadata[item] = 'harmful content'
91
92	for key, value in self._get_zipinfo_meta(item).items():	119	for key, value in self._get_zipinfo_meta(item).items():
93	metadata[key] = value	120	metadata[key] = value
94	zipin.close()	121	zipin.close()
95	return metadata	122	return metadata
96		123
97		124
98	def remove_all(self):
99	zin = zipfile.ZipFile(self.filename, 'r')
100	zout = zipfile.ZipFile(self.output_filename, 'w')
101	temp_folder = tempfile.mkdtemp()
102
103	for item in zin.infolist():
104	if item.filename[-1] == '/':
105	continue # `is_dir` is added in Python3.6
106	elif item.filename.startswith('docProps/'):
107	continue # don't keep metadata files
108	if item.filename in self.files_to_keep:
109	item = self._clean_zipinfo(item)
110	zout.writestr(item, zin.read(item))
111	continue
112
113	if self._clean_internal_file(item, temp_folder, zin, zout) is False:
114	return False
115
116	shutil.rmtree(temp_folder)
117	zout.close()
118	zin.close()
119	return True
120
121
122
123	class LibreOfficeParser(ArchiveBasedAbstractParser):	125	class LibreOfficeParser(ArchiveBasedAbstractParser):
124	mimetypes = {	126	mimetypes = {
125	'application/vnd.oasis.opendocument.text',	127	'application/vnd.oasis.opendocument.text',
@@ -130,10 +132,20 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
130	'application/vnd.oasis.opendocument.formula',	132	'application/vnd.oasis.opendocument.formula',
131	'application/vnd.oasis.opendocument.image',	133	'application/vnd.oasis.opendocument.image',
132	}	134	}
133	whitelist = {'mimetype', 'manifest.rdf'}	135	files_to_keep = {
134		136	'META-INF/manifest.xml',
		137	'content.xml',
		138	'manifest.rdf',
		139	'mimetype',
		140	'settings.xml',
		141	'styles.xml',
		142	}
		143	files_to_omit = set(map(re.compile, { # type: ignore
		144	'^meta\.xml$',
		145	'^Configurations2/',
		146	}))
135		147
136	def get_meta(self):	148	def get_meta(self) -> Dict[str, str]:
137	"""	149	"""
138	Yes, I know that parsing xml with regexp ain't pretty,	150	Yes, I know that parsing xml with regexp ain't pretty,
139	be my guest and fix it if you want.	151	be my guest and fix it if you want.
@@ -156,21 +168,3 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
156	zipin.close()	168	zipin.close()
157	return metadata	169	return metadata
158		170
159	def remove_all(self):
160	zin = zipfile.ZipFile(self.filename, 'r')
161	zout = zipfile.ZipFile(self.output_filename, 'w')
162	temp_folder = tempfile.mkdtemp()
163
164	for item in zin.infolist():
165	if item.filename[-1] == '/':
166	continue # `is_dir` is added in Python3.6
167	elif item.filename == 'meta.xml':
168	continue # don't keep metadata files
169
170	if self._clean_internal_file(item, temp_folder, zin, zout) is False:
171	return False
172
173	shutil.rmtree(temp_folder)
174	zout.close()
175	zin.close()
176	return True