summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitlab-ci.yml2
-rw-r--r--README.md2
-rw-r--r--libmat2/archive.py9
-rw-r--r--libmat2/office.py86
-rw-r--r--tests/data/narrated_powerpoint_presentation.pptxbin0 -> 4383613 bytes
-rw-r--r--tests/test_libmat2.py10
6 files changed, 103 insertions, 6 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 4ad5c98..38cbe20 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -16,7 +16,7 @@ linting:bandit:
16 script: # TODO: remove B405 and B314 16 script: # TODO: remove B405 and B314
17 - bandit ./mat2 --format txt --skip B101 17 - bandit ./mat2 --format txt --skip B101
18 - bandit -r ./nautilus/ --format txt --skip B101 18 - bandit -r ./nautilus/ --format txt --skip B101
19 - bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314,B108 19 - bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314,B108,B311
20 20
21linting:codespell: 21linting:codespell:
22 image: $CONTAINER_REGISTRY:linting 22 image: $CONTAINER_REGISTRY:linting
diff --git a/README.md b/README.md
index 5f902fe..c81daff 100644
--- a/README.md
+++ b/README.md
@@ -152,6 +152,8 @@ Copyright 2016 Marie-Rose for mat2's logo
152The `tests/data/dirty_with_nsid.docx` file is licensed under GPLv3, 152The `tests/data/dirty_with_nsid.docx` file is licensed under GPLv3,
153and was borrowed from the Calibre project: https://calibre-ebook.com/downloads/demos/demo.docx 153and was borrowed from the Calibre project: https://calibre-ebook.com/downloads/demos/demo.docx
154 154
155The `narrated_powerpoint_presentation.pptx` file is in the public domain.
156
155# Thanks 157# Thanks
156 158
157mat2 wouldn't exist without: 159mat2 wouldn't exist without:
diff --git a/libmat2/archive.py b/libmat2/archive.py
index de80a35..f6db491 100644
--- a/libmat2/archive.py
+++ b/libmat2/archive.py
@@ -82,6 +82,13 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
82 # pylint: disable=unused-argument,no-self-use 82 # pylint: disable=unused-argument,no-self-use
83 return {} # pragma: no cover 83 return {} # pragma: no cover
84 84
85 def _final_checks(self) -> bool:
86 """ This method is invoked after the file has been cleaned,
87 allowing final verifications to be run.
88 """
89 # pylint: disable=unused-argument,no-self-use
90 return True
91
85 @staticmethod 92 @staticmethod
86 @abc.abstractmethod 93 @abc.abstractmethod
87 def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]: 94 def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
@@ -223,6 +230,8 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
223 if abort: 230 if abort:
224 os.remove(self.output_filename) 231 os.remove(self.output_filename)
225 return False 232 return False
233 if not self._final_checks():
234 return False # pragma: no cover
226 return True 235 return True
227 236
228 237
diff --git a/libmat2/office.py b/libmat2/office.py
index 7f70b72..d122fc8 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -1,3 +1,4 @@
1import random
1import uuid 2import uuid
2import logging 3import logging
3import os 4import os
@@ -75,6 +76,14 @@ class MSOfficeParser(ZipParser):
75 def __init__(self, filename): 76 def __init__(self, filename):
76 super().__init__(filename) 77 super().__init__(filename)
77 78
79 # MSOffice documents use various counters for cross-references;
80 # we collect them all to make sure that they really are counters,
81 # and not unique IDs used for fingerprinting.
82 self.__counters = {
83 'cNvPr': set(),
84 'rid': set(),
85 }
86
78 self.files_to_keep = set(map(re.compile, { # type: ignore 87 self.files_to_keep = set(map(re.compile, { # type: ignore
79 r'^\[Content_Types\]\.xml$', 88 r'^\[Content_Types\]\.xml$',
80 r'^_rels/\.rels$', 89 r'^_rels/\.rels$',
@@ -84,8 +93,14 @@ class MSOfficeParser(ZipParser):
84 r'^ppt/slideLayouts/_rels/slideLayout[0-9]+\.xml\.rels$', 93 r'^ppt/slideLayouts/_rels/slideLayout[0-9]+\.xml\.rels$',
85 r'^ppt/slideLayouts/slideLayout[0-9]+\.xml$', 94 r'^ppt/slideLayouts/slideLayout[0-9]+\.xml$',
86 r'^(?:word|ppt)/tableStyles\.xml$', 95 r'^(?:word|ppt)/tableStyles\.xml$',
96 r'^ppt/slides/_rels/slide[0-9]*\.xml\.rels$',
97 r'^ppt/slides/slide[0-9]*\.xml$',
87 # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx 98 # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
88 r'^(?:word|ppt)/stylesWithEffects\.xml$', 99 r'^(?:word|ppt)/stylesWithEffects\.xml$',
100 r'^ppt/presentation\.xml$',
101 # TODO: check if p:bgRef can be randomized
102 r'^ppt/slideMasters/slideMaster[0-9]+\.xml',
103 r'^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels',
89 })) 104 }))
90 self.files_to_omit = set(map(re.compile, { # type: ignore 105 self.files_to_omit = set(map(re.compile, { # type: ignore
91 r'^customXml/', 106 r'^customXml/',
@@ -95,6 +110,7 @@ class MSOfficeParser(ZipParser):
95 r'^(?:word|ppt)/theme', 110 r'^(?:word|ppt)/theme',
96 r'^(?:word|ppt)/people\.xml$', 111 r'^(?:word|ppt)/people\.xml$',
97 r'^(?:word|ppt)/numbering\.xml$', 112 r'^(?:word|ppt)/numbering\.xml$',
113 r'^(?:word|ppt)/tags/',
98 # View properties like view mode, last viewed slide etc 114 # View properties like view mode, last viewed slide etc
99 r'^(?:word|ppt)/viewProps\.xml$', 115 r'^(?:word|ppt)/viewProps\.xml$',
100 # Additional presentation-wide properties like printing properties, 116 # Additional presentation-wide properties like printing properties,
@@ -146,7 +162,7 @@ class MSOfficeParser(ZipParser):
146 """ 162 """
147 try: 163 try:
148 tree, namespace = _parse_xml(full_path) 164 tree, namespace = _parse_xml(full_path)
149 except ET.ParseError as e: 165 except ET.ParseError as e: # pragma: no cover
150 logging.error("Unable to parse %s: %s", full_path, e) 166 logging.error("Unable to parse %s: %s", full_path, e)
151 return False 167 return False
152 168
@@ -206,7 +222,7 @@ class MSOfficeParser(ZipParser):
206 def __remove_revisions(full_path: str) -> bool: 222 def __remove_revisions(full_path: str) -> bool:
207 try: 223 try:
208 tree, namespace = _parse_xml(full_path) 224 tree, namespace = _parse_xml(full_path)
209 except ET.ParseError as e: 225 except ET.ParseError as e: # pragma: no cover
210 logging.error("Unable to parse %s: %s", full_path, e) 226 logging.error("Unable to parse %s: %s", full_path, e)
211 return False 227 return False
212 228
@@ -272,14 +288,71 @@ class MSOfficeParser(ZipParser):
272 tree.write(full_path, xml_declaration=True) 288 tree.write(full_path, xml_declaration=True)
273 return True 289 return True
274 290
291 def _final_checks(self) -> bool:
292 for k, v in self.__counters.items():
293 if v and len(v) != max(v):
294 # TODO: make this an error and return False
295 # once the ability to correct the counters is implemented
296 logging.warning("%s contains invalid %s: %s", self.filename, k, v)
297 return True
298 return True
299
300 def __collect_counters(self, full_path: str):
301 with open(full_path, encoding='utf-8') as f:
302 content = f.read()
303 # "relationship Id"
304 for i in re.findall(r'(?:\s|r:)[iI][dD]="rId([0-9]+)"(?:\s|/)', content):
305 self.__counters['rid'].add(int(i))
306 # "common Non-Visual drawing Properties" id
307 for i in re.findall(r'<p:cNvPr id="([0-9]+)"', content):
308 self.__counters['cNvPr'].add(int(i))
309
310
311 @staticmethod
312 def __randomize_creationId(full_path: str) -> bool:
313 try:
314 tree, namespace = _parse_xml(full_path)
315 except ET.ParseError as e: # pragma: no cover
316 logging.error("Unable to parse %s: %s", full_path, e)
317 return False
318
319 if 'p14' not in namespace.keys():
320 return True # pragma: no cover
321
322 for item in tree.iterfind('.//p14:creationId', namespace):
323 item.set('val', '%s' % random.randint(0, 2**32))
324 tree.write(full_path, xml_declaration=True)
325 return True
326
327 @staticmethod
328 def __randomize_sldMasterId(full_path: str) -> bool:
329 try:
330 tree, namespace = _parse_xml(full_path)
331 except ET.ParseError as e: # pragma: no cover
332 logging.error("Unable to parse %s: %s", full_path, e)
333 return False
334
335 if 'p' not in namespace.keys():
336 return True # pragma: no cover
337
338 for item in tree.iterfind('.//p:sldMasterId', namespace):
339 item.set('id', '%s' % random.randint(0, 2**32))
340 tree.write(full_path, xml_declaration=True)
341 return True
342
275 def _specific_cleanup(self, full_path: str) -> bool: 343 def _specific_cleanup(self, full_path: str) -> bool:
276 # pylint: disable=too-many-return-statements 344 # pylint: disable=too-many-return-statements,too-many-branches
277 if os.stat(full_path).st_size == 0: # Don't process empty files 345 if os.stat(full_path).st_size == 0: # Don't process empty files
278 return True 346 return True
279 347
280 if not full_path.endswith('.xml'): 348 if not full_path.endswith('.xml'):
281 return True 349 return True
282 350
351 if self.__randomize_creationId(full_path) is False:
352 return False
353
354 self.__collect_counters(full_path)
355
283 if full_path.endswith('/[Content_Types].xml'): 356 if full_path.endswith('/[Content_Types].xml'):
284 # this file contains references to files that we might 357 # this file contains references to files that we might
285 # remove, and MS Office doesn't like dangling references 358 # remove, and MS Office doesn't like dangling references
@@ -288,7 +361,7 @@ class MSOfficeParser(ZipParser):
288 elif full_path.endswith('/word/document.xml'): 361 elif full_path.endswith('/word/document.xml'):
289 # this file contains the revisions 362 # this file contains the revisions
290 if self.__remove_revisions(full_path) is False: 363 if self.__remove_revisions(full_path) is False:
291 return False 364 return False # pragma: no cover
292 elif full_path.endswith('/docProps/app.xml'): 365 elif full_path.endswith('/docProps/app.xml'):
293 # This file must be present and valid, 366 # This file must be present and valid,
294 # so we're removing as much as we can. 367 # so we're removing as much as we can.
@@ -310,9 +383,12 @@ class MSOfficeParser(ZipParser):
310 f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>') 383 f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
311 uid = str(uuid.uuid4()).encode('utf-8') 384 uid = str(uuid.uuid4()).encode('utf-8')
312 f.write(b'<a:tblStyleLst def="{%s}" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"/>' % uid) 385 f.write(b'<a:tblStyleLst def="{%s}" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"/>' % uid)
386 elif full_path.endswith('ppt/presentation.xml'):
387 if self.__randomize_sldMasterId(full_path) is False:
388 return False # pragma: no cover
313 389
314 if self.__remove_rsid(full_path) is False: 390 if self.__remove_rsid(full_path) is False:
315 return False 391 return False # pragma: no cover
316 392
317 if self.__remove_nsid(full_path) is False: 393 if self.__remove_nsid(full_path) is False:
318 return False # pragma: no cover 394 return False # pragma: no cover
diff --git a/tests/data/narrated_powerpoint_presentation.pptx b/tests/data/narrated_powerpoint_presentation.pptx
new file mode 100644
index 0000000..ef04132
--- /dev/null
+++ b/tests/data/narrated_powerpoint_presentation.pptx
Binary files differ
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 9e208ec..a6c3a9a 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -777,3 +777,13 @@ class TestNoSandbox(unittest.TestCase):
777 os.remove('./tests/data/clean.png') 777 os.remove('./tests/data/clean.png')
778 os.remove('./tests/data/clean.cleaned.png') 778 os.remove('./tests/data/clean.cleaned.png')
779 os.remove('./tests/data/clean.cleaned.cleaned.png') 779 os.remove('./tests/data/clean.cleaned.cleaned.png')
780
781class TestComplexOfficeFiles(unittest.TestCase):
782 def test_complex_pptx(self):
783 target = './tests/data/clean.pptx'
784 shutil.copy('./tests/data/narrated_powerpoint_presentation.pptx', target)
785 p = office.MSOfficeParser(target)
786 self.assertTrue(p.remove_all())
787
788 os.remove(target)
789 os.remove(p.output_filename)