diff options
| -rw-r--r-- | .gitlab-ci.yml | 2 | ||||
| -rw-r--r-- | README.md | 2 | ||||
| -rw-r--r-- | libmat2/archive.py | 9 | ||||
| -rw-r--r-- | libmat2/office.py | 86 | ||||
| -rw-r--r-- | tests/data/narrated_powerpoint_presentation.pptx | bin | 0 -> 4383613 bytes | |||
| -rw-r--r-- | tests/test_libmat2.py | 10 |
6 files changed, 103 insertions, 6 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4ad5c98..38cbe20 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml | |||
| @@ -16,7 +16,7 @@ linting:bandit: | |||
| 16 | script: # TODO: remove B405 and B314 | 16 | script: # TODO: remove B405 and B314 |
| 17 | - bandit ./mat2 --format txt --skip B101 | 17 | - bandit ./mat2 --format txt --skip B101 |
| 18 | - bandit -r ./nautilus/ --format txt --skip B101 | 18 | - bandit -r ./nautilus/ --format txt --skip B101 |
| 19 | - bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314,B108 | 19 | - bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314,B108,B311 |
| 20 | 20 | ||
| 21 | linting:codespell: | 21 | linting:codespell: |
| 22 | image: $CONTAINER_REGISTRY:linting | 22 | image: $CONTAINER_REGISTRY:linting |
| @@ -152,6 +152,8 @@ Copyright 2016 Marie-Rose for mat2's logo | |||
| 152 | The `tests/data/dirty_with_nsid.docx` file is licensed under GPLv3, | 152 | The `tests/data/dirty_with_nsid.docx` file is licensed under GPLv3, |
| 153 | and was borrowed from the Calibre project: https://calibre-ebook.com/downloads/demos/demo.docx | 153 | and was borrowed from the Calibre project: https://calibre-ebook.com/downloads/demos/demo.docx |
| 154 | 154 | ||
| 155 | The `narrated_powerpoint_presentation.pptx` file is in the public domain. | ||
| 156 | |||
| 155 | # Thanks | 157 | # Thanks |
| 156 | 158 | ||
| 157 | mat2 wouldn't exist without: | 159 | mat2 wouldn't exist without: |
diff --git a/libmat2/archive.py b/libmat2/archive.py index de80a35..f6db491 100644 --- a/libmat2/archive.py +++ b/libmat2/archive.py | |||
| @@ -82,6 +82,13 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 82 | # pylint: disable=unused-argument,no-self-use | 82 | # pylint: disable=unused-argument,no-self-use |
| 83 | return {} # pragma: no cover | 83 | return {} # pragma: no cover |
| 84 | 84 | ||
| 85 | def _final_checks(self) -> bool: | ||
| 86 | """ This method is invoked after the file has been cleaned, | ||
| 87 | allowing to run final verifications. | ||
| 88 | """ | ||
| 89 | # pylint: disable=unused-argument,no-self-use | ||
| 90 | return True | ||
| 91 | |||
| 85 | @staticmethod | 92 | @staticmethod |
| 86 | @abc.abstractmethod | 93 | @abc.abstractmethod |
| 87 | def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]: | 94 | def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]: |
| @@ -223,6 +230,8 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): | |||
| 223 | if abort: | 230 | if abort: |
| 224 | os.remove(self.output_filename) | 231 | os.remove(self.output_filename) |
| 225 | return False | 232 | return False |
| 233 | if not self._final_checks(): | ||
| 234 | return False # pragma: no cover | ||
| 226 | return True | 235 | return True |
| 227 | 236 | ||
| 228 | 237 | ||
diff --git a/libmat2/office.py b/libmat2/office.py index 7f70b72..d122fc8 100644 --- a/libmat2/office.py +++ b/libmat2/office.py | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | import random | ||
| 1 | import uuid | 2 | import uuid |
| 2 | import logging | 3 | import logging |
| 3 | import os | 4 | import os |
| @@ -75,6 +76,14 @@ class MSOfficeParser(ZipParser): | |||
| 75 | def __init__(self, filename): | 76 | def __init__(self, filename): |
| 76 | super().__init__(filename) | 77 | super().__init__(filename) |
| 77 | 78 | ||
| 79 | # MSOffice documents are using various counters for cross-references, | ||
| 80 | # we collect them all, to make sure that they're effectively counters, | ||
| 81 | # and not unique id used for fingerprinting. | ||
| 82 | self.__counters = { | ||
| 83 | 'cNvPr': set(), | ||
| 84 | 'rid': set(), | ||
| 85 | } | ||
| 86 | |||
| 78 | self.files_to_keep = set(map(re.compile, { # type: ignore | 87 | self.files_to_keep = set(map(re.compile, { # type: ignore |
| 79 | r'^\[Content_Types\]\.xml$', | 88 | r'^\[Content_Types\]\.xml$', |
| 80 | r'^_rels/\.rels$', | 89 | r'^_rels/\.rels$', |
| @@ -84,8 +93,14 @@ class MSOfficeParser(ZipParser): | |||
| 84 | r'^ppt/slideLayouts/_rels/slideLayout[0-9]+\.xml\.rels$', | 93 | r'^ppt/slideLayouts/_rels/slideLayout[0-9]+\.xml\.rels$', |
| 85 | r'^ppt/slideLayouts/slideLayout[0-9]+\.xml$', | 94 | r'^ppt/slideLayouts/slideLayout[0-9]+\.xml$', |
| 86 | r'^(?:word|ppt)/tableStyles\.xml$', | 95 | r'^(?:word|ppt)/tableStyles\.xml$', |
| 96 | r'^ppt/slides/_rels/slide[0-9]*\.xml\.rels$', | ||
| 97 | r'^ppt/slides/slide[0-9]*\.xml$', | ||
| 87 | # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx | 98 | # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx |
| 88 | r'^(?:word|ppt)/stylesWithEffects\.xml$', | 99 | r'^(?:word|ppt)/stylesWithEffects\.xml$', |
| 100 | r'^ppt/presentation\.xml$', | ||
| 101 | # TODO: check if p:bgRef can be randomized | ||
| 102 | r'^ppt/slideMasters/slideMaster[0-9]+\.xml', | ||
| 103 | r'^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels', | ||
| 89 | })) | 104 | })) |
| 90 | self.files_to_omit = set(map(re.compile, { # type: ignore | 105 | self.files_to_omit = set(map(re.compile, { # type: ignore |
| 91 | r'^customXml/', | 106 | r'^customXml/', |
| @@ -95,6 +110,7 @@ class MSOfficeParser(ZipParser): | |||
| 95 | r'^(?:word|ppt)/theme', | 110 | r'^(?:word|ppt)/theme', |
| 96 | r'^(?:word|ppt)/people\.xml$', | 111 | r'^(?:word|ppt)/people\.xml$', |
| 97 | r'^(?:word|ppt)/numbering\.xml$', | 112 | r'^(?:word|ppt)/numbering\.xml$', |
| 113 | r'^(?:word|ppt)/tags/', | ||
| 98 | # View properties like view mode, last viewed slide etc | 114 | # View properties like view mode, last viewed slide etc |
| 99 | r'^(?:word|ppt)/viewProps\.xml$', | 115 | r'^(?:word|ppt)/viewProps\.xml$', |
| 100 | # Additional presentation-wide properties like printing properties, | 116 | # Additional presentation-wide properties like printing properties, |
| @@ -146,7 +162,7 @@ class MSOfficeParser(ZipParser): | |||
| 146 | """ | 162 | """ |
| 147 | try: | 163 | try: |
| 148 | tree, namespace = _parse_xml(full_path) | 164 | tree, namespace = _parse_xml(full_path) |
| 149 | except ET.ParseError as e: | 165 | except ET.ParseError as e: # pragma: no cover |
| 150 | logging.error("Unable to parse %s: %s", full_path, e) | 166 | logging.error("Unable to parse %s: %s", full_path, e) |
| 151 | return False | 167 | return False |
| 152 | 168 | ||
| @@ -206,7 +222,7 @@ class MSOfficeParser(ZipParser): | |||
| 206 | def __remove_revisions(full_path: str) -> bool: | 222 | def __remove_revisions(full_path: str) -> bool: |
| 207 | try: | 223 | try: |
| 208 | tree, namespace = _parse_xml(full_path) | 224 | tree, namespace = _parse_xml(full_path) |
| 209 | except ET.ParseError as e: | 225 | except ET.ParseError as e: # pragma: no cover |
| 210 | logging.error("Unable to parse %s: %s", full_path, e) | 226 | logging.error("Unable to parse %s: %s", full_path, e) |
| 211 | return False | 227 | return False |
| 212 | 228 | ||
| @@ -272,14 +288,71 @@ class MSOfficeParser(ZipParser): | |||
| 272 | tree.write(full_path, xml_declaration=True) | 288 | tree.write(full_path, xml_declaration=True) |
| 273 | return True | 289 | return True |
| 274 | 290 | ||
| 291 | def _final_checks(self) -> bool: | ||
| 292 | for k, v in self.__counters.items(): | ||
| 293 | if v and len(v) != max(v): | ||
| 294 | # TODO: make this an error and return False | ||
| 295 | # once the ability to correct the counters is implemented | ||
| 296 | logging.warning("%s contains invalid %s: %s", self.filename, k, v) | ||
| 297 | return True | ||
| 298 | return True | ||
| 299 | |||
| 300 | def __collect_counters(self, full_path: str): | ||
| 301 | with open(full_path, encoding='utf-8') as f: | ||
| 302 | content = f.read() | ||
| 303 | # "relationship Id" | ||
| 304 | for i in re.findall(r'(?:\s|r:)[iI][dD]="rId([0-9]+)"(?:\s|/)', content): | ||
| 305 | self.__counters['rid'].add(int(i)) | ||
| 306 | # "connector for Non-visual property" | ||
| 307 | for i in re.findall(r'<p:cNvPr id="([0-9]+)"', content): | ||
| 308 | self.__counters['cNvPr'].add(int(i)) | ||
| 309 | |||
| 310 | |||
| 311 | @staticmethod | ||
| 312 | def __randomize_creationId(full_path: str) -> bool: | ||
| 313 | try: | ||
| 314 | tree, namespace = _parse_xml(full_path) | ||
| 315 | except ET.ParseError as e: # pragma: no cover | ||
| 316 | logging.error("Unable to parse %s: %s", full_path, e) | ||
| 317 | return False | ||
| 318 | |||
| 319 | if 'p14' not in namespace.keys(): | ||
| 320 | return True # pragma: no cover | ||
| 321 | |||
| 322 | for item in tree.iterfind('.//p14:creationId', namespace): | ||
| 323 | item.set('val', '%s' % random.randint(0, 2**32)) | ||
| 324 | tree.write(full_path, xml_declaration=True) | ||
| 325 | return True | ||
| 326 | |||
| 327 | @staticmethod | ||
| 328 | def __randomize_sldMasterId(full_path: str) -> bool: | ||
| 329 | try: | ||
| 330 | tree, namespace = _parse_xml(full_path) | ||
| 331 | except ET.ParseError as e: # pragma: no cover | ||
| 332 | logging.error("Unable to parse %s: %s", full_path, e) | ||
| 333 | return False | ||
| 334 | |||
| 335 | if 'p' not in namespace.keys(): | ||
| 336 | return True # pragma: no cover | ||
| 337 | |||
| 338 | for item in tree.iterfind('.//p:sldMasterId', namespace): | ||
| 339 | item.set('id', '%s' % random.randint(0, 2**32)) | ||
| 340 | tree.write(full_path, xml_declaration=True) | ||
| 341 | return True | ||
| 342 | |||
| 275 | def _specific_cleanup(self, full_path: str) -> bool: | 343 | def _specific_cleanup(self, full_path: str) -> bool: |
| 276 | # pylint: disable=too-many-return-statements | 344 | # pylint: disable=too-many-return-statements,too-many-branches |
| 277 | if os.stat(full_path).st_size == 0: # Don't process empty files | 345 | if os.stat(full_path).st_size == 0: # Don't process empty files |
| 278 | return True | 346 | return True |
| 279 | 347 | ||
| 280 | if not full_path.endswith('.xml'): | 348 | if not full_path.endswith('.xml'): |
| 281 | return True | 349 | return True |
| 282 | 350 | ||
| 351 | if self.__randomize_creationId(full_path) is False: | ||
| 352 | return False | ||
| 353 | |||
| 354 | self.__collect_counters(full_path) | ||
| 355 | |||
| 283 | if full_path.endswith('/[Content_Types].xml'): | 356 | if full_path.endswith('/[Content_Types].xml'): |
| 284 | # this file contains references to files that we might | 357 | # this file contains references to files that we might |
| 285 | # remove, and MS Office doesn't like dangling references | 358 | # remove, and MS Office doesn't like dangling references |
| @@ -288,7 +361,7 @@ class MSOfficeParser(ZipParser): | |||
| 288 | elif full_path.endswith('/word/document.xml'): | 361 | elif full_path.endswith('/word/document.xml'): |
| 289 | # this file contains the revisions | 362 | # this file contains the revisions |
| 290 | if self.__remove_revisions(full_path) is False: | 363 | if self.__remove_revisions(full_path) is False: |
| 291 | return False | 364 | return False # pragma: no cover |
| 292 | elif full_path.endswith('/docProps/app.xml'): | 365 | elif full_path.endswith('/docProps/app.xml'): |
| 293 | # This file must be present and valid, | 366 | # This file must be present and valid, |
| 294 | # so we're removing as much as we can. | 367 | # so we're removing as much as we can. |
| @@ -310,9 +383,12 @@ class MSOfficeParser(ZipParser): | |||
| 310 | f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>') | 383 | f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>') |
| 311 | uid = str(uuid.uuid4()).encode('utf-8') | 384 | uid = str(uuid.uuid4()).encode('utf-8') |
| 312 | f.write(b'<a:tblStyleLst def="{%s}" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"/>' % uid) | 385 | f.write(b'<a:tblStyleLst def="{%s}" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"/>' % uid) |
| 386 | elif full_path.endswith('ppt/presentation.xml'): | ||
| 387 | if self.__randomize_sldMasterId(full_path) is False: | ||
| 388 | return False # pragma: no cover | ||
| 313 | 389 | ||
| 314 | if self.__remove_rsid(full_path) is False: | 390 | if self.__remove_rsid(full_path) is False: |
| 315 | return False | 391 | return False # pragma: no cover |
| 316 | 392 | ||
| 317 | if self.__remove_nsid(full_path) is False: | 393 | if self.__remove_nsid(full_path) is False: |
| 318 | return False # pragma: no cover | 394 | return False # pragma: no cover |
diff --git a/tests/data/narrated_powerpoint_presentation.pptx b/tests/data/narrated_powerpoint_presentation.pptx new file mode 100644 index 0000000..ef04132 --- /dev/null +++ b/tests/data/narrated_powerpoint_presentation.pptx | |||
| Binary files differ | |||
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 9e208ec..a6c3a9a 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py | |||
| @@ -777,3 +777,13 @@ class TestNoSandbox(unittest.TestCase): | |||
| 777 | os.remove('./tests/data/clean.png') | 777 | os.remove('./tests/data/clean.png') |
| 778 | os.remove('./tests/data/clean.cleaned.png') | 778 | os.remove('./tests/data/clean.cleaned.png') |
| 779 | os.remove('./tests/data/clean.cleaned.cleaned.png') | 779 | os.remove('./tests/data/clean.cleaned.cleaned.png') |
| 780 | |||
| 781 | class TestComplexOfficeFiles(unittest.TestCase): | ||
| 782 | def test_complex_pptx(self): | ||
| 783 | target = './tests/data/clean.pptx' | ||
| 784 | shutil.copy('./tests/data/narrated_powerpoint_presentation.pptx', target) | ||
| 785 | p = office.MSOfficeParser(target) | ||
| 786 | self.assertTrue(p.remove_all()) | ||
| 787 | |||
| 788 | os.remove(target) | ||
| 789 | os.remove(p.output_filename) | ||
