diff options
| -rw-r--r-- | libmat2/office.py | 61 | ||||
| -rw-r--r-- | tests/data/office_revision_session_ids.docx | bin | 0 -> 12163 bytes | |||
| -rw-r--r-- | tests/test_deep_cleaning.py | 31 |
3 files changed, 87 insertions, 5 deletions
diff --git a/libmat2/office.py b/libmat2/office.py index 5c2c996..07bbbb9 100644 --- a/libmat2/office.py +++ b/libmat2/office.py | |||
| @@ -8,6 +8,8 @@ import xml.etree.ElementTree as ET # type: ignore | |||
| 8 | 8 | ||
| 9 | from .archive import ArchiveBasedAbstractParser | 9 | from .archive import ArchiveBasedAbstractParser |
| 10 | 10 | ||
| 11 | # pylint: disable=line-too-long | ||
| 12 | |||
| 11 | # Make pyflakes happy | 13 | # Make pyflakes happy |
| 12 | assert Set | 14 | assert Set |
| 13 | assert Pattern | 15 | assert Pattern |
| @@ -15,14 +17,12 @@ assert Pattern | |||
| 15 | def _parse_xml(full_path: str): | 17 | def _parse_xml(full_path: str): |
| 16 | """ This function parses XML, with namespace support. """ | 18 | """ This function parses XML, with namespace support. """ |
| 17 | 19 | ||
| 18 | cpt = 0 | ||
| 19 | namespace_map = dict() | 20 | namespace_map = dict() |
| 20 | for _, (key, value) in ET.iterparse(full_path, ("start-ns", )): | 21 | for _, (key, value) in ET.iterparse(full_path, ("start-ns", )): |
| 21 | # The ns[0-9]+ namespaces are reserved for internal usage, so | 22 | # The ns[0-9]+ namespaces are reserved for internal usage, so |
| 22 | # we have to use another nomenclature. | 23 | # we have to use another nomenclature. |
| 23 | if re.match('^ns[0-9]+$', key): | 24 | if re.match('^ns[0-9]+$', key, re.I): #pragma: no cover |
| 24 | key = 'mat%d' % cpt | 25 | key = 'mat' + key[2:] |
| 25 | cpt += 1 | ||
| 26 | 26 | ||
| 27 | namespace_map[key] = value | 27 | namespace_map[key] = value |
| 28 | ET.register_namespace(key, value) | 28 | ET.register_namespace(key, value) |
| @@ -59,12 +59,57 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 59 | 'word/fontTable.xml', | 59 | 'word/fontTable.xml', |
| 60 | 'word/settings.xml', | 60 | 'word/settings.xml', |
| 61 | 'word/styles.xml', | 61 | 'word/styles.xml', |
| 62 | |||
| 63 | # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx | ||
| 64 | 'word/stylesWithEffects.xml', | ||
| 62 | } | 65 | } |
| 63 | files_to_omit = set(map(re.compile, { # type: ignore | 66 | files_to_omit = set(map(re.compile, { # type: ignore |
| 67 | 'word/webSettings.xml', | ||
| 68 | 'word/theme', | ||
| 64 | '^docProps/', | 69 | '^docProps/', |
| 65 | })) | 70 | })) |
| 66 | 71 | ||
| 67 | @staticmethod | 72 | @staticmethod |
| 73 | def __remove_rsid(full_path: str) -> bool: | ||
| 74 | """ The method will remove "revision session ID". We're matching on '}rsid' | ||
| 75 | instead of proper parsing, since rsid can have multiple forms, like | ||
| 76 | `rsidRDefault`, `rsidR`, `rsids`, … | ||
| 77 | |||
| 78 | We're removing rsid tags in two passes, because we can't modify | ||
| 79 | the xml while we're iterating on it. | ||
| 80 | |||
| 81 | For more details, see | ||
| 82 | - https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx | ||
| 83 | - https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/ | ||
| 84 | """ | ||
| 85 | try: | ||
| 86 | tree, namespace = _parse_xml(full_path) | ||
| 87 | except ET.ParseError: | ||
| 88 | return False | ||
| 89 | |||
| 90 | # rsid, tags or attributes, are always under the `w` namespace | ||
| 91 | if 'w' not in namespace.keys(): | ||
| 92 | return True | ||
| 93 | |||
| 94 | parent_map = {c:p for p in tree.iter() for c in p} | ||
| 95 | |||
| 96 | elements_to_remove = list() | ||
| 97 | for item in tree.iterfind('.//', namespace): | ||
| 98 | if '}rsid' in item.tag.strip().lower(): # rsid as tag | ||
| 99 | elements_to_remove.append(item) | ||
| 100 | continue | ||
| 101 | for key in list(item.attrib.keys()): # rsid as attribute | ||
| 102 | if '}rsid' in key.lower(): | ||
| 103 | del item.attrib[key] | ||
| 104 | |||
| 105 | for element in elements_to_remove: | ||
| 106 | parent_map[element].remove(element) | ||
| 107 | |||
| 108 | tree.write(full_path, xml_declaration=True) | ||
| 109 | |||
| 110 | return True | ||
| 111 | |||
| 112 | @staticmethod | ||
| 68 | def __remove_revisions(full_path: str) -> bool: | 113 | def __remove_revisions(full_path: str) -> bool: |
| 69 | """ In this function, we're changing the XML document in several | 114 | """ In this function, we're changing the XML document in several |
| 70 | different times, since we don't want to change the tree we're currently | 115 | different times, since we don't want to change the tree we're currently |
| @@ -112,7 +157,13 @@ class MSOfficeParser(ArchiveBasedAbstractParser): | |||
| 112 | 157 | ||
| 113 | if full_path.endswith('/word/document.xml'): | 158 | if full_path.endswith('/word/document.xml'): |
| 114 | # this file contains the revisions | 159 | # this file contains the revisions |
| 115 | return self.__remove_revisions(full_path) | 160 | if self.__remove_revisions(full_path) is False: |
| 161 | return False | ||
| 162 | |||
| 163 | if full_path.endswith('.xml'): | ||
| 164 | if self.__remove_rsid(full_path) is False: | ||
| 165 | return False | ||
| 166 | |||
| 116 | return True | 167 | return True |
| 117 | 168 | ||
| 118 | def get_meta(self) -> Dict[str, str]: | 169 | def get_meta(self) -> Dict[str, str]: |
diff --git a/tests/data/office_revision_session_ids.docx b/tests/data/office_revision_session_ids.docx new file mode 100644 index 0000000..b40a341 --- /dev/null +++ b/tests/data/office_revision_session_ids.docx | |||
| Binary files differ | |||
diff --git a/tests/test_deep_cleaning.py b/tests/test_deep_cleaning.py index 3d1c8e1..82579a3 100644 --- a/tests/test_deep_cleaning.py +++ b/tests/test_deep_cleaning.py | |||
| @@ -105,3 +105,34 @@ class TestZipOrder(unittest.TestCase): | |||
| 105 | 105 | ||
| 106 | os.remove('./tests/data/clean.odt') | 106 | os.remove('./tests/data/clean.odt') |
| 107 | os.remove('./tests/data/clean.cleaned.odt') | 107 | os.remove('./tests/data/clean.cleaned.odt') |
| 108 | |||
| 109 | class TestRsidRemoval(unittest.TestCase): | ||
| 110 | def test_office(self): | ||
| 111 | shutil.copy('./tests/data/office_revision_session_ids.docx', './tests/data/clean.docx') | ||
| 112 | p = office.MSOfficeParser('./tests/data/clean.docx') | ||
| 113 | |||
| 114 | meta = p.get_meta() | ||
| 115 | self.assertIsNotNone(meta) | ||
| 116 | |||
| 117 | how_many_rsid = False | ||
| 118 | with zipfile.ZipFile('./tests/data/clean.docx') as zin: | ||
| 119 | for item in zin.infolist(): | ||
| 120 | if not item.filename.endswith('.xml'): | ||
| 121 | continue | ||
| 122 | num = zin.read(item).decode('utf-8').lower().count('w:rsid') | ||
| 123 | how_many_rsid += num | ||
| 124 | self.assertEqual(how_many_rsid, 11) | ||
| 125 | |||
| 126 | ret = p.remove_all() | ||
| 127 | self.assertTrue(ret) | ||
| 128 | |||
| 129 | with zipfile.ZipFile('./tests/data/clean.cleaned.docx') as zin: | ||
| 130 | for item in zin.infolist(): | ||
| 131 | if not item.filename.endswith('.xml'): | ||
| 132 | continue | ||
| 133 | num = zin.read(item).decode('utf-8').lower().count('w:rsid') | ||
| 134 | self.assertEqual(num, 0) | ||
| 135 | |||
| 136 | os.remove('./tests/data/clean.docx') | ||
| 137 | os.remove('./tests/data/clean.cleaned.docx') | ||
| 138 | |||
