From e342671eadd3f5ff922fe62cae81792d4cd65e83 Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Sun, 30 Sep 2018 19:52:35 +0200
Subject: Remove dangling references in MS Office's [Content_Types].xml

---
 libmat2/office.py | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'libmat2/office.py')

diff --git a/libmat2/office.py b/libmat2/office.py
index bad352b..b220092 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -151,10 +151,44 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
 
         return True
 
+    def __remove_content_type_members(self, full_path: str) -> bool:
+        """ Remove from the extracted `[Content_Types].xml` at `full_path`
+        every `Override` entry whose part matches `self.files_to_omit`,
+        since MS Office doesn't like dangling references.
+        Return False if the XML can't be parsed or doesn't declare exactly
+        one namespace, True once the file has been rewritten in place.
+        """
+        try:
+            tree, namespace = _parse_xml(full_path)
+        except ET.ParseError:  # pragma: no cover
+            return False
+        if len(namespace.items()) != 1:
+            return False  # there should be only one namespace for Types
+
+        # archive members matching files_to_omit: the ones we might remove
+        with zipfile.ZipFile(self.filename) as zin:
+            removed_fnames = {n for n in zin.namelist()
+                              if any(r.search(n) for r in self.files_to_omit)}
+
+        root = tree.getroot()
+        for item in root.findall('{%s}Override' % namespace['']):
+            name = item.attrib['PartName'][1:]  # remove the leading '/'
+            if name in removed_fnames:
+                root.remove(item)
+        # OPC parts must be UTF-8/UTF-16; the default would declare us-ascii
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')
+        return True
+
     def _specific_cleanup(self, full_path: str) -> bool:
         if os.stat(full_path).st_size == 0:  # Don't process empty files
             return True
 
+        if full_path.endswith('/[Content_Types].xml'):
+            # this file contains references to files that we might
+            # remove, and MS Office doesn't like dangling references
+            if self.__remove_content_type_members(full_path) is False:
+                return False
+
         if full_path.endswith('/word/document.xml'):
             # this file contains the revisions
             if self.__remove_revisions(full_path) is False:
-- 
cgit v1.3