summaryrefslogtreecommitdiff
path: root/libmat2
diff options
context:
space:
mode:
authorjvoisin2018-09-20 22:37:53 +0200
committerjvoisin2018-09-24 18:03:59 +0200
commit174d4a0ac09c2e9d4a9aa3677a442c05459b8309 (patch)
tree78b680d8e2bb8f22ae92071ba2d8a5c3da0eef64 /libmat2
parentfbcf68c280643bce8f6451cc84db2910755df5a8 (diff)
Implement rsid stripping for office files
MS Office XML rsid is a "unique identifier used to track the editing session when the physical character representing this section mark was last formatted." See the following links for details: - https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx - https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/.
Diffstat (limited to 'libmat2')
-rw-r--r--libmat2/office.py61
1 files changed, 56 insertions, 5 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index 5c2c996..07bbbb9 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -8,6 +8,8 @@ import xml.etree.ElementTree as ET # type: ignore
8 8
9from .archive import ArchiveBasedAbstractParser 9from .archive import ArchiveBasedAbstractParser
10 10
11# pylint: disable=line-too-long
12
11# Make pyflakes happy 13# Make pyflakes happy
12assert Set 14assert Set
13assert Pattern 15assert Pattern
@@ -15,14 +17,12 @@ assert Pattern
15def _parse_xml(full_path: str): 17def _parse_xml(full_path: str):
16 """ This function parses XML, with namespace support. """ 18 """ This function parses XML, with namespace support. """
17 19
18 cpt = 0
19 namespace_map = dict() 20 namespace_map = dict()
20 for _, (key, value) in ET.iterparse(full_path, ("start-ns", )): 21 for _, (key, value) in ET.iterparse(full_path, ("start-ns", )):
21 # The ns[0-9]+ namespaces are reserved for internal usage, so 22 # The ns[0-9]+ namespaces are reserved for internal usage, so
22 # we have to use another nomenclature. 23 # we have to use another nomenclature.
23 if re.match('^ns[0-9]+$', key): 24 if re.match('^ns[0-9]+$', key, re.I): #pragma: no cover
24 key = 'mat%d' % cpt 25 key = 'mat' + key[2:]
25 cpt += 1
26 26
27 namespace_map[key] = value 27 namespace_map[key] = value
28 ET.register_namespace(key, value) 28 ET.register_namespace(key, value)
@@ -59,12 +59,57 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
59 'word/fontTable.xml', 59 'word/fontTable.xml',
60 'word/settings.xml', 60 'word/settings.xml',
61 'word/styles.xml', 61 'word/styles.xml',
62
63 # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
64 'word/stylesWithEffects.xml',
62 } 65 }
63 files_to_omit = set(map(re.compile, { # type: ignore 66 files_to_omit = set(map(re.compile, { # type: ignore
67 'word/webSettings.xml',
68 'word/theme',
64 '^docProps/', 69 '^docProps/',
65 })) 70 }))
66 71
67 @staticmethod 72 @staticmethod
73 def __remove_rsid(full_path: str) -> bool:
74 """ The method will remove "revision session ID". We're matching on '}rsid'
75 instead of proper parsing, since rsid can have multiple forms, like
76 `rsidRDefault`, `rsidR`, `rsids`, …
77
78 We're removing rsid tags in two passes, because we can't modify
79 the xml while we're iterating on it.
80
81 For more details, see
82 - https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx
83 - https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/
84 """
85 try:
86 tree, namespace = _parse_xml(full_path)
87 except ET.ParseError:
88 return False
89
90 # rsid, tags or attributes, are always under the `w` namespace
91 if 'w' not in namespace.keys():
92 return True
93
94 parent_map = {c:p for p in tree.iter() for c in p}
95
96 elements_to_remove = list()
97 for item in tree.iterfind('.//', namespace):
98 if '}rsid' in item.tag.strip().lower(): # rsid as tag
99 elements_to_remove.append(item)
100 continue
101 for key in list(item.attrib.keys()): # rsid as attribute
102 if '}rsid' in key.lower():
103 del item.attrib[key]
104
105 for element in elements_to_remove:
106 parent_map[element].remove(element)
107
108 tree.write(full_path, xml_declaration=True)
109
110 return True
111
112 @staticmethod
68 def __remove_revisions(full_path: str) -> bool: 113 def __remove_revisions(full_path: str) -> bool:
69 """ In this function, we're changing the XML document in several 114 """ In this function, we're changing the XML document in several
70 different times, since we don't want to change the tree we're currently 115 different times, since we don't want to change the tree we're currently
@@ -112,7 +157,13 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
112 157
113 if full_path.endswith('/word/document.xml'): 158 if full_path.endswith('/word/document.xml'):
114 # this file contains the revisions 159 # this file contains the revisions
115 return self.__remove_revisions(full_path) 160 if self.__remove_revisions(full_path) is False:
161 return False
162
163 if full_path.endswith('.xml'):
164 if self.__remove_rsid(full_path) is False:
165 return False
166
116 return True 167 return True
117 168
118 def get_meta(self) -> Dict[str, str]: 169 def get_meta(self) -> Dict[str, str]: