summary refs log tree commit diff
path: root/libmat2/office.py
diff options
context:
space:
mode:
author jvoisin 2019-09-01 13:07:56 +0200
committer jvoisin 2019-09-01 13:52:02 +0200
commit 0170f0e37ec9fefd1ac2829a070b76b91c999b92 (patch)
tree 688d92920c7aa1d27d54c326fe1e302dd85b3389 /libmat2/office.py
parent 0cf0541ad9c2f40aa987cb34be34bc33b7341232 (diff)
Improve a bit the comments in the code
This is related to the previous commit
Diffstat (limited to '')
-rw-r--r-- libmat2/office.py 25
1 files changed, 10 insertions, 15 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index c9bed7a..52bf7c5 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -44,6 +44,12 @@ def _sort_xml_attributes(full_path: str) -> bool:
44 44
45 45
46class MSOfficeParser(ZipParser): 46class MSOfficeParser(ZipParser):
47 """
48 The methods modifying XML documents are usually doing so in two loops:
49 1. finding the tag/attributes to remove;
50 2. actually editing the document
51 since it's tricky to modify the XML while iterating on it.
52 """
47 mimetypes = { 53 mimetypes = {
48 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 54 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
49 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 55 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
@@ -126,9 +132,6 @@ class MSOfficeParser(ZipParser):
126 instead of proper parsing, since rsid can have multiple forms, like 132 instead of proper parsing, since rsid can have multiple forms, like
127 `rsidRDefault`, `rsidR`, `rsids`, … 133 `rsidRDefault`, `rsidR`, `rsids`, …
128 134
129 We're removing rsid tags in two times, because we can't modify
130 the xml while we're iterating on it.
131
132 For more details, see 135 For more details, see
133 - https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx 136 - https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx
134 - https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/ 137 - https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/
@@ -163,15 +166,11 @@ class MSOfficeParser(ZipParser):
163 @staticmethod 166 @staticmethod
164 def __remove_nsid(full_path: str) -> bool: 167 def __remove_nsid(full_path: str) -> bool:
165 """ 168 """
166 NSID are random identifiers that can be used 169 nsid are random identifiers that can be used to ease the merging of
167 to ease the merging of some components of a document. 170 some components of a document. They can also be used for
168 They can also be used for fingerprinting. 171 fingerprinting.
169 172
170 See the spec for more details: https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing.nsid?view=openxml-2.8.1 173 See the spec for more details: https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing.nsid?view=openxml-2.8.1
171
172 In this function, we're changing the XML document in several
173 different times, since we don't want to change the tree we're currently
174 iterating on.
175 """ 174 """
176 try: 175 try:
177 tree, namespace = _parse_xml(full_path) 176 tree, namespace = _parse_xml(full_path)
@@ -179,7 +178,7 @@ class MSOfficeParser(ZipParser):
179 logging.error("Unable to parse %s: %s", full_path, e) 178 logging.error("Unable to parse %s: %s", full_path, e)
180 return False 179 return False
181 180
182 # The NSID tag is always under the `w` namespace 181 # The nsid tag is always under the `w` namespace
183 if 'w' not in namespace.keys(): 182 if 'w' not in namespace.keys():
184 return True 183 return True
185 184
@@ -197,10 +196,6 @@ class MSOfficeParser(ZipParser):
197 196
198 @staticmethod 197 @staticmethod
199 def __remove_revisions(full_path: str) -> bool: 198 def __remove_revisions(full_path: str) -> bool:
200 """ In this function, we're changing the XML document in several
201 different times, since we don't want to change the tree we're currently
202 iterating on.
203 """
204 try: 199 try:
205 tree, namespace = _parse_xml(full_path) 200 tree, namespace = _parse_xml(full_path)
206 except ET.ParseError as e: 201 except ET.ParseError as e: