diff options
| author | jvoisin | 2019-09-01 13:07:56 +0200 |
|---|---|---|
| committer | jvoisin | 2019-09-01 13:52:02 +0200 |
| commit | 0170f0e37ec9fefd1ac2829a070b76b91c999b92 (patch) | |
| tree | 688d92920c7aa1d27d54c326fe1e302dd85b3389 /libmat2/office.py | |
| parent | 0cf0541ad9c2f40aa987cb34be34bc33b7341232 (diff) | |
Improve the comments in the code a bit
This is related to the previous commit
Diffstat (limited to 'libmat2/office.py')
| -rw-r--r-- | libmat2/office.py | 25 |
1 files changed, 10 insertions, 15 deletions
diff --git a/libmat2/office.py b/libmat2/office.py index c9bed7a..52bf7c5 100644 --- a/libmat2/office.py +++ b/libmat2/office.py | |||
| @@ -44,6 +44,12 @@ def _sort_xml_attributes(full_path: str) -> bool: | |||
| 44 | 44 | ||
| 45 | 45 | ||
| 46 | class MSOfficeParser(ZipParser): | 46 | class MSOfficeParser(ZipParser): |
| 47 | """ | ||
| 48 | The methods modifying XML documents are usually doing so in two loops: | ||
| 49 | 1. finding the tag/attributes to remove; | ||
| 50 | 2. actually editing the document | ||
| 51 | since it's tricky to modify the XML while iterating on it. | ||
| 52 | """ | ||
| 47 | mimetypes = { | 53 | mimetypes = { |
| 48 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', | 54 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', |
| 49 | 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', | 55 | 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', |
| @@ -126,9 +132,6 @@ class MSOfficeParser(ZipParser): | |||
| 126 | instead of proper parsing, since rsid can have multiple forms, like | 132 | instead of proper parsing, since rsid can have multiple forms, like |
| 127 | `rsidRDefault`, `rsidR`, `rsids`, … | 133 | `rsidRDefault`, `rsidR`, `rsids`, … |
| 128 | 134 | ||
| 129 | We're removing rsid tags in two times, because we can't modify | ||
| 130 | the xml while we're iterating on it. | ||
| 131 | |||
| 132 | For more details, see | 135 | For more details, see |
| 133 | - https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx | 136 | - https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx |
| 134 | - https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/ | 137 | - https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/ |
| @@ -163,15 +166,11 @@ class MSOfficeParser(ZipParser): | |||
| 163 | @staticmethod | 166 | @staticmethod |
| 164 | def __remove_nsid(full_path: str) -> bool: | 167 | def __remove_nsid(full_path: str) -> bool: |
| 165 | """ | 168 | """ |
| 166 | NSID are random identifiers that can be used | 169 | nsid are random identifiers that can be used to ease the merging of |
| 167 | to ease the merging of some components of a document. | 170 | some components of a document. They can also be used for |
| 168 | They can also be used for fingerprinting. | 171 | fingerprinting. |
| 169 | 172 | ||
| 170 | See the spec for more details: https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing.nsid?view=openxml-2.8.1 | 173 | See the spec for more details: https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing.nsid?view=openxml-2.8.1 |
| 171 | |||
| 172 | In this function, we're changing the XML document in several | ||
| 173 | different times, since we don't want to change the tree we're currently | ||
| 174 | iterating on. | ||
| 175 | """ | 174 | """ |
| 176 | try: | 175 | try: |
| 177 | tree, namespace = _parse_xml(full_path) | 176 | tree, namespace = _parse_xml(full_path) |
| @@ -179,7 +178,7 @@ class MSOfficeParser(ZipParser): | |||
| 179 | logging.error("Unable to parse %s: %s", full_path, e) | 178 | logging.error("Unable to parse %s: %s", full_path, e) |
| 180 | return False | 179 | return False |
| 181 | 180 | ||
| 182 | # The NSID tag is always under the `w` namespace | 181 | # The nsid tag is always under the `w` namespace |
| 183 | if 'w' not in namespace.keys(): | 182 | if 'w' not in namespace.keys(): |
| 184 | return True | 183 | return True |
| 185 | 184 | ||
| @@ -197,10 +196,6 @@ class MSOfficeParser(ZipParser): | |||
| 197 | 196 | ||
| 198 | @staticmethod | 197 | @staticmethod |
| 199 | def __remove_revisions(full_path: str) -> bool: | 198 | def __remove_revisions(full_path: str) -> bool: |
| 200 | """ In this function, we're changing the XML document in several | ||
| 201 | different times, since we don't want to change the tree we're currently | ||
| 202 | iterating on. | ||
| 203 | """ | ||
| 204 | try: | 199 | try: |
| 205 | tree, namespace = _parse_xml(full_path) | 200 | tree, namespace = _parse_xml(full_path) |
| 206 | except ET.ParseError as e: | 201 | except ET.ParseError as e: |
