From 942859601d5d08f05b374d1f12270192cede1155 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Thu, 19 Jul 2018 23:10:27 +0200 Subject: Improve the code's documentation --- libmat2/office.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'libmat2/office.py') diff --git a/libmat2/office.py b/libmat2/office.py index c6c4688..62d0395 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -33,6 +33,7 @@ def _parse_xml(full_path: str): class ArchiveBasedAbstractParser(abstract.AbstractParser): + """ Office files (.docx, .odt, …) are zipped files. """ # Those are the files that have a format that _isn't_ # supported by MAT2, but that we want to keep anyway. files_to_keep = set() # type: Set[str] @@ -58,14 +59,13 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: zipinfo.create_system = 3 # Linux zipinfo.comment = b'' - zipinfo.date_time = (1980, 1, 1, 0, 0, 0) + zipinfo.date_time = (1980, 1, 1, 0, 0, 0) # this is as early as a zipfile can be return zipinfo @staticmethod def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]: metadata = {} - if zipinfo.create_system == 3: - #metadata['create_system'] = 'Linux' + if zipinfo.create_system == 3: # this is Linux pass elif zipinfo.create_system == 2: metadata['create_system'] = 'Windows' @@ -145,23 +145,27 @@ class MSOfficeParser(ArchiveBasedAbstractParser): @staticmethod def __remove_revisions(full_path: str) -> bool: - """ In this function, we're changing the XML - document in two times, since we don't want - to change the tree we're iterating on.""" + """ In this function, we're changing the XML document in several + different times, since we don't want to change the tree we're currently + iterating on. + """ try: tree, namespace = _parse_xml(full_path) except ET.ParseError: return False - # No revisions are present + # Revisions are either deletions (`w:del`) or + # insertions (`w:ins`) del_presence = tree.find('.//w:del', namespace) ins_presence = tree.find('.//w:ins', namespace) if del_presence is None and ins_presence is None: - return True + return True # No revisions are present parent_map = {c:p for p in tree.iter() for c in p} - elements = list([element for element in tree.iterfind('.//w:del', namespace)]) + elements = list() + for element in tree.iterfind('.//w:del', namespace): + elements.append(element) for element in elements: parent_map[element].remove(element) @@ -172,7 +176,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser): for children in element.iterfind('./*'): elements.append((element, position, children)) break - for (element, position, children) in elements: parent_map[element].insert(position, children) parent_map[element].remove(element) @@ -183,6 +186,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser): def _specific_cleanup(self, full_path: str) -> bool: if full_path.endswith('/word/document.xml'): + # this file contains the revisions return self.__remove_revisions(full_path) return True -- cgit v1.3