From e3d817f57e77676c30fbfa05ed08deee7918b238 Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Thu, 6 Sep 2018 11:32:45 +0200
Subject: Split office and archives

---
 libmat2/office.py | 118 +-----------------------------------------------------
 1 file changed, 1 insertion(+), 117 deletions(-)

(limited to 'libmat2/office.py')

diff --git a/libmat2/office.py b/libmat2/office.py
index 60c5478..50b776e 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -1,15 +1,11 @@
 import os
 import re
-import shutil
-import tempfile
-import datetime
 import zipfile
-import logging
 from typing import Dict, Set, Pattern
 
 import xml.etree.ElementTree as ET  # type: ignore
 
-from . import abstract, parser_factory, UnknownMemberPolicy
+from .archive import ArchiveBasedAbstractParser
 
 # Make pyflakes happy
 assert Set
@@ -26,118 +22,6 @@ def _parse_xml(full_path: str):
     return ET.parse(full_path), namespace_map
 
 
-class ArchiveBasedAbstractParser(abstract.AbstractParser):
-    """ Office files (.docx, .odt, …) are zipped files. """
-    # Those are the files that have a format that _isn't_
-    # supported by MAT2, but that we want to keep anyway.
-    files_to_keep = set()  # type: Set[str]
-
-    # Those are the files that we _do not_ want to keep,
-    # no matter if they are supported or not.
-    files_to_omit = set() # type: Set[Pattern]
-
-    # what should the parser do if it encounters an unknown file in
-    # the archive?
-    unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy
-
-    def __init__(self, filename):
-        super().__init__(filename)
-        try:  # better fail here than later
-            zipfile.ZipFile(self.filename)
-        except zipfile.BadZipFile:
-            raise ValueError
-
-    def _specific_cleanup(self, full_path: str) -> bool:
-        """ This method can be used to apply specific treatment
-        to files present in the archive."""
-        # pylint: disable=unused-argument,no-self-use
-        return True  # pragma: no cover
-
-    @staticmethod
-    def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
-        zipinfo.create_system = 3  # Linux
-        zipinfo.comment = b''
-        zipinfo.date_time = (1980, 1, 1, 0, 0, 0)  # this is as early as a zipfile can be
-        return zipinfo
-
-    @staticmethod
-    def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]:
-        metadata = {}
-        if zipinfo.create_system == 3:  # this is Linux
-            pass
-        elif zipinfo.create_system == 2:
-            metadata['create_system'] = 'Windows'
-        else:
-            metadata['create_system'] = 'Weird'
-
-        if zipinfo.comment:
-            metadata['comment'] = zipinfo.comment  # type: ignore
-
-        if zipinfo.date_time != (1980, 1, 1, 0, 0, 0):
-            metadata['date_time'] = str(datetime.datetime(*zipinfo.date_time))
-
-        return metadata
-
-    def remove_all(self) -> bool:
-        # pylint: disable=too-many-branches
-
-        with zipfile.ZipFile(self.filename) as zin,\
-             zipfile.ZipFile(self.output_filename, 'w') as zout:
-
-            temp_folder = tempfile.mkdtemp()
-            abort = False
-
-            for item in zin.infolist():
-                if item.filename[-1] == '/':  # `is_dir` is added in Python3.6
-                    continue  # don't keep empty folders
-
-                zin.extract(member=item, path=temp_folder)
-                full_path = os.path.join(temp_folder, item.filename)
-
-                if self._specific_cleanup(full_path) is False:
-                    logging.warning("Something went wrong during deep cleaning of %s",
-                                    item.filename)
-                    abort = True
-                    continue
-
-                if item.filename in self.files_to_keep:
-                    # those files aren't supported, but we want to add them anyway
-                    pass
-                elif any(map(lambda r: r.search(item.filename), self.files_to_omit)):
-                    continue
-                else:
-                    # supported files that we want to clean then add
-                    tmp_parser, mtype = parser_factory.get_parser(full_path)  # type: ignore
-                    if not tmp_parser:
-                        if self.unknown_member_policy == UnknownMemberPolicy.OMIT:
-                            logging.warning("In file %s, omitting unknown element %s (format: %s)",
-                                            self.filename, item.filename, mtype)
-                            continue
-                        elif self.unknown_member_policy == UnknownMemberPolicy.KEEP:
-                            logging.warning("In file %s, keeping unknown element %s (format: %s)",
-                                            self.filename, item.filename, mtype)
-                        else:
-                            logging.error("In file %s, element %s's format (%s) " +
-                                          "isn't supported",
-                                          self.filename, item.filename, mtype)
-                            abort = True
-                            continue
-                    if tmp_parser:
-                        tmp_parser.remove_all()
-                        os.rename(tmp_parser.output_filename, full_path)
-
-                zinfo = zipfile.ZipInfo(item.filename)  # type: ignore
-                clean_zinfo = self._clean_zipinfo(zinfo)
-                with open(full_path, 'rb') as f:
-                    zout.writestr(clean_zinfo, f.read())
-
-        shutil.rmtree(temp_folder)
-        if abort:
-            os.remove(self.output_filename)
-            return False
-        return True
-
-
 class MSOfficeParser(ArchiveBasedAbstractParser):
     mimetypes = {
         'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
-- 
cgit v1.3