From 38fae60b8beaf9c7b37c65325d2d285e62b6cb85 Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Fri, 18 May 2018 23:52:40 +0200
Subject: Rename some files to simplify packaging

- the `src` folder is now `libmat2`
- the `main.py` script is now `mat2.py`
---
 libmat2/office.py | 150 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 150 insertions(+)
 create mode 100644 libmat2/office.py

(limited to 'libmat2/office.py')

diff --git a/libmat2/office.py b/libmat2/office.py
new file mode 100644
index 0000000..749fc7d
--- /dev/null
+++ b/libmat2/office.py
@@ -0,0 +1,150 @@
+import os
+import re
+import shutil
+import tempfile
+import datetime
+import zipfile
+
+from . import abstract, parser_factory
+
+
+class ArchiveBasedAbstractParser(abstract.AbstractParser):
+    def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
+        zipinfo.compress_type = zipfile.ZIP_DEFLATED
+        zipinfo.create_system = 3  # Linux
+        zipinfo.comment = b''
+        zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
+        return zipinfo
+
+    def _get_zipinfo_meta(self, zipinfo: zipfile.ZipInfo) -> dict:
+        metadata = {}
+        if zipinfo.create_system == 3:
+            #metadata['create_system'] = 'Linux'
+            pass
+        elif zipinfo.create_system == 2:
+            metadata['create_system'] = 'Windows'
+        else:
+            metadata['create_system'] = 'Weird'
+
+        if zipinfo.comment:
+            metadata['comment'] = zipinfo.comment
+
+        if zipinfo.date_time != (1980, 1, 1, 0, 0, 0):
+            metadata['date_time'] = datetime.datetime(*zipinfo.date_time)
+
+        return metadata
+
+
+    def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str,
+                             zin: zipfile.ZipFile, zout: zipfile.ZipFile):
+        zin.extract(member=item, path=temp_folder)
+        tmp_parser, mtype = parser_factory.get_parser(os.path.join(temp_folder, item.filename))
+        if not tmp_parser:
+            print("%s's format (%s) isn't supported" % (item.filename, mtype))
+            return
+        tmp_parser.remove_all()
+        zinfo = zipfile.ZipInfo(item.filename)
+        clean_zinfo = self._clean_zipinfo(zinfo)
+        with open(tmp_parser.output_filename, 'rb') as f:
+            zout.writestr(clean_zinfo, f.read())
+
+
+class MSOfficeParser(ArchiveBasedAbstractParser):
+    mimetypes = {
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+        'application/vnd.openxmlformats-officedocument.presentationml.presentation'
+    }
+    files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}
+
+    def get_meta(self):
+        """
+        Yes, I know that parsing xml with regexp ain't pretty,
+        be my guest and fix it if you want.
+        """
+        metadata = {}
+        zipin = zipfile.ZipFile(self.filename)
+        for item in zipin.infolist():
+            if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):
+                content = zipin.read(item).decode('utf-8')
+                for (key, value) in re.findall(r"<(.+)>(.+)</\1>", content, re.I):
+                    metadata[key] = value
+                if not metadata:  # better safe than sorry
+                    metadata[item] = 'harmful content'
+
+            metadata = {**metadata, **self._get_zipinfo_meta(item)}
+        zipin.close()
+        return metadata
+
+
+    def remove_all(self):
+        zin = zipfile.ZipFile(self.filename, 'r')
+        zout = zipfile.ZipFile(self.output_filename, 'w')
+        temp_folder = tempfile.mkdtemp()
+
+        for item in zin.infolist():
+            if item.filename[-1] == '/':
+                continue  # `is_dir` is added in Python3.6
+            elif item.filename.startswith('docProps/'):
+                if not item.filename.endswith('.rels'):
+                    continue  # don't keep metadata files
+            if item.filename in self.files_to_keep:
+                item = self._clean_zipinfo(item)
+                zout.writestr(item, zin.read(item))
+                continue
+
+            self._clean_internal_file(item, temp_folder, zin, zout)
+
+        shutil.rmtree(temp_folder)
+        zout.close()
+        zin.close()
+        return True
+
+
+
+class LibreOfficeParser(ArchiveBasedAbstractParser):
+    mimetypes = {
+        'application/vnd.oasis.opendocument.text',
+        'application/vnd.oasis.opendocument.spreadsheet',
+        'application/vnd.oasis.opendocument.presentation',
+        'application/vnd.oasis.opendocument.graphics',
+        'application/vnd.oasis.opendocument.chart',
+        'application/vnd.oasis.opendocument.formula',
+        'application/vnd.oasis.opendocument.image',
+    }
+
+    def get_meta(self):
+        """
+        Yes, I know that parsing xml with regexp ain't pretty,
+        be my guest and fix it if you want.
+        """
+        metadata = {}
+        zipin = zipfile.ZipFile(self.filename)
+        for item in zipin.infolist():
+            if item.filename == 'meta.xml':
+                content = zipin.read(item).decode('utf-8')
+                for (key, value) in re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I):
+                    metadata[key] = value
+                if not metadata:  # better safe than sorry
+                    metadata[item] = 'harmful content'
+            metadata = {**metadata, **self._get_zipinfo_meta(item)}
+        zipin.close()
+        return metadata
+
+    def remove_all(self):
+        zin = zipfile.ZipFile(self.filename, 'r')
+        zout = zipfile.ZipFile(self.output_filename, 'w')
+        temp_folder = tempfile.mkdtemp()
+
+        for item in zin.infolist():
+            if item.filename[-1] == '/':
+                continue  # `is_dir` is added in Python3.6
+            elif item.filename == 'meta.xml':
+                continue  # don't keep metadata files
+
+            self._clean_internal_file(item, temp_folder, zin, zout)
+
+        shutil.rmtree(temp_folder)
+        zout.close()
+        zin.close()
+        return True
-- 
cgit v1.3