diff options
Diffstat (limited to 'libmat2/office.py')
| -rw-r--r-- | libmat2/office.py | 150 |
1 files changed, 150 insertions, 0 deletions
diff --git a/libmat2/office.py b/libmat2/office.py new file mode 100644 index 0000000..749fc7d --- /dev/null +++ b/libmat2/office.py | |||
| @@ -0,0 +1,150 @@ | |||
| 1 | import os | ||
| 2 | import re | ||
| 3 | import shutil | ||
| 4 | import tempfile | ||
| 5 | import datetime | ||
| 6 | import zipfile | ||
| 7 | |||
| 8 | from . import abstract, parser_factory | ||
| 9 | |||
| 10 | |||
class ArchiveBasedAbstractParser(abstract.AbstractParser):
    """Shared helpers for parsers whose documents are zip archives."""

    # "Version made by" host codes that denote a Windows/DOS file system
    # according to the zip specification: 0 is MS-DOS/FAT, 10 is NTFS.
    # (Code 2, which used to be reported as Windows, is OpenVMS.)
    _WINDOWS_CREATE_SYSTEMS = frozenset((0, 10))

    def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
        """Reset the identifying header fields of `zipinfo` in place.

        The very same (mutated) object is returned for convenience.

        Because `compress_type` is overwritten, a member's data must be
        read from its source archive *before* its ZipInfo is passed here.
        """
        zipinfo.compress_type = zipfile.ZIP_DEFLATED
        zipinfo.create_system = 3  # Linux
        zipinfo.comment = b''
        zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
        return zipinfo

    def _get_zipinfo_meta(self, zipinfo: zipfile.ZipInfo) -> dict:
        """Return the metadata carried by the zip header of one member.

        Possible keys are 'create_system', 'comment' and 'date_time'; a
        member whose header matches what `_clean_zipinfo` produces for
        these fields yields an empty dict.
        """
        metadata = {}
        if zipinfo.create_system == 3:
            # Linux: this is the value written by `_clean_zipinfo`, so it
            # is not reported.
            pass
        elif zipinfo.create_system in self._WINDOWS_CREATE_SYSTEMS:
            metadata['create_system'] = 'Windows'
        else:
            metadata['create_system'] = 'Weird'

        if zipinfo.comment:
            metadata['comment'] = zipinfo.comment

        if zipinfo.date_time != (1980, 1, 1, 0, 0, 0):
            try:
                metadata['date_time'] = datetime.datetime(*zipinfo.date_time)
            except ValueError:
                # A zeroed or corrupt DOS timestamp (e.g. month 0) is not
                # a valid datetime; report the raw tuple instead of
                # crashing.
                metadata['date_time'] = zipinfo.date_time

        return metadata

    def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str,
                             zin: zipfile.ZipFile, zout: zipfile.ZipFile):
        """Clean one archive member and add the result to `zout`.

        `item` is extracted from `zin` into `temp_folder`, a parser is
        looked up for the extracted file and its `remove_all` is run; the
        parser's output file is then written to `zout` under the original
        member name with a fresh, cleaned zip header.

        A member for which no parser is available is reported on stdout
        and is NOT written to `zout`.
        """
        zin.extract(member=item, path=temp_folder)
        extracted_path = os.path.join(temp_folder, item.filename)
        tmp_parser, mtype = parser_factory.get_parser(extracted_path)
        if not tmp_parser:
            print("%s's format (%s) isn't supported" % (item.filename, mtype))
            return
        tmp_parser.remove_all()
        # A brand new ZipInfo keeps nothing but the member name.
        clean_zinfo = self._clean_zipinfo(zipfile.ZipInfo(item.filename))
        with open(tmp_parser.output_filename, 'rb') as f:
            zout.writestr(clean_zinfo, f.read())
| 50 | |||
| 51 | |||
class MSOfficeParser(ArchiveBasedAbstractParser):
    """Parser for Office Open XML documents (the mimetypes listed below)."""

    mimetypes = {
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    }
    # Members copied to the output as-is (only their zip header is cleaned)
    # instead of being handed to a sub-parser.
    files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}

    def get_meta(self):
        """Return a dict of the metadata found in the document.

        The `docProps/*.xml` members are scanned with a regexp (parsing
        xml with a regexp ain't pretty; be my guest and fix it if you
        want), and the zip header metadata of every member is merged in.
        """
        metadata = {}
        # The context manager closes the archive even if a member fails to
        # read or decode.
        with zipfile.ZipFile(self.filename) as zipin:
            for item in zipin.infolist():
                if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):
                    content = zipin.read(item).decode('utf-8')
                    found = re.findall(r"<(.+)>(.+)</\1>", content, re.I)
                    for (key, value) in found:
                        metadata[key] = value
                    if not found:  # better safe than sorry
                        # Nothing was recognised in this metadata file:
                        # flag the file itself, keyed by its name.
                        metadata[item.filename] = 'harmful content'

                metadata.update(self._get_zipinfo_meta(item))
        return metadata

    def remove_all(self):
        """Write a cleaned copy of the document to `self.output_filename`.

        Directory entries and `docProps/` members (except `.rels` files)
        are dropped, members listed in `files_to_keep` are copied with a
        cleaned zip header, and every other member goes through
        `_clean_internal_file`.

        Returns True.
        """
        temp_folder = tempfile.mkdtemp()
        try:
            with zipfile.ZipFile(self.filename, 'r') as zin, \
                    zipfile.ZipFile(self.output_filename, 'w') as zout:
                for item in zin.infolist():
                    if item.filename.endswith('/'):
                        continue  # `is_dir` is added in Python3.6
                    if (item.filename.startswith('docProps/')
                            and not item.filename.endswith('.rels')):
                        continue  # don't keep metadata files
                    if item.filename in self.files_to_keep:
                        # Read first: `_clean_zipinfo` overwrites
                        # `compress_type`, which the archive relies on to
                        # decompress this member.
                        data = zin.read(item)
                        zout.writestr(self._clean_zipinfo(item), data)
                        continue

                    self._clean_internal_file(item, temp_folder, zin, zout)
        finally:
            # Never leave extracted (uncleaned) files behind, even on error.
            shutil.rmtree(temp_folder)
        return True
| 102 | |||
| 103 | |||
| 104 | |||
class LibreOfficeParser(ArchiveBasedAbstractParser):
    """Parser for OpenDocument files (the mimetypes listed below)."""

    mimetypes = {
        'application/vnd.oasis.opendocument.text',
        'application/vnd.oasis.opendocument.spreadsheet',
        'application/vnd.oasis.opendocument.presentation',
        'application/vnd.oasis.opendocument.graphics',
        'application/vnd.oasis.opendocument.chart',
        'application/vnd.oasis.opendocument.formula',
        'application/vnd.oasis.opendocument.image',
    }

    def get_meta(self):
        """Return a dict of the metadata found in the document.

        `meta.xml` is scanned with a regexp (parsing xml with a regexp
        ain't pretty; be my guest and fix it if you want), and the zip
        header metadata of every member is merged in.
        """
        metadata = {}
        # The context manager closes the archive even if a member fails to
        # read or decode.
        with zipfile.ZipFile(self.filename) as zipin:
            for item in zipin.infolist():
                if item.filename == 'meta.xml':
                    content = zipin.read(item).decode('utf-8')
                    found = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I)
                    for (key, value) in found:
                        metadata[key] = value
                    if not found:  # better safe than sorry
                        # Nothing was recognised in the metadata file:
                        # flag the file itself, keyed by its name.
                        metadata[item.filename] = 'harmful content'
                metadata.update(self._get_zipinfo_meta(item))
        return metadata

    def remove_all(self):
        """Write a cleaned copy of the document to `self.output_filename`.

        Directory entries and `meta.xml` are dropped; every other member
        goes through `_clean_internal_file`.

        Returns True.
        """
        temp_folder = tempfile.mkdtemp()
        try:
            with zipfile.ZipFile(self.filename, 'r') as zin, \
                    zipfile.ZipFile(self.output_filename, 'w') as zout:
                for item in zin.infolist():
                    if item.filename.endswith('/'):
                        continue  # `is_dir` is added in Python3.6
                    if item.filename == 'meta.xml':
                        continue  # don't keep metadata files

                    self._clean_internal_file(item, temp_folder, zin, zout)
        finally:
            # Never leave extracted (uncleaned) files behind, even on error.
            shutil.rmtree(temp_folder)
        return True
