diff options
| author | Jason Smalls | 2023-07-11 21:36:52 +0200 |
|---|---|---|
| committer | jvoisin | 2023-07-11 21:38:22 +0200 |
| commit | 8c26020f67b45bc865fe1e2d645af9d99110ac41 (patch) | |
| tree | bb4200d0dc006837fee6c8ddbe4988266ca9ff8d /libmat2 | |
| parent | a0c97b25c46af07408ae1fd364090077fe600538 (diff) | |
Add more files to ignore for MSOffice documents
Diffstat (limited to '')
| -rw-r--r-- | libmat2/office.py | 32 |
1 files changed, 28 insertions, 4 deletions
diff --git a/libmat2/office.py b/libmat2/office.py index 16b20c9..74cc76a 100644 --- a/libmat2/office.py +++ b/libmat2/office.py | |||
| @@ -63,8 +63,20 @@ class MSOfficeParser(ZipParser): | |||
| 63 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml', # /word/footer.xml | 63 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml', # /word/footer.xml |
| 64 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml', # /word/header.xml | 64 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml', # /word/header.xml |
| 65 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml', # /word/styles.xml | 65 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml', # /word/styles.xml |
| 66 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml (used for bullet point formatting) | ||
| 67 | 'application/vnd.openxmlformats-officedocument.theme+xml', # /word/theme/theme[0-9].xml (used for font and background coloring, etc.) | ||
| 66 | 'application/vnd.openxmlformats-package.core-properties+xml', # /docProps/core.xml | 68 | 'application/vnd.openxmlformats-package.core-properties+xml', # /docProps/core.xml |
| 67 | 69 | ||
| 70 | # for more complicated powerpoints | ||
| 71 | 'application/vnd.openxmlformats-officedocument.presentationml.notesSlide+xml', | ||
| 72 | 'application/vnd.openxmlformats-officedocument.presentationml.notesMaster+xml', | ||
| 73 | 'application/vnd.openxmlformats-officedocument.presentationml.handoutMaster+xml', | ||
| 74 | 'application/vnd.openxmlformats-officedocument.drawingml.diagramData+xml', | ||
| 75 | 'application/vnd.openxmlformats-officedocument.drawingml.diagramLayout+xml', | ||
| 76 | 'application/vnd.openxmlformats-officedocument.drawingml.diagramStyle+xml', | ||
| 77 | 'application/vnd.openxmlformats-officedocument.drawingml.diagramColors+xml', | ||
| 78 | 'application/vnd.ms-office.drawingml.diagramDrawing+xml', | ||
| 79 | |||
| 68 | # Do we want to keep the following ones? | 80 | # Do we want to keep the following ones? |
| 69 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml', | 81 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml', |
| 70 | } | 82 | } |
| @@ -85,7 +97,7 @@ class MSOfficeParser(ZipParser): | |||
| 85 | r'^_rels/\.rels$', | 97 | r'^_rels/\.rels$', |
| 86 | r'^xl/sharedStrings\.xml$', # https://docs.microsoft.com/en-us/office/open-xml/working-with-the-shared-string-table | 98 | r'^xl/sharedStrings\.xml$', # https://docs.microsoft.com/en-us/office/open-xml/working-with-the-shared-string-table |
| 87 | r'^xl/calcChain\.xml$', | 99 | r'^xl/calcChain\.xml$', |
| 88 | r'^(?:word|ppt|xl)/_rels/document\.xml\.rels$', | 100 | r'^(?:word|ppt|xl)/_rels/(document|workbook|presentation)\.xml\.rels$', |
| 89 | r'^(?:word|ppt|xl)/_rels/footer[0-9]*\.xml\.rels$', | 101 | r'^(?:word|ppt|xl)/_rels/footer[0-9]*\.xml\.rels$', |
| 90 | r'^(?:word|ppt|xl)/_rels/header[0-9]*\.xml\.rels$', | 102 | r'^(?:word|ppt|xl)/_rels/header[0-9]*\.xml\.rels$', |
| 91 | r'^(?:word|ppt|xl)/charts/_rels/chart[0-9]+\.xml\.rels$', | 103 | r'^(?:word|ppt|xl)/charts/_rels/chart[0-9]+\.xml\.rels$', |
| @@ -100,6 +112,7 @@ class MSOfficeParser(ZipParser): | |||
| 100 | r'^ppt/slideLayouts/_rels/slideLayout[0-9]+\.xml\.rels$', | 112 | r'^ppt/slideLayouts/_rels/slideLayout[0-9]+\.xml\.rels$', |
| 101 | r'^ppt/slideLayouts/slideLayout[0-9]+\.xml$', | 113 | r'^ppt/slideLayouts/slideLayout[0-9]+\.xml$', |
| 102 | r'^(?:word|ppt|xl)/tableStyles\.xml$', | 114 | r'^(?:word|ppt|xl)/tableStyles\.xml$', |
| 115 | r'^(?:word|ppt|xl)/tables/table[0-9]+\.xml$', | ||
| 103 | r'^ppt/slides/_rels/slide[0-9]*\.xml\.rels$', | 116 | r'^ppt/slides/_rels/slide[0-9]*\.xml\.rels$', |
| 104 | r'^ppt/slides/slide[0-9]*\.xml$', | 117 | r'^ppt/slides/slide[0-9]*\.xml$', |
| 105 | # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx | 118 | # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx |
| @@ -109,8 +122,13 @@ class MSOfficeParser(ZipParser): | |||
| 109 | r'^ppt/slideMasters/slideMaster[0-9]+\.xml', | 122 | r'^ppt/slideMasters/slideMaster[0-9]+\.xml', |
| 110 | r'^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels', | 123 | r'^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels', |
| 111 | r'^xl/worksheets/_rels/sheet[0-9]+\.xml\.rels', | 124 | r'^xl/worksheets/_rels/sheet[0-9]+\.xml\.rels', |
| 112 | r'^xl/drawings/vmlDrawing[0-9]+\.vml', | 125 | r'^(?:word|ppt|xl)/drawings/vmlDrawing[0-9]+\.vml', |
| 113 | r'^xl/drawings/drawing[0-9]+\.xml', | 126 | r'^(?:word|ppt|xl)/drawings/drawing[0-9]+\.xml', |
| 127 | r'^(?:word|ppt|xl)/embeddings/Microsoft_Excel_Worksheet[0-9]+\.xlsx', | ||
| 128 | # rels for complicated powerpoints | ||
| 129 | r'^ppt/notesSlides/_rels/notesSlide[0-9]+\.xml\.rels', | ||
| 130 | r'^ppt/notesMasters/_rels/notesMaster[0-9]+\.xml\.rels', | ||
| 131 | r'^ppt/handoutMasters/_rels/handoutMaster[0-9]+\.xml\.rels', | ||
| 114 | })) | 132 | })) |
| 115 | self.files_to_omit = set(map(re.compile, { # type: ignore | 133 | self.files_to_omit = set(map(re.compile, { # type: ignore |
| 116 | r'^\[trash\]/', | 134 | r'^\[trash\]/', |
| @@ -120,18 +138,24 @@ class MSOfficeParser(ZipParser): | |||
| 120 | r'^(?:word|ppt|xl)/printerSettings/', | 138 | r'^(?:word|ppt|xl)/printerSettings/', |
| 121 | r'^(?:word|ppt|xl)/theme', | 139 | r'^(?:word|ppt|xl)/theme', |
| 122 | r'^(?:word|ppt|xl)/people\.xml$', | 140 | r'^(?:word|ppt|xl)/people\.xml$', |
| 141 | r'^(?:word|ppt|xl)/persons/person\.xml$', | ||
| 123 | r'^(?:word|ppt|xl)/numbering\.xml$', | 142 | r'^(?:word|ppt|xl)/numbering\.xml$', |
| 124 | r'^(?:word|ppt|xl)/tags/', | 143 | r'^(?:word|ppt|xl)/tags/', |
| 144 | r'^(?:word|ppt|xl)/glossary/', | ||
| 125 | # View properties like view mode, last viewed slide etc | 145 | # View properties like view mode, last viewed slide etc |
| 126 | r'^(?:word|ppt|xl)/viewProps\.xml$', | 146 | r'^(?:word|ppt|xl)/viewProps\.xml$', |
| 127 | # Additional presentation-wide properties like printing properties, | 147 | # Additional presentation-wide properties like printing properties, |
| 128 | # presentation show properties etc. | 148 | # presentation show properties etc. |
| 129 | r'^(?:word|ppt|xl)/presProps\.xml$', | 149 | r'^(?:word|ppt|xl)/presProps\.xml$', |
| 130 | r'^(?:word|ppt|xl)/comments[0-9]+\.xml$', | 150 | r'^(?:word|ppt|xl)/comments[0-9]+\.xml$', |
| 131 | | 151 | r'^(?:word|ppt|xl)/threadedComments/threadedComment[0-9]*\.xml$', |
| 152 | r'^(?:word|ppt|xl)/commentsExtended\.xml$', | ||
| 153 | r'^(?:word|ppt|xl)/commentsExtensible\.xml$', | ||
| 154 | r'^(?:word|ppt|xl)/commentsIds\.xml$', | ||
| 132 | # we have an allowlist in self.files_to_keep, | 155 | # we have an allowlist in self.files_to_keep, |
| 133 | # so we can trash everything else | 156 | # so we can trash everything else |
| 134 | r'^(?:word|ppt|xl)/_rels/', | 157 | r'^(?:word|ppt|xl)/_rels/', |
| 158 | r'docMetadata/LabelInfo\.xml$' | ||
| 135 | })) | 159 | })) |
| 136 | 160 | ||
| 137 | if self.__fill_files_to_keep_via_content_types() is False: | 161 | if self.__fill_files_to_keep_via_content_types() is False: |
