summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Jason Smalls 2023-07-11 21:36:52 +0200
committer jvoisin 2023-07-11 21:38:22 +0200
commit 8c26020f67b45bc865fe1e2d645af9d99110ac41 (patch)
tree bb4200d0dc006837fee6c8ddbe4988266ca9ff8d
parent a0c97b25c46af07408ae1fd364090077fe600538 (diff)
Add more files to ignore for MSOffice documents
-rw-r--r-- libmat2/office.py 32
1 files changed, 28 insertions, 4 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index 16b20c9..74cc76a 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -63,8 +63,20 @@ class MSOfficeParser(ZipParser):
63 'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml', # /word/footer.xml 63 'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml', # /word/footer.xml
64 'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml', # /word/header.xml 64 'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml', # /word/header.xml
65 'application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml', # /word/styles.xml 65 'application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml', # /word/styles.xml
66 'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml (used for bullet point formatting)
67 'application/vnd.openxmlformats-officedocument.theme+xml', # /word/theme/theme[0-9].xml (used for font and background coloring, etc.)
66 'application/vnd.openxmlformats-package.core-properties+xml', # /docProps/core.xml 68 'application/vnd.openxmlformats-package.core-properties+xml', # /docProps/core.xml
67 69
70 # for more complicated powerpoints
71 'application/vnd.openxmlformats-officedocument.presentationml.notesSlide+xml',
72 'application/vnd.openxmlformats-officedocument.presentationml.notesMaster+xml',
73 'application/vnd.openxmlformats-officedocument.presentationml.handoutMaster+xml',
74 'application/vnd.openxmlformats-officedocument.drawingml.diagramData+xml',
75 'application/vnd.openxmlformats-officedocument.drawingml.diagramLayout+xml',
76 'application/vnd.openxmlformats-officedocument.drawingml.diagramStyle+xml',
77 'application/vnd.openxmlformats-officedocument.drawingml.diagramColors+xml',
78 'application/vnd.ms-office.drawingml.diagramDrawing+xml',
79
68 # Do we want to keep the following ones? 80 # Do we want to keep the following ones?
69 'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml', 81 'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml',
70 } 82 }
@@ -85,7 +97,7 @@ class MSOfficeParser(ZipParser):
85 r'^_rels/\.rels$', 97 r'^_rels/\.rels$',
86 r'^xl/sharedStrings\.xml$', # https://docs.microsoft.com/en-us/office/open-xml/working-with-the-shared-string-table 98 r'^xl/sharedStrings\.xml$', # https://docs.microsoft.com/en-us/office/open-xml/working-with-the-shared-string-table
87 r'^xl/calcChain\.xml$', 99 r'^xl/calcChain\.xml$',
88 r'^(?:word|ppt|xl)/_rels/document\.xml\.rels$', 100 r'^(?:word|ppt|xl)/_rels/(document|workbook|presentation)\.xml\.rels$',
89 r'^(?:word|ppt|xl)/_rels/footer[0-9]*\.xml\.rels$', 101 r'^(?:word|ppt|xl)/_rels/footer[0-9]*\.xml\.rels$',
90 r'^(?:word|ppt|xl)/_rels/header[0-9]*\.xml\.rels$', 102 r'^(?:word|ppt|xl)/_rels/header[0-9]*\.xml\.rels$',
91 r'^(?:word|ppt|xl)/charts/_rels/chart[0-9]+\.xml\.rels$', 103 r'^(?:word|ppt|xl)/charts/_rels/chart[0-9]+\.xml\.rels$',
@@ -100,6 +112,7 @@ class MSOfficeParser(ZipParser):
100 r'^ppt/slideLayouts/_rels/slideLayout[0-9]+\.xml\.rels$', 112 r'^ppt/slideLayouts/_rels/slideLayout[0-9]+\.xml\.rels$',
101 r'^ppt/slideLayouts/slideLayout[0-9]+\.xml$', 113 r'^ppt/slideLayouts/slideLayout[0-9]+\.xml$',
102 r'^(?:word|ppt|xl)/tableStyles\.xml$', 114 r'^(?:word|ppt|xl)/tableStyles\.xml$',
115 r'^(?:word|ppt|xl)/tables/table[0-9]+\.xml$',
103 r'^ppt/slides/_rels/slide[0-9]*\.xml\.rels$', 116 r'^ppt/slides/_rels/slide[0-9]*\.xml\.rels$',
104 r'^ppt/slides/slide[0-9]*\.xml$', 117 r'^ppt/slides/slide[0-9]*\.xml$',
105 # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx 118 # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
@@ -109,8 +122,13 @@ class MSOfficeParser(ZipParser):
109 r'^ppt/slideMasters/slideMaster[0-9]+\.xml', 122 r'^ppt/slideMasters/slideMaster[0-9]+\.xml',
110 r'^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels', 123 r'^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels',
111 r'^xl/worksheets/_rels/sheet[0-9]+\.xml\.rels', 124 r'^xl/worksheets/_rels/sheet[0-9]+\.xml\.rels',
112 r'^xl/drawings/vmlDrawing[0-9]+\.vml', 125 r'^(?:word|ppt|xl)/drawings/vmlDrawing[0-9]+\.vml',
113 r'^xl/drawings/drawing[0-9]+\.xml', 126 r'^(?:word|ppt|xl)/drawings/drawing[0-9]+\.xml',
127 r'^(?:word|ppt|xl)/embeddings/Microsoft_Excel_Worksheet[0-9]+\.xlsx',
128 # rels for complicated powerpoints
129 r'^ppt/notesSlides/_rels/notesSlide[0-9]+\.xml\.rels',
130 r'^ppt/notesMasters/_rels/notesMaster[0-9]+\.xml\.rels',
131 r'^ppt/handoutMasters/_rels/handoutMaster[0-9]+\.xml\.rels',
114 })) 132 }))
115 self.files_to_omit = set(map(re.compile, { # type: ignore 133 self.files_to_omit = set(map(re.compile, { # type: ignore
116 r'^\[trash\]/', 134 r'^\[trash\]/',
@@ -120,18 +138,24 @@ class MSOfficeParser(ZipParser):
120 r'^(?:word|ppt|xl)/printerSettings/', 138 r'^(?:word|ppt|xl)/printerSettings/',
121 r'^(?:word|ppt|xl)/theme', 139 r'^(?:word|ppt|xl)/theme',
122 r'^(?:word|ppt|xl)/people\.xml$', 140 r'^(?:word|ppt|xl)/people\.xml$',
141 r'^(?:word|ppt|xl)/persons/person\.xml$',
123 r'^(?:word|ppt|xl)/numbering\.xml$', 142 r'^(?:word|ppt|xl)/numbering\.xml$',
124 r'^(?:word|ppt|xl)/tags/', 143 r'^(?:word|ppt|xl)/tags/',
144 r'^(?:word|ppt|xl)/glossary/',
125 # View properties like view mode, last viewed slide etc 145 # View properties like view mode, last viewed slide etc
126 r'^(?:word|ppt|xl)/viewProps\.xml$', 146 r'^(?:word|ppt|xl)/viewProps\.xml$',
127 # Additional presentation-wide properties like printing properties, 147 # Additional presentation-wide properties like printing properties,
128 # presentation show properties etc. 148 # presentation show properties etc.
129 r'^(?:word|ppt|xl)/presProps\.xml$', 149 r'^(?:word|ppt|xl)/presProps\.xml$',
130 r'^(?:word|ppt|xl)/comments[0-9]+\.xml$', 150 r'^(?:word|ppt|xl)/comments[0-9]+\.xml$',
131 151 r'^(?:word|ppt|xl)/threadedComments/threadedComment[0-9]*\.xml$',
152 r'^(?:word|ppt|xl)/commentsExtended\.xml$',
153 r'^(?:word|ppt|xl)/commentsExtensible\.xml$',
154 r'^(?:word|ppt|xl)/commentsIds\.xml$',
132 # we have an allowlist in self.files_to_keep, 155 # we have an allowlist in self.files_to_keep,
133 # so we can trash everything else 156 # so we can trash everything else
134 r'^(?:word|ppt|xl)/_rels/', 157 r'^(?:word|ppt|xl)/_rels/',
158 r'docMetadata/LabelInfo\.xml$'
135 })) 159 }))
136 160
137 if self.__fill_files_to_keep_via_content_types() is False: 161 if self.__fill_files_to_keep_via_content_types() is False: