summaryrefslogtreecommitdiff
path: root/libmat2/office.py
diff options
context:
space:
mode:
authorjvoisin2019-02-03 22:55:15 +0100
committerjvoisin2019-02-04 00:31:26 +0100
commitb9a62d798af14ea799ae5fceab1ed7a537d1cbdd (patch)
treea50622baf990acface31398adaef395bb398ed5d /libmat2/office.py
parent54e50450ad9f8657ed7c60d5c0f9ab5c648d08ee (diff)
Refactor a bit office get_meta handling
This should make it easier to get more metadata from archive-based file formats.
Diffstat (limited to 'libmat2/office.py')
-rw-r--r--libmat2/office.py29
1 files changed, 12 insertions, 17 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index 365c230..dfad3b3 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -2,7 +2,7 @@ import logging
2import os 2import os
3import re 3import re
4import zipfile 4import zipfile
5from typing import Dict, Set, Pattern, Tuple, Union 5from typing import Dict, Set, Pattern, Tuple, Union, Any
6 6
7import xml.etree.ElementTree as ET # type: ignore 7import xml.etree.ElementTree as ET # type: ignore
8 8
@@ -295,26 +295,21 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
295 295
296 return True 296 return True
297 297
298 def get_meta(self) -> Dict[str, Union[str, dict]]: 298 def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
299 """ 299 """
300 Yes, I know that parsing xml with regexp ain't pretty, 300 Yes, I know that parsing xml with regexp ain't pretty,
301 be my guest and fix it if you want. 301 be my guest and fix it if you want.
302 """ 302 """
303 metadata = super().get_meta() 303 if not file_path.startswith('docProps/') or not file_path.endswith('.xml'):
304 zipin = zipfile.ZipFile(self.filename) 304 return {}
305 for item in zipin.infolist(): 305
306 if item.filename.startswith('docProps/') and item.filename.endswith('.xml'): 306 with open(full_path, encoding='utf-8') as f:
307 try: 307 try:
308 content = zipin.read(item).decode('utf-8') 308 results = re.findall(r"<(.+)>(.+)</\1>", f.read(), re.I|re.M)
309 results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M) 309 return {k:v for (k, v) in results}
310 for (key, value) in results: 310 except (TypeError, UnicodeDecodeError):
311 metadata[key] = value 311 # We didn't manage to parse the xml file
312 except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file 312 return {file_path: 'harmful content', }
313 metadata[item.filename] = 'harmful content'
314 for key, value in self._get_zipinfo_meta(item).items():
315 metadata[key] = value
316 zipin.close()
317 return metadata
318 313
319 314
320class LibreOfficeParser(ArchiveBasedAbstractParser): 315class LibreOfficeParser(ArchiveBasedAbstractParser):