summary | refs | log | tree | commit | diff
path: root/libmat2
diff options
context:
space:
mode:
author: jvoisin, 2019-02-07 21:58:10 +0100
committer: jvoisin, 2019-02-07 22:19:37 +0100
commit: e1dd439fc86ba15816e2331e8bed67dd7147e368 (patch)
tree: 0c8e368fcb9c409fa2182018b166ec4f18cdd98c /libmat2
parent: b9a62d798af14ea799ae5fceab1ed7a537d1cbdd (diff)
Use of the archive refactoring for the office documents too
Diffstat (limited to 'libmat2')
-rw-r--r-- libmat2/office.py | 28
1 file changed, 11 insertions, 17 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index dfad3b3..0c9caa8 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -2,7 +2,7 @@ import logging
2import os 2import os
3import re 3import re
4import zipfile 4import zipfile
5from typing import Dict, Set, Pattern, Tuple, Union, Any 5from typing import Dict, Set, Pattern, Tuple, Any
6 6
7import xml.etree.ElementTree as ET # type: ignore 7import xml.etree.ElementTree as ET # type: ignore
8 8
@@ -375,23 +375,17 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
375 return False 375 return False
376 return True 376 return True
377 377
378 def get_meta(self) -> Dict[str, Union[str, dict]]: 378 def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
379 """ 379 """
380 Yes, I know that parsing xml with regexp ain't pretty, 380 Yes, I know that parsing xml with regexp ain't pretty,
381 be my guest and fix it if you want. 381 be my guest and fix it if you want.
382 """ 382 """
383 metadata = {} 383 if file_path != 'meta.xml':
384 zipin = zipfile.ZipFile(self.filename) 384 return {}
385 for item in zipin.infolist(): 385 with open(full_path, encoding='utf-8') as f:
386 if item.filename == 'meta.xml': 386 try:
387 try: 387 results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", f.read(), re.I|re.M)
388 content = zipin.read(item).decode('utf-8') 388 return {k:v for (k, v) in results}
389 results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M) 389 except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file
390 for (key, value) in results: 390 # We didn't manage to parse the xml file
391 metadata[key] = value 391 return {file_path: 'harmful content', }
392 except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file
393 metadata[item.filename] = 'harmful content'
394 for key, value in self._get_zipinfo_meta(item).items():
395 metadata[key] = value
396 zipin.close()
397 return metadata