summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: jvoisin, 2019-02-07 21:58:10 +0100
committer: jvoisin, 2019-02-07 22:19:37 +0100
commit: e1dd439fc86ba15816e2331e8bed67dd7147e368 (patch)
tree: 0c8e368fcb9c409fa2182018b166ec4f18cdd98c
parent: b9a62d798af14ea799ae5fceab1ed7a537d1cbdd (diff)
Use of the archive refactoring for the office documents too
-rw-r--r-- libmat2/office.py 28
-rw-r--r-- tests/test_corrupted_files.py 7
-rw-r--r-- tests/test_libmat2.py 14
3 files changed, 19 insertions, 30 deletions
diff --git a/libmat2/office.py b/libmat2/office.py
index dfad3b3..0c9caa8 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -2,7 +2,7 @@ import logging
2import os 2import os
3import re 3import re
4import zipfile 4import zipfile
5from typing import Dict, Set, Pattern, Tuple, Union, Any 5from typing import Dict, Set, Pattern, Tuple, Any
6 6
7import xml.etree.ElementTree as ET # type: ignore 7import xml.etree.ElementTree as ET # type: ignore
8 8
@@ -375,23 +375,17 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
375 return False 375 return False
376 return True 376 return True
377 377
378 def get_meta(self) -> Dict[str, Union[str, dict]]: 378 def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
379 """ 379 """
380 Yes, I know that parsing xml with regexp ain't pretty, 380 Yes, I know that parsing xml with regexp ain't pretty,
381 be my guest and fix it if you want. 381 be my guest and fix it if you want.
382 """ 382 """
383 metadata = {} 383 if file_path != 'meta.xml':
384 zipin = zipfile.ZipFile(self.filename) 384 return {}
385 for item in zipin.infolist(): 385 with open(full_path, encoding='utf-8') as f:
386 if item.filename == 'meta.xml': 386 try:
387 try: 387 results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", f.read(), re.I|re.M)
388 content = zipin.read(item).decode('utf-8') 388 return {k:v for (k, v) in results}
389 results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M) 389 except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file
390 for (key, value) in results: 390 # We didn't manage to parse the xml file
391 metadata[key] = value 391 return {file_path: 'harmful content', }
392 except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file
393 metadata[item.filename] = 'harmful content'
394 for key, value in self._get_zipinfo_meta(item).items():
395 metadata[key] = value
396 zipin.close()
397 return metadata
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py
index e7d3c2a..b2e7798 100644
--- a/tests/test_corrupted_files.py
+++ b/tests/test_corrupted_files.py
@@ -67,15 +67,10 @@ class TestCorruptedEmbedded(unittest.TestCase):
67 os.remove('./tests/data/clean.docx') 67 os.remove('./tests/data/clean.docx')
68 68
69 def test_odt(self): 69 def test_odt(self):
70 expected = {
71 'create_system': 'Weird',
72 'date_time': '2018-06-10 17:18:18',
73 'meta.xml': 'harmful content'
74 }
75 shutil.copy('./tests/data/embedded_corrupted.odt', './tests/data/clean.odt') 70 shutil.copy('./tests/data/embedded_corrupted.odt', './tests/data/clean.odt')
76 parser, _ = parser_factory.get_parser('./tests/data/clean.odt') 71 parser, _ = parser_factory.get_parser('./tests/data/clean.odt')
77 self.assertFalse(parser.remove_all()) 72 self.assertFalse(parser.remove_all())
78 self.assertEqual(parser.get_meta(), expected) 73 self.assertTrue(parser.get_meta())
79 os.remove('./tests/data/clean.odt') 74 os.remove('./tests/data/clean.odt')
80 75
81 76
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index d692181..548b076 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -138,14 +138,14 @@ class TestGetMeta(unittest.TestCase):
138 def test_libreoffice(self): 138 def test_libreoffice(self):
139 p = office.LibreOfficeParser('./tests/data/dirty.odt') 139 p = office.LibreOfficeParser('./tests/data/dirty.odt')
140 meta = p.get_meta() 140 meta = p.get_meta()
141 self.assertEqual(meta['meta:initial-creator'], 'jvoisin ') 141 self.assertEqual(meta['meta.xml']['meta:initial-creator'], 'jvoisin ')
142 self.assertEqual(meta['meta:creation-date'], '2011-07-26T03:27:48') 142 self.assertEqual(meta['meta.xml']['meta:creation-date'], '2011-07-26T03:27:48')
143 self.assertEqual(meta['meta:generator'], 'LibreOffice/3.3$Unix LibreOffice_project/330m19$Build-202') 143 self.assertEqual(meta['meta.xml']['meta:generator'], 'LibreOffice/3.3$Unix LibreOffice_project/330m19$Build-202')
144 144
145 p = office.LibreOfficeParser('./tests/data/weird_producer.odt') 145 p = office.LibreOfficeParser('./tests/data/weird_producer.odt')
146 meta = p.get_meta() 146 meta = p.get_meta()
147 self.assertEqual(meta['create_system'], 'Windows') 147 self.assertEqual(meta['mimetype']['create_system'], 'Windows')
148 self.assertEqual(meta['comment'], b'YAY FOR COMMENTS') 148 self.assertEqual(meta['mimetype']['comment'], b'YAY FOR COMMENTS')
149 149
150 def test_txt(self): 150 def test_txt(self):
151 p, mimetype = parser_factory.get_parser('./tests/data/dirty.txt') 151 p, mimetype = parser_factory.get_parser('./tests/data/dirty.txt')
@@ -440,7 +440,7 @@ class TestCleaning(unittest.TestCase):
440 p = office.LibreOfficeParser('./tests/data/clean.odf') 440 p = office.LibreOfficeParser('./tests/data/clean.odf')
441 441
442 meta = p.get_meta() 442 meta = p.get_meta()
443 self.assertEqual(meta['meta:creation-date'], '2018-04-23T00:18:59.438231281') 443 self.assertEqual(meta['meta.xml']['meta:creation-date'], '2018-04-23T00:18:59.438231281')
444 444
445 ret = p.remove_all() 445 ret = p.remove_all()
446 self.assertTrue(ret) 446 self.assertTrue(ret)
@@ -458,7 +458,7 @@ class TestCleaning(unittest.TestCase):
458 p = office.LibreOfficeParser('./tests/data/clean.odg') 458 p = office.LibreOfficeParser('./tests/data/clean.odg')
459 459
460 meta = p.get_meta() 460 meta = p.get_meta()
461 self.assertEqual(meta['dc:date'], '2018-04-23T00:26:59.385838550') 461 self.assertEqual(meta['meta.xml']['dc:date'], '2018-04-23T00:26:59.385838550')
462 462
463 ret = p.remove_all() 463 ret = p.remove_all()
464 self.assertTrue(ret) 464 self.assertTrue(ret)