summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorjvoisin2019-02-03 22:55:15 +0100
committerjvoisin2019-02-04 00:31:26 +0100
commitb9a62d798af14ea799ae5fceab1ed7a537d1cbdd (patch)
treea50622baf990acface31398adaef395bb398ed5d
parent54e50450ad9f8657ed7c60d5c0f9ab5c648d08ee (diff)
Refactor a bit office get_meta handling
This should make it easier to get more metadata from archive-based file formats.
-rw-r--r--libmat2/archive.py22
-rw-r--r--libmat2/office.py29
-rw-r--r--tests/test_libmat2.py6
3 files changed, 33 insertions, 24 deletions
diff --git a/libmat2/archive.py b/libmat2/archive.py
index b2483fc..d155664 100644
--- a/libmat2/archive.py
+++ b/libmat2/archive.py
@@ -4,7 +4,7 @@ import tempfile
4import os 4import os
5import logging 5import logging
6import shutil 6import shutil
7from typing import Dict, Set, Pattern, Union 7from typing import Dict, Set, Pattern, Union, Any
8 8
9from . import abstract, UnknownMemberPolicy, parser_factory 9from . import abstract, UnknownMemberPolicy, parser_factory
10 10
@@ -42,6 +42,12 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
42 # pylint: disable=unused-argument,no-self-use 42 # pylint: disable=unused-argument,no-self-use
43 return True # pragma: no cover 43 return True # pragma: no cover
44 44
45 def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
46 """ This method can be used to extract specific metadata
47 from files present in the archive."""
48 # pylint: disable=unused-argument,no-self-use
49 return {} # pragma: no cover
50
45 @staticmethod 51 @staticmethod
46 def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: 52 def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
47 zipinfo.create_system = 3 # Linux 53 zipinfo.create_system = 3 # Linux
@@ -74,6 +80,10 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
74 temp_folder = tempfile.mkdtemp() 80 temp_folder = tempfile.mkdtemp()
75 81
76 for item in zin.infolist(): 82 for item in zin.infolist():
83 local_meta = dict() # type: Dict[str, Union[str, Dict]]
84 for k, v in self._get_zipinfo_meta(item).items():
85 local_meta[k] = v
86
77 if item.filename[-1] == '/': # pragma: no cover 87 if item.filename[-1] == '/': # pragma: no cover
78 # `is_dir` is added in Python3.6 88 # `is_dir` is added in Python3.6
79 continue # don't keep empty folders 89 continue # don't keep empty folders
@@ -81,11 +91,15 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
81 zin.extract(member=item, path=temp_folder) 91 zin.extract(member=item, path=temp_folder)
82 full_path = os.path.join(temp_folder, item.filename) 92 full_path = os.path.join(temp_folder, item.filename)
83 93
94 specific_meta = self._specific_get_meta(full_path, item.filename)
95 for (k, v) in specific_meta.items():
96 local_meta[k] = v
97
84 tmp_parser, _ = parser_factory.get_parser(full_path) # type: ignore 98 tmp_parser, _ = parser_factory.get_parser(full_path) # type: ignore
85 if not tmp_parser: 99 if tmp_parser:
86 continue 100 for k, v in tmp_parser.get_meta().items():
101 local_meta[k] = v
87 102
88 local_meta = tmp_parser.get_meta()
89 if local_meta: 103 if local_meta:
90 meta[item.filename] = local_meta 104 meta[item.filename] = local_meta
91 105
diff --git a/libmat2/office.py b/libmat2/office.py
index 365c230..dfad3b3 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -2,7 +2,7 @@ import logging
2import os 2import os
3import re 3import re
4import zipfile 4import zipfile
5from typing import Dict, Set, Pattern, Tuple, Union 5from typing import Dict, Set, Pattern, Tuple, Union, Any
6 6
7import xml.etree.ElementTree as ET # type: ignore 7import xml.etree.ElementTree as ET # type: ignore
8 8
@@ -295,26 +295,21 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
295 295
296 return True 296 return True
297 297
298 def get_meta(self) -> Dict[str, Union[str, dict]]: 298 def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
299 """ 299 """
300 Yes, I know that parsing xml with regexp ain't pretty, 300 Yes, I know that parsing xml with regexp ain't pretty,
301 be my guest and fix it if you want. 301 be my guest and fix it if you want.
302 """ 302 """
303 metadata = super().get_meta() 303 if not file_path.startswith('docProps/') or not file_path.endswith('.xml'):
304 zipin = zipfile.ZipFile(self.filename) 304 return {}
305 for item in zipin.infolist(): 305
306 if item.filename.startswith('docProps/') and item.filename.endswith('.xml'): 306 with open(full_path, encoding='utf-8') as f:
307 try: 307 try:
308 content = zipin.read(item).decode('utf-8') 308 results = re.findall(r"<(.+)>(.+)</\1>", f.read(), re.I|re.M)
309 results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M) 309 return {k:v for (k, v) in results}
310 for (key, value) in results: 310 except (TypeError, UnicodeDecodeError):
311 metadata[key] = value 311 # We didn't manage to parse the xml file
312 except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file 312 return {file_path: 'harmful content', }
313 metadata[item.filename] = 'harmful content'
314 for key, value in self._get_zipinfo_meta(item).items():
315 metadata[key] = value
316 zipin.close()
317 return metadata
318 313
319 314
320class LibreOfficeParser(ArchiveBasedAbstractParser): 315class LibreOfficeParser(ArchiveBasedAbstractParser):
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 9354286..d692181 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -131,9 +131,9 @@ class TestGetMeta(unittest.TestCase):
131 def test_docx(self): 131 def test_docx(self):
132 p = office.MSOfficeParser('./tests/data/dirty.docx') 132 p = office.MSOfficeParser('./tests/data/dirty.docx')
133 meta = p.get_meta() 133 meta = p.get_meta()
134 self.assertEqual(meta['cp:lastModifiedBy'], 'Julien Voisin') 134 self.assertEqual(meta['docProps/core.xml']['cp:lastModifiedBy'], 'Julien Voisin')
135 self.assertEqual(meta['dc:creator'], 'julien voisin') 135 self.assertEqual(meta['docProps/core.xml']['dc:creator'], 'julien voisin')
136 self.assertEqual(meta['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1') 136 self.assertEqual(meta['docProps/app.xml']['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1')
137 137
138 def test_libreoffice(self): 138 def test_libreoffice(self):
139 p = office.LibreOfficeParser('./tests/data/dirty.odt') 139 p = office.LibreOfficeParser('./tests/data/dirty.odt')