Refactor office get_meta handling a bit

This should make it easier to get more metadata from archive-based file formats.
author: jvoisin 2019-02-03 22:55:15 +0100
committer: jvoisin 2019-02-04 00:31:26 +0100
commit: b9a62d798af14ea799ae5fceab1ed7a537d1cbdd (patch)
tree: a50622baf990acface31398adaef395bb398ed5d
parent: 54e50450ad9f8657ed7c60d5c0f9ab5c648d08ee (diff)
3 files changed, 33 insertions, 24 deletions
diff --git a/libmat2/archive.py b/libmat2/archive.py
index b2483fc..d155664 100644
--- a/libmat2/archive.py
+++ b/libmat2/archive.py
@@ -4,7 +4,7 @@ import tempfile
 import os
 import logging
 import shutil
-from typing import Dict, Set, Pattern, Union
+from typing import Dict, Set, Pattern, Union, Any
 from . import abstract, UnknownMemberPolicy, parser_factory
@@ -42,6 +42,12 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
        # pylint: disable=unused-argument,no-self-use
        return True  # pragma: no cover
+    def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
+        """ This method can be used to extract specific metadata
+        from files present in the archive."""
+        # pylint: disable=unused-argument,no-self-use
+        return {}  # pragma: no cover
    @staticmethod
    def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
        zipinfo.create_system = 3  # Linux
@@ -74,6 +80,10 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
            temp_folder = tempfile.mkdtemp()
            for item in zin.infolist():
+                local_meta = dict()  # type: Dict[str, Union[str, Dict]]
+                for k, v in self._get_zipinfo_meta(item).items():
+                    local_meta[k] = v
                if item.filename[-1] == '/':  # pragma: no cover
                    # `is_dir` is added in Python3.6
                    continue  # don't keep empty folders
@@ -81,11 +91,15 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
                zin.extract(member=item, path=temp_folder)
                full_path = os.path.join(temp_folder, item.filename)
+                specific_meta = self._specific_get_meta(full_path, item.filename)
+                for (k, v) in specific_meta.items():
+                    local_meta[k] = v
                tmp_parser, _ = parser_factory.get_parser(full_path)  # type: ignore
-                if not tmp_parser:
+                if tmp_parser:
-                    continue
+                    for k, v in tmp_parser.get_meta().items():
+                        local_meta[k] = v
-                local_meta = tmp_parser.get_meta()
                if local_meta:
                    meta[item.filename] = local_meta
diff --git a/libmat2/office.py b/libmat2/office.py
index 365c230..dfad3b3 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -2,7 +2,7 @@ import logging
 import os
 import re
 import zipfile
-from typing import Dict, Set, Pattern, Tuple, Union
+from typing import Dict, Set, Pattern, Tuple, Union, Any
 import xml.etree.ElementTree as ET  # type: ignore
@@ -295,26 +295,21 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
        return True
-    def get_meta(self) -> Dict[str, Union[str, dict]]:
+    def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
        """
        Yes, I know that parsing xml with regexp ain't pretty,
        be my guest and fix it if you want.
        """
-        metadata = super().get_meta()
+        if not file_path.startswith('docProps/') or not file_path.endswith('.xml'):
-        zipin = zipfile.ZipFile(self.filename)
+            return {}
-        for item in zipin.infolist():
-            if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):
+        with open(full_path, encoding='utf-8') as f:
-                try:
+            try:
-                    content = zipin.read(item).decode('utf-8')
+                results = re.findall(r"<(.+)>(.+)</\1>", f.read(), re.I|re.M)
-                    results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M)
+                return {k:v for (k, v) in results}
-                    for (key, value) in results:
+            except (TypeError, UnicodeDecodeError):
-                        metadata[key] = value
+                # We didn't manage to parse the xml file
-                except (TypeError, UnicodeDecodeError):  # We didn't manage to parse the xml file
+                return {file_path: 'harmful content', }
-                    metadata[item.filename] = 'harmful content'
-            for key, value in self._get_zipinfo_meta(item).items():
-                metadata[key] = value
-        zipin.close()
-        return metadata
 class LibreOfficeParser(ArchiveBasedAbstractParser):
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 9354286..d692181 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -131,9 +131,9 @@ class TestGetMeta(unittest.TestCase):
    def test_docx(self):
        p = office.MSOfficeParser('./tests/data/dirty.docx')
        meta = p.get_meta()
-        self.assertEqual(meta['cp:lastModifiedBy'], 'Julien Voisin')
+        self.assertEqual(meta['docProps/core.xml']['cp:lastModifiedBy'], 'Julien Voisin')
-        self.assertEqual(meta['dc:creator'], 'julien voisin')
+        self.assertEqual(meta['docProps/core.xml']['dc:creator'], 'julien voisin')
-        self.assertEqual(meta['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1')
+        self.assertEqual(meta['docProps/app.xml']['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1')
    def test_libreoffice(self):
        p = office.LibreOfficeParser('./tests/data/dirty.odt')
author	jvoisin	2019-02-03 22:55:15 +0100
committer	jvoisin	2019-02-04 00:31:26 +0100
commit	b9a62d798af14ea799ae5fceab1ed7a537d1cbdd (patch)
tree	a50622baf990acface31398adaef395bb398ed5d
parent	54e50450ad9f8657ed7c60d5c0f9ab5c648d08ee (diff)

diff --git a/libmat2/archive.py b/libmat2/archive.py index b2483fc..d155664 100644 --- a/libmat2/archive.py +++ b/libmat2/archive.py
@@ -4,7 +4,7 @@ import tempfile
4	import os	4	import os
5	import logging	5	import logging
6	import shutil	6	import shutil
7	from typing import Dict, Set, Pattern, Union	7	from typing import Dict, Set, Pattern, Union, Any
8		8
9	from . import abstract, UnknownMemberPolicy, parser_factory	9	from . import abstract, UnknownMemberPolicy, parser_factory
10		10
@@ -42,6 +42,12 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
42	# pylint: disable=unused-argument,no-self-use	42	# pylint: disable=unused-argument,no-self-use
43	return True # pragma: no cover	43	return True # pragma: no cover
44		44
		45	def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
		46	""" This method can be used to extract specific metadata
		47	from files present in the archive."""
		48	# pylint: disable=unused-argument,no-self-use
		49	return {} # pragma: no cover
		50
45	@staticmethod	51	@staticmethod
46	def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:	52	def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
47	zipinfo.create_system = 3 # Linux	53	zipinfo.create_system = 3 # Linux
@@ -74,6 +80,10 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
74	temp_folder = tempfile.mkdtemp()	80	temp_folder = tempfile.mkdtemp()
75		81
76	for item in zin.infolist():	82	for item in zin.infolist():
		83	local_meta = dict() # type: Dict[str, Union[str, Dict]]
		84	for k, v in self._get_zipinfo_meta(item).items():
		85	local_meta[k] = v
		86
77	if item.filename[-1] == '/': # pragma: no cover	87	if item.filename[-1] == '/': # pragma: no cover
78	# `is_dir` is added in Python3.6	88	# `is_dir` is added in Python3.6
79	continue # don't keep empty folders	89	continue # don't keep empty folders
@@ -81,11 +91,15 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
81	zin.extract(member=item, path=temp_folder)	91	zin.extract(member=item, path=temp_folder)
82	full_path = os.path.join(temp_folder, item.filename)	92	full_path = os.path.join(temp_folder, item.filename)
83		93
		94	specific_meta = self._specific_get_meta(full_path, item.filename)
		95	for (k, v) in specific_meta.items():
		96	local_meta[k] = v
		97
84	tmp_parser, _ = parser_factory.get_parser(full_path) # type: ignore	98	tmp_parser, _ = parser_factory.get_parser(full_path) # type: ignore
85	if not tmp_parser:	99	if tmp_parser:
86	continue	100	for k, v in tmp_parser.get_meta().items():
		101	local_meta[k] = v
87		102
88	local_meta = tmp_parser.get_meta()
89	if local_meta:	103	if local_meta:
90	meta[item.filename] = local_meta	104	meta[item.filename] = local_meta
91		105


diff --git a/libmat2/office.py b/libmat2/office.py index 365c230..dfad3b3 100644 --- a/libmat2/office.py +++ b/libmat2/office.py
@@ -2,7 +2,7 @@ import logging
2	import os	2	import os
3	import re	3	import re
4	import zipfile	4	import zipfile
5	from typing import Dict, Set, Pattern, Tuple, Union	5	from typing import Dict, Set, Pattern, Tuple, Union, Any
6		6
7	import xml.etree.ElementTree as ET # type: ignore	7	import xml.etree.ElementTree as ET # type: ignore
8		8
@@ -295,26 +295,21 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
295		295
296	return True	296	return True
297		297
298	def get_meta(self) -> Dict[str, Union[str, dict]]:	298	def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
299	"""	299	"""
300	Yes, I know that parsing xml with regexp ain't pretty,	300	Yes, I know that parsing xml with regexp ain't pretty,
301	be my guest and fix it if you want.	301	be my guest and fix it if you want.
302	"""	302	"""
303	metadata = super().get_meta()	303	if not file_path.startswith('docProps/') or not file_path.endswith('.xml'):
304	zipin = zipfile.ZipFile(self.filename)	304	return {}
305	for item in zipin.infolist():	305
306	if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):	306	with open(full_path, encoding='utf-8') as f:
307	try:	307	try:
308	content = zipin.read(item).decode('utf-8')	308	results = re.findall(r"<(.+)>(.+)</\1>", f.read(), re.I|re.M)
309	results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M)	309	return {k:v for (k, v) in results}
310	for (key, value) in results:	310	except (TypeError, UnicodeDecodeError):
311	metadata[key] = value	311	# We didn't manage to parse the xml file
312	except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file	312	return {file_path: 'harmful content', }
313	metadata[item.filename] = 'harmful content'
314	for key, value in self._get_zipinfo_meta(item).items():
315	metadata[key] = value
316	zipin.close()
317	return metadata
318		313
319		314
320	class LibreOfficeParser(ArchiveBasedAbstractParser):	315	class LibreOfficeParser(ArchiveBasedAbstractParser):


diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 9354286..d692181 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py
@@ -131,9 +131,9 @@ class TestGetMeta(unittest.TestCase):
131	def test_docx(self):	131	def test_docx(self):
132	p = office.MSOfficeParser('./tests/data/dirty.docx')	132	p = office.MSOfficeParser('./tests/data/dirty.docx')
133	meta = p.get_meta()	133	meta = p.get_meta()
134	self.assertEqual(meta['cp:lastModifiedBy'], 'Julien Voisin')	134	self.assertEqual(meta['docProps/core.xml']['cp:lastModifiedBy'], 'Julien Voisin')
135	self.assertEqual(meta['dc:creator'], 'julien voisin')	135	self.assertEqual(meta['docProps/core.xml']['dc:creator'], 'julien voisin')
136	self.assertEqual(meta['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1')	136	self.assertEqual(meta['docProps/app.xml']['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1')
137		137
138	def test_libreoffice(self):	138	def test_libreoffice(self):
139	p = office.LibreOfficeParser('./tests/data/dirty.odt')	139	p = office.LibreOfficeParser('./tests/data/dirty.odt')