2 files changed, 104 insertions, 29 deletions
diff --git a/libmat2/epub.py b/libmat2/epub.py
index 09b7937..d385465 100644
--- a/libmat2/epub.py
+++ b/libmat2/epub.py
@@ -1,11 +1,13 @@
 import logging
 import re
+import uuid
 import xml.etree.ElementTree as ET  # type: ignore
 from . import archive, office
 class EPUBParser(archive.ArchiveBasedAbstractParser):
    mimetypes = {'application/epub+zip', }
+    metadata_namespace = '{http://purl.org/dc/elements/1.1/}'
    def __init__(self, filename):
        super().__init__(filename)
@@ -14,6 +16,7 @@ class EPUBParser(archive.ArchiveBasedAbstractParser):
            'mimetype',
            'OEBPS/content.opf',
            }))
+        self.uniqid = uuid.uuid4()
    def _specific_get_meta(self, full_path, file_path):
        if file_path != 'OEBPS/content.opf':
@@ -25,23 +28,52 @@ class EPUBParser(archive.ArchiveBasedAbstractParser):
                                     f.read(), re.I|re.M)
                return {k:v for (k, v) in results}
            except (TypeError, UnicodeDecodeError):
-                # We didn't manage to parse the xml file
                return {file_path: 'harmful content', }
    def _specific_cleanup(self, full_path: str):
-        if not full_path.endswith('OEBPS/content.opf'):
+        if full_path.endswith('OEBPS/content.opf'):
-            return True
+            return self.__handle_contentopf(full_path)
+        elif full_path.endswith('OEBPS/toc.ncx'):
+            return self.__handle_tocncx(full_path)
+        return True
+    def __handle_tocncx(self, full_path: str):
+        try:
+            tree, namespace = office._parse_xml(full_path)
+        except ET.ParseError:  # pragma: nocover
+            logging.error("Unable to parse %s in %s.", full_path, self.filename)
+            return False
+        for item in tree.iterfind('.//', namespace):  # pragma: nocover
+            if item.tag.strip().lower().endswith('head'):
+                item.clear()
+                ET.SubElement(item, 'meta', attrib={'name': '', 'content': ''})
+                break
+        tree.write(full_path, xml_declaration=True, encoding='utf-8',
+                   short_empty_elements=False)
+        return True
+    def __handle_contentopf(self, full_path: str):
        try:
            tree, namespace = office._parse_xml(full_path)
        except ET.ParseError:
            logging.error("Unable to parse %s in %s.", full_path, self.filename)
            return False
-        parent_map = {c:p for p in tree.iter() for c in p}
-        for item in tree.iterfind('.//', namespace):
+        for item in tree.iterfind('.//', namespace):  # pragma: nocover
            if item.tag.strip().lower().endswith('metadata'):
-                parent_map[item].remove(item)
+                item.clear()
+                # item with mandatory content
+                uniqid = ET.Element(self.metadata_namespace + 'identifier')
+                uniqid.text = str(self.uniqid)
+                uniqid.set('id', 'id')
+                item.append(uniqid)
+                # items without mandatory content
+                for name in {'language', 'title'}:
+                    uniqid = ET.Element(self.metadata_namespace + name)
+                    item.append(uniqid)
                break  # there is only a single <metadata> block
-        tree.write(full_path, xml_declaration=True)
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True
diff --git a/libmat2/web.py b/libmat2/web.py
index c11b47d..067f5f9 100644
--- a/libmat2/web.py
+++ b/libmat2/web.py
@@ -1,10 +1,13 @@
-from html import parser
+from html import parser, escape
-from typing import Dict, Any, List, Tuple
+from typing import Dict, Any, List, Tuple, Set
 import re
 import string
 from . import abstract
+assert Set
+# pylint: disable=too-many-instance-attributes
 class CSSParser(abstract.AbstractParser):
    """There is no such things as metadata in CSS files,
@@ -33,11 +36,16 @@ class CSSParser(abstract.AbstractParser):
        return metadata
-class HTMLParser(abstract.AbstractParser):
+class AbstractHTMLParser(abstract.AbstractParser):
-    mimetypes = {'text/html', 'application/x-dtbncx+xml', }
+    tags_blacklist = set()  # type: Set[str]
+    # In some html/xml based formats some tags are mandatory,
+    # so we're keeping them, but are discarding their contents
+    tags_required_blacklist = set()  # type: Set[str]
    def __init__(self, filename):
        super().__init__(filename)
-        self.__parser = _HTMLParser(self.filename)
+        self.__parser = _HTMLParser(self.filename, self.tags_blacklist,
+                                    self.tags_required_blacklist)
        with open(filename, encoding='utf-8') as f:
            self.__parser.feed(f.read())
        self.__parser.close()
@@ -49,29 +57,50 @@ class HTMLParser(abstract.AbstractParser):
        return self.__parser.remove_all(self.output_filename)
+class HTMLParser(AbstractHTMLParser):
+    mimetypes = {'text/html', }
+    tags_blacklist = {'meta', }
+    tags_required_blacklist = {'title', }
+class DTBNCXParser(AbstractHTMLParser):
+    mimetypes = {'application/x-dtbncx+xml', }
+    tags_required_blacklist = {'title', 'doctitle', 'meta'}
 class _HTMLParser(parser.HTMLParser):
    """Python doesn't have a validating html parser in its stdlib, so
    we're using an internal queue to track all the opening/closing tags,
    and hoping for the best.
    """
-    tag_blacklist = {'doctitle', 'meta', 'title'}  # everything is lowercase
+    def __init__(self, filename, blacklisted_tags, required_blacklisted_tags):
-    def __init__(self, filename):
        super().__init__()
        self.filename = filename
        self.__textrepr = ''
        self.__meta = {}
-        self.__validation_queue = []
+        self.__validation_queue = []  # type: List[str]
-        # We're using a counter instead of a boolean to handle nested tags
+        # We're using counters instead of booleans, to handle nested tags
+        self.__in_dangerous_but_required_tag = 0
        self.__in_dangerous_tag = 0
+        if required_blacklisted_tags & blacklisted_tags:  # pragma: nocover
+            raise ValueError("There is an overlap between %s and %s" % (
+                required_blacklisted_tags, blacklisted_tags))
+        self.tag_required_blacklist = required_blacklisted_tags
+        self.tag_blacklist = blacklisted_tags
    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
-        self.__validation_queue.append(tag)
+        original_tag = self.get_starttag_text()
+        self.__validation_queue.append(original_tag)
+        if tag in self.tag_required_blacklist:
+            self.__in_dangerous_but_required_tag += 1
        if tag in self.tag_blacklist:
            self.__in_dangerous_tag += 1
-            return
        if self.__in_dangerous_tag == 0:
-            self.__textrepr += self.get_starttag_text()
+            if self.__in_dangerous_but_required_tag <= 1:
+                self.__textrepr += original_tag
    def handle_endtag(self, tag: str):
        if not self.__validation_queue:
@@ -79,29 +108,43 @@ class _HTMLParser(parser.HTMLParser):
                             "opening one in %s." % (tag, self.filename))
        previous_tag = self.__validation_queue.pop()
-        if tag != previous_tag:
+        previous_tag = previous_tag[1:-1]  # remove < and >
+        previous_tag = previous_tag.split(' ')[0]  # remove attributes
+        if tag != previous_tag.lower():
            raise ValueError("The closing tag %s doesn't match the previous "
                             "tag %s in %s" %
                             (tag, previous_tag, self.filename))
-        elif tag in self.tag_blacklist:
-            self.__in_dangerous_tag -= 1
-            return
        if self.__in_dangerous_tag == 0:
-            # There is no `get_endtag_text()` method :/
+            if self.__in_dangerous_but_required_tag <= 1:
-            self.__textrepr += '</' + tag + '>\n'
+                # There is no `get_endtag_text()` method :/
+                self.__textrepr += '</' + previous_tag + '>'
+        if tag in self.tag_required_blacklist:
+            self.__in_dangerous_but_required_tag -= 1
+        elif tag in self.tag_blacklist:
+            self.__in_dangerous_tag -= 1
    def handle_data(self, data: str):
-        if self.__in_dangerous_tag == 0 and data.strip():
+        if self.__in_dangerous_but_required_tag == 0:
-            self.__textrepr += data
+            if self.__in_dangerous_tag == 0:
+                if data.strip():
+                    self.__textrepr += escape(data)
    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
-        if tag in self.tag_blacklist:
+        if tag in self.tag_required_blacklist | self.tag_blacklist:
            meta = {k:v for k, v in attrs}
            name = meta.get('name', 'harmful metadata')
            content = meta.get('content', 'harmful data')
            self.__meta[name] = content
-        else:
+            if self.__in_dangerous_tag != 0:
+                return
+            elif tag in self.tag_required_blacklist:
+                self.__textrepr += '<' + tag + ' />'
+            return
+        if self.__in_dangerous_but_required_tag == 0:
            if self.__in_dangerous_tag == 0:
                self.__textrepr += self.get_starttag_text()

diff --git a/libmat2/epub.py b/libmat2/epub.py index 09b7937..d385465 100644 --- a/libmat2/epub.py +++ b/libmat2/epub.py
@@ -1,11 +1,13 @@
1	import logging	1	import logging
2	import re	2	import re
		3	import uuid
3	import xml.etree.ElementTree as ET # type: ignore	4	import xml.etree.ElementTree as ET # type: ignore
4		5
5	from . import archive, office	6	from . import archive, office
6		7
7	class EPUBParser(archive.ArchiveBasedAbstractParser):	8	class EPUBParser(archive.ArchiveBasedAbstractParser):
8	mimetypes = {'application/epub+zip', }	9	mimetypes = {'application/epub+zip', }
		10	metadata_namespace = '{http://purl.org/dc/elements/1.1/}'
9		11
10	def __init__(self, filename):	12	def __init__(self, filename):
11	super().__init__(filename)	13	super().__init__(filename)
@@ -14,6 +16,7 @@ class EPUBParser(archive.ArchiveBasedAbstractParser):
14	'mimetype',	16	'mimetype',
15	'OEBPS/content.opf',	17	'OEBPS/content.opf',
16	}))	18	}))
		19	self.uniqid = uuid.uuid4()
17		20
18	def _specific_get_meta(self, full_path, file_path):	21	def _specific_get_meta(self, full_path, file_path):
19	if file_path != 'OEBPS/content.opf':	22	if file_path != 'OEBPS/content.opf':
@@ -25,23 +28,52 @@ class EPUBParser(archive.ArchiveBasedAbstractParser):
25	f.read(), re.I|re.M)	28	f.read(), re.I|re.M)
26	return {k:v for (k, v) in results}	29	return {k:v for (k, v) in results}
27	except (TypeError, UnicodeDecodeError):	30	except (TypeError, UnicodeDecodeError):
28	# We didn't manage to parse the xml file
29	return {file_path: 'harmful content', }	31	return {file_path: 'harmful content', }
30		32
31	def _specific_cleanup(self, full_path: str):	33	def _specific_cleanup(self, full_path: str):
32	if not full_path.endswith('OEBPS/content.opf'):	34	if full_path.endswith('OEBPS/content.opf'):
33	return True	35	return self.__handle_contentopf(full_path)
		36	elif full_path.endswith('OEBPS/toc.ncx'):
		37	return self.__handle_tocncx(full_path)
		38	return True
		39
		40	def __handle_tocncx(self, full_path: str):
		41	try:
		42	tree, namespace = office._parse_xml(full_path)
		43	except ET.ParseError: # pragma: nocover
		44	logging.error("Unable to parse %s in %s.", full_path, self.filename)
		45	return False
		46
		47	for item in tree.iterfind('.//', namespace): # pragma: nocover
		48	if item.tag.strip().lower().endswith('head'):
		49	item.clear()
		50	ET.SubElement(item, 'meta', attrib={'name': '', 'content': ''})
		51	break
		52	tree.write(full_path, xml_declaration=True, encoding='utf-8',
		53	short_empty_elements=False)
		54	return True
34		55
		56	def __handle_contentopf(self, full_path: str):
35	try:	57	try:
36	tree, namespace = office._parse_xml(full_path)	58	tree, namespace = office._parse_xml(full_path)
37	except ET.ParseError:	59	except ET.ParseError:
38	logging.error("Unable to parse %s in %s.", full_path, self.filename)	60	logging.error("Unable to parse %s in %s.", full_path, self.filename)
39	return False	61	return False
40	parent_map = {c:p for p in tree.iter() for c in p}
41		62
42	for item in tree.iterfind('.//', namespace):	63	for item in tree.iterfind('.//', namespace): # pragma: nocover
43	if item.tag.strip().lower().endswith('metadata'):	64	if item.tag.strip().lower().endswith('metadata'):
44	parent_map[item].remove(item)	65	item.clear()
		66
		67	# item with mandatory content
		68	uniqid = ET.Element(self.metadata_namespace + 'identifier')
		69	uniqid.text = str(self.uniqid)
		70	uniqid.set('id', 'id')
		71	item.append(uniqid)
		72
		73	# items without mandatory content
		74	for name in {'language', 'title'}:
		75	uniqid = ET.Element(self.metadata_namespace + name)
		76	item.append(uniqid)
45	break # there is only a single <metadata> block	77	break # there is only a single <metadata> block
46	tree.write(full_path, xml_declaration=True)	78	tree.write(full_path, xml_declaration=True, encoding='utf-8')
47	return True	79	return True


diff --git a/libmat2/web.py b/libmat2/web.py index c11b47d..067f5f9 100644 --- a/libmat2/web.py +++ b/libmat2/web.py
@@ -1,10 +1,13 @@
1	from html import parser	1	from html import parser, escape
2	from typing import Dict, Any, List, Tuple	2	from typing import Dict, Any, List, Tuple, Set
3	import re	3	import re
4	import string	4	import string
5		5
6	from . import abstract	6	from . import abstract
7		7
		8	assert Set
		9
		10	# pylint: disable=too-many-instance-attributes
8		11
9	class CSSParser(abstract.AbstractParser):	12	class CSSParser(abstract.AbstractParser):
10	"""There is no such things as metadata in CSS files,	13	"""There is no such things as metadata in CSS files,
@@ -33,11 +36,16 @@ class CSSParser(abstract.AbstractParser):
33	return metadata	36	return metadata
34		37
35		38
36	class HTMLParser(abstract.AbstractParser):	39	class AbstractHTMLParser(abstract.AbstractParser):
37	mimetypes = {'text/html', 'application/x-dtbncx+xml', }	40	tags_blacklist = set() # type: Set[str]
		41	# In some html/xml based formats some tags are mandatory,
		43	# so we're keeping them, but are discarding their contents
		43	tags_required_blacklist = set() # type: Set[str]
		44
38	def __init__(self, filename):	45	def __init__(self, filename):
39	super().__init__(filename)	46	super().__init__(filename)
40	self.__parser = _HTMLParser(self.filename)	47	self.__parser = _HTMLParser(self.filename, self.tags_blacklist,
		48	self.tags_required_blacklist)
41	with open(filename, encoding='utf-8') as f:	49	with open(filename, encoding='utf-8') as f:
42	self.__parser.feed(f.read())	50	self.__parser.feed(f.read())
43	self.__parser.close()	51	self.__parser.close()
@@ -49,29 +57,50 @@ class HTMLParser(abstract.AbstractParser):
49	return self.__parser.remove_all(self.output_filename)	57	return self.__parser.remove_all(self.output_filename)
50		58
51		59
		60	class HTMLParser(AbstractHTMLParser):
		61	mimetypes = {'text/html', }
		62	tags_blacklist = {'meta', }
		63	tags_required_blacklist = {'title', }
		64
		65
		66	class DTBNCXParser(AbstractHTMLParser):
		67	mimetypes = {'application/x-dtbncx+xml', }
		68	tags_required_blacklist = {'title', 'doctitle', 'meta'}
		69
		70
52	class _HTMLParser(parser.HTMLParser):	71	class _HTMLParser(parser.HTMLParser):
53	"""Python doesn't have a validating html parser in its stdlib, so	72	"""Python doesn't have a validating html parser in its stdlib, so
54	we're using an internal queue to track all the opening/closing tags,	73	we're using an internal queue to track all the opening/closing tags,
55	and hoping for the best.	74	and hoping for the best.
56	"""	75	"""
57	tag_blacklist = {'doctitle', 'meta', 'title'} # everything is lowercase	76	def __init__(self, filename, blacklisted_tags, required_blacklisted_tags):
58	def __init__(self, filename):
59	super().__init__()	77	super().__init__()
60	self.filename = filename	78	self.filename = filename
61	self.__textrepr = ''	79	self.__textrepr = ''
62	self.__meta = {}	80	self.__meta = {}
63	self.__validation_queue = []	81	self.__validation_queue = [] # type: List[str]
64	# We're using a counter instead of a boolean to handle nested tags	82	# We're using counters instead of booleans, to handle nested tags
		83	self.__in_dangerous_but_required_tag = 0
65	self.__in_dangerous_tag = 0	84	self.__in_dangerous_tag = 0
66		85
		86	if required_blacklisted_tags & blacklisted_tags: # pragma: nocover
		87	raise ValueError("There is an overlap between %s and %s" % (
		88	required_blacklisted_tags, blacklisted_tags))
		89	self.tag_required_blacklist = required_blacklisted_tags
		90	self.tag_blacklist = blacklisted_tags
		91
67	def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):	92	def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
68	self.__validation_queue.append(tag)	93	original_tag = self.get_starttag_text()
		94	self.__validation_queue.append(original_tag)
		95
		96	if tag in self.tag_required_blacklist:
		97	self.__in_dangerous_but_required_tag += 1
69	if tag in self.tag_blacklist:	98	if tag in self.tag_blacklist:
70	self.__in_dangerous_tag += 1	99	self.__in_dangerous_tag += 1
71	return
72		100
73	if self.__in_dangerous_tag == 0:	101	if self.__in_dangerous_tag == 0:
74	self.__textrepr += self.get_starttag_text()	102	if self.__in_dangerous_but_required_tag <= 1:
		103	self.__textrepr += original_tag
75		104
76	def handle_endtag(self, tag: str):	105	def handle_endtag(self, tag: str):
77	if not self.__validation_queue:	106	if not self.__validation_queue:
@@ -79,29 +108,43 @@ class _HTMLParser(parser.HTMLParser):
79	"opening one in %s." % (tag, self.filename))	108	"opening one in %s." % (tag, self.filename))
80		109
81	previous_tag = self.__validation_queue.pop()	110	previous_tag = self.__validation_queue.pop()
82	if tag != previous_tag:	111	previous_tag = previous_tag[1:-1] # remove < and >
		112	previous_tag = previous_tag.split(' ')[0] # remove attributes
		113	if tag != previous_tag.lower():
83	raise ValueError("The closing tag %s doesn't match the previous "	114	raise ValueError("The closing tag %s doesn't match the previous "
84	"tag %s in %s" %	115	"tag %s in %s" %
85	(tag, previous_tag, self.filename))	116	(tag, previous_tag, self.filename))
86	elif tag in self.tag_blacklist:
87	self.__in_dangerous_tag -= 1
88	return
89		117
90	if self.__in_dangerous_tag == 0:	118	if self.__in_dangerous_tag == 0:
91	# There is no `get_endtag_text()` method :/	119	if self.__in_dangerous_but_required_tag <= 1:
92	self.__textrepr += '</' + tag + '>\n'	120	# There is no `get_endtag_text()` method :/
		121	self.__textrepr += '</' + previous_tag + '>'
		122
		123	if tag in self.tag_required_blacklist:
		124	self.__in_dangerous_but_required_tag -= 1
		125	elif tag in self.tag_blacklist:
		126	self.__in_dangerous_tag -= 1
93		127
94	def handle_data(self, data: str):	128	def handle_data(self, data: str):
95	if self.__in_dangerous_tag == 0 and data.strip():	129	if self.__in_dangerous_but_required_tag == 0:
96	self.__textrepr += data	130	if self.__in_dangerous_tag == 0:
		131	if data.strip():
		132	self.__textrepr += escape(data)
97		133
98	def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):	134	def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
99	if tag in self.tag_blacklist:	135	if tag in self.tag_required_blacklist | self.tag_blacklist:
100	meta = {k:v for k, v in attrs}	136	meta = {k:v for k, v in attrs}
101	name = meta.get('name', 'harmful metadata')	137	name = meta.get('name', 'harmful metadata')
102	content = meta.get('content', 'harmful data')	138	content = meta.get('content', 'harmful data')
103	self.__meta[name] = content	139	self.__meta[name] = content
104	else:	140
		141	if self.__in_dangerous_tag != 0:
		142	return
		143	elif tag in self.tag_required_blacklist:
		144	self.__textrepr += '<' + tag + ' />'
		145	return
		146
		147	if self.__in_dangerous_but_required_tag == 0:
105	if self.__in_dangerous_tag == 0:	148	if self.__in_dangerous_tag == 0:
106	self.__textrepr += self.get_starttag_text()	149	self.__textrepr += self.get_starttag_text()
107		150