diff options
| author | jvoisin | 2019-02-27 23:53:07 +0100 |
|---|---|---|
| committer | jvoisin | 2019-02-27 23:53:07 +0100 |
| commit | 55214206b5f0f1c12fec378967f89ef4cb5674f9 (patch) | |
| tree | 82c94503dfd1d844b4c53d9083c69483d9d4f68d | |
| parent | 73d2966e8c10eb6c083a2abacc53f3297d16376e (diff) | |
Improve the previous commit
- More tests
- More documentation
- Minor code cleanup
| -rw-r--r-- | libmat2/web.py | 38 | ||||
| -rw-r--r-- | tests/test_corrupted_files.py | 4 | ||||
| -rw-r--r-- | tests/test_libmat2.py | 27 |
3 files changed, 51 insertions, 18 deletions
diff --git a/libmat2/web.py b/libmat2/web.py index 067f5f9..62e7747 100644 --- a/libmat2/web.py +++ b/libmat2/web.py | |||
| @@ -38,8 +38,8 @@ class CSSParser(abstract.AbstractParser): | |||
| 38 | 38 | ||
| 39 | class AbstractHTMLParser(abstract.AbstractParser): | 39 | class AbstractHTMLParser(abstract.AbstractParser): |
| 40 | tags_blacklist = set() # type: Set[str] | 40 | tags_blacklist = set() # type: Set[str] |
| 41 | # In some html/xml based formats some tags are mandatory, | 41 | # In some html/xml-based formats some tags are mandatory, |
| 42 | # so we're keeping them, but are discarding their contents | 42 | # so we're keeping them, but are discarding their content |
| 43 | tags_required_blacklist = set() # type: Set[str] | 43 | tags_required_blacklist = set() # type: Set[str] |
| 44 | 44 | ||
| 45 | def __init__(self, filename): | 45 | def __init__(self, filename): |
| @@ -72,6 +72,12 @@ class _HTMLParser(parser.HTMLParser): | |||
| 72 | """Python doesn't have a validating html parser in its stdlib, so | 72 | """Python doesn't have a validating html parser in its stdlib, so |
| 73 | we're using an internal queue to track all the opening/closing tags, | 73 | we're using an internal queue to track all the opening/closing tags, |
| 74 | and hoping for the best. | 74 | and hoping for the best. |
| 75 | |||
| 76 | Moreover, the parser.HTMLParser class doesn't provide a get_endtag_text | ||
| 77 | method, so we have to use get_starttag_text instead, put its result in a | ||
| 78 | LIFO, and transform it into a closing tag when needed. | ||
| 79 | |||
| 80 | Also, gotcha: the `tag` parameters are always in lowercase. | ||
| 75 | """ | 81 | """ |
| 76 | def __init__(self, filename, blacklisted_tags, required_blacklisted_tags): | 82 | def __init__(self, filename, blacklisted_tags, required_blacklisted_tags): |
| 77 | super().__init__() | 83 | super().__init__() |
| @@ -79,6 +85,7 @@ class _HTMLParser(parser.HTMLParser): | |||
| 79 | self.__textrepr = '' | 85 | self.__textrepr = '' |
| 80 | self.__meta = {} | 86 | self.__meta = {} |
| 81 | self.__validation_queue = [] # type: List[str] | 87 | self.__validation_queue = [] # type: List[str] |
| 88 | |||
| 82 | # We're using counters instead of booleans, to handle nested tags | 89 | # We're using counters instead of booleans, to handle nested tags |
| 83 | self.__in_dangerous_but_required_tag = 0 | 90 | self.__in_dangerous_but_required_tag = 0 |
| 84 | self.__in_dangerous_tag = 0 | 91 | self.__in_dangerous_tag = 0 |
| @@ -93,15 +100,16 @@ class _HTMLParser(parser.HTMLParser): | |||
| 93 | original_tag = self.get_starttag_text() | 100 | original_tag = self.get_starttag_text() |
| 94 | self.__validation_queue.append(original_tag) | 101 | self.__validation_queue.append(original_tag) |
| 95 | 102 | ||
| 96 | if tag in self.tag_required_blacklist: | ||
| 97 | self.__in_dangerous_but_required_tag += 1 | ||
| 98 | if tag in self.tag_blacklist: | 103 | if tag in self.tag_blacklist: |
| 99 | self.__in_dangerous_tag += 1 | 104 | self.__in_dangerous_tag += 1 |
| 100 | 105 | ||
| 101 | if self.__in_dangerous_tag == 0: | 106 | if self.__in_dangerous_tag == 0: |
| 102 | if self.__in_dangerous_but_required_tag <= 1: | 107 | if self.__in_dangerous_but_required_tag == 0: |
| 103 | self.__textrepr += original_tag | 108 | self.__textrepr += original_tag |
| 104 | 109 | ||
| 110 | if tag in self.tag_required_blacklist: | ||
| 111 | self.__in_dangerous_but_required_tag += 1 | ||
| 112 | |||
| 105 | def handle_endtag(self, tag: str): | 113 | def handle_endtag(self, tag: str): |
| 106 | if not self.__validation_queue: | 114 | if not self.__validation_queue: |
| 107 | raise ValueError("The closing tag %s doesn't have a corresponding " | 115 | raise ValueError("The closing tag %s doesn't have a corresponding " |
| @@ -115,14 +123,15 @@ class _HTMLParser(parser.HTMLParser): | |||
| 115 | "tag %s in %s" % | 123 | "tag %s in %s" % |
| 116 | (tag, previous_tag, self.filename)) | 124 | (tag, previous_tag, self.filename)) |
| 117 | 125 | ||
| 126 | if tag in self.tag_required_blacklist: | ||
| 127 | self.__in_dangerous_but_required_tag -= 1 | ||
| 128 | |||
| 118 | if self.__in_dangerous_tag == 0: | 129 | if self.__in_dangerous_tag == 0: |
| 119 | if self.__in_dangerous_but_required_tag <= 1: | 130 | if self.__in_dangerous_but_required_tag == 0: |
| 120 | # There is no `get_endtag_text()` method :/ | 131 | # There is no `get_endtag_text()` method :/ |
| 121 | self.__textrepr += '</' + previous_tag + '>' | 132 | self.__textrepr += '</' + previous_tag + '>' |
| 122 | 133 | ||
| 123 | if tag in self.tag_required_blacklist: | 134 | if tag in self.tag_blacklist: |
| 124 | self.__in_dangerous_but_required_tag -= 1 | ||
| 125 | elif tag in self.tag_blacklist: | ||
| 126 | self.__in_dangerous_tag -= 1 | 135 | self.__in_dangerous_tag -= 1 |
| 127 | 136 | ||
| 128 | def handle_data(self, data: str): | 137 | def handle_data(self, data: str): |
| @@ -138,14 +147,13 @@ class _HTMLParser(parser.HTMLParser): | |||
| 138 | content = meta.get('content', 'harmful data') | 147 | content = meta.get('content', 'harmful data') |
| 139 | self.__meta[name] = content | 148 | self.__meta[name] = content |
| 140 | 149 | ||
| 141 | if self.__in_dangerous_tag != 0: | 150 | if self.__in_dangerous_tag == 0: |
| 151 | if tag in self.tag_required_blacklist: | ||
| 152 | self.__textrepr += '<' + tag + ' />' | ||
| 142 | return | 153 | return |
| 143 | elif tag in self.tag_required_blacklist: | ||
| 144 | self.__textrepr += '<' + tag + ' />' | ||
| 145 | return | ||
| 146 | 154 | ||
| 147 | if self.__in_dangerous_but_required_tag == 0: | 155 | if self.__in_dangerous_tag == 0: |
| 148 | if self.__in_dangerous_tag == 0: | 156 | if self.__in_dangerous_but_required_tag == 0: |
| 149 | self.__textrepr += self.get_starttag_text() | 157 | self.__textrepr += self.get_starttag_text() |
| 150 | 158 | ||
| 151 | def remove_all(self, output_filename: str) -> bool: | 159 | def remove_all(self, output_filename: str) -> bool: |
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index b2cec00..4a16d51 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py | |||
| @@ -269,9 +269,6 @@ class TestCorruptedFiles(unittest.TestCase): | |||
| 269 | os.remove('./tests/data/clean.html') | 269 | os.remove('./tests/data/clean.html') |
| 270 | 270 | ||
| 271 | with open('./tests/data/clean.html', 'w') as f: | 271 | with open('./tests/data/clean.html', 'w') as f: |
| 272 | f.write('<meta><meta/></meta>') | ||
| 273 | f.write('<title><title>pouet</title></title>') | ||
| 274 | f.write('<title><mysupertag/></title>') | ||
| 275 | f.write('<doctitle><br/></doctitle><br/><notclosed>') | 272 | f.write('<doctitle><br/></doctitle><br/><notclosed>') |
| 276 | p = web.HTMLParser('./tests/data/clean.html') | 273 | p = web.HTMLParser('./tests/data/clean.html') |
| 277 | with self.assertRaises(ValueError): | 274 | with self.assertRaises(ValueError): |
| @@ -281,6 +278,7 @@ class TestCorruptedFiles(unittest.TestCase): | |||
| 281 | p.remove_all() | 278 | p.remove_all() |
| 282 | os.remove('./tests/data/clean.html') | 279 | os.remove('./tests/data/clean.html') |
| 283 | 280 | ||
| 281 | |||
| 284 | def test_epub(self): | 282 | def test_epub(self): |
| 285 | with zipfile.ZipFile('./tests/data/clean.epub', 'w') as zout: | 283 | with zipfile.ZipFile('./tests/data/clean.epub', 'w') as zout: |
| 286 | zout.write('./tests/data/dirty.jpg', 'OEBPS/content.opf') | 284 | zout.write('./tests/data/dirty.jpg', 'OEBPS/content.opf') |
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index f4b1890..46e234e 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py | |||
| @@ -633,6 +633,33 @@ class TestCleaning(unittest.TestCase): | |||
| 633 | os.remove('./tests/data/clean.cleaned.html') | 633 | os.remove('./tests/data/clean.cleaned.html') |
| 634 | os.remove('./tests/data/clean.cleaned.cleaned.html') | 634 | os.remove('./tests/data/clean.cleaned.cleaned.html') |
| 635 | 635 | ||
| 636 | with open('./tests/data/clean.html', 'w') as f: | ||
| 637 | f.write('<title><title><pouet/><meta/></title></title><test/>') | ||
| 638 | p = web.HTMLParser('./tests/data/clean.html') | ||
| 639 | self.assertTrue(p.remove_all()) | ||
| 640 | with open('./tests/data/clean.cleaned.html', 'r') as f: | ||
| 641 | self.assertEqual(f.read(), '<title></title><test/>') | ||
| 642 | os.remove('./tests/data/clean.html') | ||
| 643 | os.remove('./tests/data/clean.cleaned.html') | ||
| 644 | |||
| 645 | with open('./tests/data/clean.html', 'w') as f: | ||
| 646 | f.write('<test><title>Some<b>metadata</b><br/></title></test>') | ||
| 647 | p = web.HTMLParser('./tests/data/clean.html') | ||
| 648 | self.assertTrue(p.remove_all()) | ||
| 649 | with open('./tests/data/clean.cleaned.html', 'r') as f: | ||
| 650 | self.assertEqual(f.read(), '<test><title></title></test>') | ||
| 651 | os.remove('./tests/data/clean.html') | ||
| 652 | os.remove('./tests/data/clean.cleaned.html') | ||
| 653 | |||
| 654 | with open('./tests/data/clean.html', 'w') as f: | ||
| 655 | f.write('<meta><meta/></meta>') | ||
| 656 | p = web.HTMLParser('./tests/data/clean.html') | ||
| 657 | self.assertTrue(p.remove_all()) | ||
| 658 | with open('./tests/data/clean.cleaned.html', 'r') as f: | ||
| 659 | self.assertEqual(f.read(), '') | ||
| 660 | os.remove('./tests/data/clean.html') | ||
| 661 | os.remove('./tests/data/clean.cleaned.html') | ||
| 662 | |||
| 636 | 663 | ||
| 637 | def test_epub(self): | 664 | def test_epub(self): |
| 638 | shutil.copy('./tests/data/dirty.epub', './tests/data/clean.epub') | 665 | shutil.copy('./tests/data/dirty.epub', './tests/data/clean.epub') |
