summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--libmat2/web.py38
-rw-r--r--tests/test_corrupted_files.py4
-rw-r--r--tests/test_libmat2.py27
3 files changed, 51 insertions, 18 deletions
diff --git a/libmat2/web.py b/libmat2/web.py
index 067f5f9..62e7747 100644
--- a/libmat2/web.py
+++ b/libmat2/web.py
@@ -38,8 +38,8 @@ class CSSParser(abstract.AbstractParser):
38 38
39class AbstractHTMLParser(abstract.AbstractParser): 39class AbstractHTMLParser(abstract.AbstractParser):
40 tags_blacklist = set() # type: Set[str] 40 tags_blacklist = set() # type: Set[str]
41 # In some html/xml based formats some tags are mandatory, 41 # In some html/xml-based formats some tags are mandatory,
42 # so we're keeping them, but are discaring their contents 42 # so we're keeping them, but are discaring their content
43 tags_required_blacklist = set() # type: Set[str] 43 tags_required_blacklist = set() # type: Set[str]
44 44
45 def __init__(self, filename): 45 def __init__(self, filename):
@@ -72,6 +72,12 @@ class _HTMLParser(parser.HTMLParser):
72 """Python doesn't have a validating html parser in its stdlib, so 72 """Python doesn't have a validating html parser in its stdlib, so
73 we're using an internal queue to track all the opening/closing tags, 73 we're using an internal queue to track all the opening/closing tags,
74 and hoping for the best. 74 and hoping for the best.
75
76 Moreover, the parser.HTMLParser call doesn't provide a get_endtag_text
77 method, so we have to use get_starttag_text instead, put its result in a
78 LIFO, and transform it in a closing tag when needed.
79
80 Also, gotcha: the `tag` parameters are always in lowercase.
75 """ 81 """
76 def __init__(self, filename, blacklisted_tags, required_blacklisted_tags): 82 def __init__(self, filename, blacklisted_tags, required_blacklisted_tags):
77 super().__init__() 83 super().__init__()
@@ -79,6 +85,7 @@ class _HTMLParser(parser.HTMLParser):
79 self.__textrepr = '' 85 self.__textrepr = ''
80 self.__meta = {} 86 self.__meta = {}
81 self.__validation_queue = [] # type: List[str] 87 self.__validation_queue = [] # type: List[str]
88
82 # We're using counters instead of booleans, to handle nested tags 89 # We're using counters instead of booleans, to handle nested tags
83 self.__in_dangerous_but_required_tag = 0 90 self.__in_dangerous_but_required_tag = 0
84 self.__in_dangerous_tag = 0 91 self.__in_dangerous_tag = 0
@@ -93,15 +100,16 @@ class _HTMLParser(parser.HTMLParser):
93 original_tag = self.get_starttag_text() 100 original_tag = self.get_starttag_text()
94 self.__validation_queue.append(original_tag) 101 self.__validation_queue.append(original_tag)
95 102
96 if tag in self.tag_required_blacklist:
97 self.__in_dangerous_but_required_tag += 1
98 if tag in self.tag_blacklist: 103 if tag in self.tag_blacklist:
99 self.__in_dangerous_tag += 1 104 self.__in_dangerous_tag += 1
100 105
101 if self.__in_dangerous_tag == 0: 106 if self.__in_dangerous_tag == 0:
102 if self.__in_dangerous_but_required_tag <= 1: 107 if self.__in_dangerous_but_required_tag == 0:
103 self.__textrepr += original_tag 108 self.__textrepr += original_tag
104 109
110 if tag in self.tag_required_blacklist:
111 self.__in_dangerous_but_required_tag += 1
112
105 def handle_endtag(self, tag: str): 113 def handle_endtag(self, tag: str):
106 if not self.__validation_queue: 114 if not self.__validation_queue:
107 raise ValueError("The closing tag %s doesn't have a corresponding " 115 raise ValueError("The closing tag %s doesn't have a corresponding "
@@ -115,14 +123,15 @@ class _HTMLParser(parser.HTMLParser):
115 "tag %s in %s" % 123 "tag %s in %s" %
116 (tag, previous_tag, self.filename)) 124 (tag, previous_tag, self.filename))
117 125
126 if tag in self.tag_required_blacklist:
127 self.__in_dangerous_but_required_tag -= 1
128
118 if self.__in_dangerous_tag == 0: 129 if self.__in_dangerous_tag == 0:
119 if self.__in_dangerous_but_required_tag <= 1: 130 if self.__in_dangerous_but_required_tag == 0:
120 # There is no `get_endtag_text()` method :/ 131 # There is no `get_endtag_text()` method :/
121 self.__textrepr += '</' + previous_tag + '>' 132 self.__textrepr += '</' + previous_tag + '>'
122 133
123 if tag in self.tag_required_blacklist: 134 if tag in self.tag_blacklist:
124 self.__in_dangerous_but_required_tag -= 1
125 elif tag in self.tag_blacklist:
126 self.__in_dangerous_tag -= 1 135 self.__in_dangerous_tag -= 1
127 136
128 def handle_data(self, data: str): 137 def handle_data(self, data: str):
@@ -138,14 +147,13 @@ class _HTMLParser(parser.HTMLParser):
138 content = meta.get('content', 'harmful data') 147 content = meta.get('content', 'harmful data')
139 self.__meta[name] = content 148 self.__meta[name] = content
140 149
141 if self.__in_dangerous_tag != 0: 150 if self.__in_dangerous_tag == 0:
151 if tag in self.tag_required_blacklist:
152 self.__textrepr += '<' + tag + ' />'
142 return 153 return
143 elif tag in self.tag_required_blacklist:
144 self.__textrepr += '<' + tag + ' />'
145 return
146 154
147 if self.__in_dangerous_but_required_tag == 0: 155 if self.__in_dangerous_tag == 0:
148 if self.__in_dangerous_tag == 0: 156 if self.__in_dangerous_but_required_tag == 0:
149 self.__textrepr += self.get_starttag_text() 157 self.__textrepr += self.get_starttag_text()
150 158
151 def remove_all(self, output_filename: str) -> bool: 159 def remove_all(self, output_filename: str) -> bool:
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py
index b2cec00..4a16d51 100644
--- a/tests/test_corrupted_files.py
+++ b/tests/test_corrupted_files.py
@@ -269,9 +269,6 @@ class TestCorruptedFiles(unittest.TestCase):
269 os.remove('./tests/data/clean.html') 269 os.remove('./tests/data/clean.html')
270 270
271 with open('./tests/data/clean.html', 'w') as f: 271 with open('./tests/data/clean.html', 'w') as f:
272 f.write('<meta><meta/></meta>')
273 f.write('<title><title>pouet</title></title>')
274 f.write('<title><mysupertag/></title>')
275 f.write('<doctitle><br/></doctitle><br/><notclosed>') 272 f.write('<doctitle><br/></doctitle><br/><notclosed>')
276 p = web.HTMLParser('./tests/data/clean.html') 273 p = web.HTMLParser('./tests/data/clean.html')
277 with self.assertRaises(ValueError): 274 with self.assertRaises(ValueError):
@@ -281,6 +278,7 @@ class TestCorruptedFiles(unittest.TestCase):
281 p.remove_all() 278 p.remove_all()
282 os.remove('./tests/data/clean.html') 279 os.remove('./tests/data/clean.html')
283 280
281
284 def test_epub(self): 282 def test_epub(self):
285 with zipfile.ZipFile('./tests/data/clean.epub', 'w') as zout: 283 with zipfile.ZipFile('./tests/data/clean.epub', 'w') as zout:
286 zout.write('./tests/data/dirty.jpg', 'OEBPS/content.opf') 284 zout.write('./tests/data/dirty.jpg', 'OEBPS/content.opf')
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index f4b1890..46e234e 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -633,6 +633,33 @@ class TestCleaning(unittest.TestCase):
633 os.remove('./tests/data/clean.cleaned.html') 633 os.remove('./tests/data/clean.cleaned.html')
634 os.remove('./tests/data/clean.cleaned.cleaned.html') 634 os.remove('./tests/data/clean.cleaned.cleaned.html')
635 635
636 with open('./tests/data/clean.html', 'w') as f:
637 f.write('<title><title><pouet/><meta/></title></title><test/>')
638 p = web.HTMLParser('./tests/data/clean.html')
639 self.assertTrue(p.remove_all())
640 with open('./tests/data/clean.cleaned.html', 'r') as f:
641 self.assertEqual(f.read(), '<title></title><test/>')
642 os.remove('./tests/data/clean.html')
643 os.remove('./tests/data/clean.cleaned.html')
644
645 with open('./tests/data/clean.html', 'w') as f:
646 f.write('<test><title>Some<b>metadata</b><br/></title></test>')
647 p = web.HTMLParser('./tests/data/clean.html')
648 self.assertTrue(p.remove_all())
649 with open('./tests/data/clean.cleaned.html', 'r') as f:
650 self.assertEqual(f.read(), '<test><title></title></test>')
651 os.remove('./tests/data/clean.html')
652 os.remove('./tests/data/clean.cleaned.html')
653
654 with open('./tests/data/clean.html', 'w') as f:
655 f.write('<meta><meta/></meta>')
656 p = web.HTMLParser('./tests/data/clean.html')
657 self.assertTrue(p.remove_all())
658 with open('./tests/data/clean.cleaned.html', 'r') as f:
659 self.assertEqual(f.read(), '')
660 os.remove('./tests/data/clean.html')
661 os.remove('./tests/data/clean.cleaned.html')
662
636 663
637 def test_epub(self): 664 def test_epub(self):
638 shutil.copy('./tests/data/dirty.epub', './tests/data/clean.epub') 665 shutil.copy('./tests/data/dirty.epub', './tests/data/clean.epub')