summary refs log tree commit diff
path: root/libmat2
diff options
context:
space:
mode:
Diffstat (limited to 'libmat2')
-rw-r--r-- libmat2/web.py 38
1 files changed, 23 insertions, 15 deletions
diff --git a/libmat2/web.py b/libmat2/web.py
index 067f5f9..62e7747 100644
--- a/libmat2/web.py
+++ b/libmat2/web.py
@@ -38,8 +38,8 @@ class CSSParser(abstract.AbstractParser):
38 38
39class AbstractHTMLParser(abstract.AbstractParser): 39class AbstractHTMLParser(abstract.AbstractParser):
40 tags_blacklist = set() # type: Set[str] 40 tags_blacklist = set() # type: Set[str]
41 # In some html/xml based formats some tags are mandatory, 41 # In some html/xml-based formats some tags are mandatory,
42 # so we're keeping them, but are discarding their contents 42 # so we're keeping them, but are discarding their content
43 tags_required_blacklist = set() # type: Set[str] 43 tags_required_blacklist = set() # type: Set[str]
44 44
45 def __init__(self, filename): 45 def __init__(self, filename):
@@ -72,6 +72,12 @@ class _HTMLParser(parser.HTMLParser):
72 """Python doesn't have a validating html parser in its stdlib, so 72 """Python doesn't have a validating html parser in its stdlib, so
73 we're using an internal queue to track all the opening/closing tags, 73 we're using an internal queue to track all the opening/closing tags,
74 and hoping for the best. 74 and hoping for the best.
75
76 Moreover, the parser.HTMLParser class doesn't provide a get_endtag_text
77 method, so we have to use get_starttag_text instead, put its result in a
78 LIFO, and transform it into a closing tag when needed.
79
80 Also, gotcha: the `tag` parameters are always in lowercase.
75 """ 81 """
76 def __init__(self, filename, blacklisted_tags, required_blacklisted_tags): 82 def __init__(self, filename, blacklisted_tags, required_blacklisted_tags):
77 super().__init__() 83 super().__init__()
@@ -79,6 +85,7 @@ class _HTMLParser(parser.HTMLParser):
79 self.__textrepr = '' 85 self.__textrepr = ''
80 self.__meta = {} 86 self.__meta = {}
81 self.__validation_queue = [] # type: List[str] 87 self.__validation_queue = [] # type: List[str]
88
82 # We're using counters instead of booleans, to handle nested tags 89 # We're using counters instead of booleans, to handle nested tags
83 self.__in_dangerous_but_required_tag = 0 90 self.__in_dangerous_but_required_tag = 0
84 self.__in_dangerous_tag = 0 91 self.__in_dangerous_tag = 0
@@ -93,15 +100,16 @@ class _HTMLParser(parser.HTMLParser):
93 original_tag = self.get_starttag_text() 100 original_tag = self.get_starttag_text()
94 self.__validation_queue.append(original_tag) 101 self.__validation_queue.append(original_tag)
95 102
96 if tag in self.tag_required_blacklist:
97 self.__in_dangerous_but_required_tag += 1
98 if tag in self.tag_blacklist: 103 if tag in self.tag_blacklist:
99 self.__in_dangerous_tag += 1 104 self.__in_dangerous_tag += 1
100 105
101 if self.__in_dangerous_tag == 0: 106 if self.__in_dangerous_tag == 0:
102 if self.__in_dangerous_but_required_tag <= 1: 107 if self.__in_dangerous_but_required_tag == 0:
103 self.__textrepr += original_tag 108 self.__textrepr += original_tag
104 109
110 if tag in self.tag_required_blacklist:
111 self.__in_dangerous_but_required_tag += 1
112
105 def handle_endtag(self, tag: str): 113 def handle_endtag(self, tag: str):
106 if not self.__validation_queue: 114 if not self.__validation_queue:
107 raise ValueError("The closing tag %s doesn't have a corresponding " 115 raise ValueError("The closing tag %s doesn't have a corresponding "
@@ -115,14 +123,15 @@ class _HTMLParser(parser.HTMLParser):
115 "tag %s in %s" % 123 "tag %s in %s" %
116 (tag, previous_tag, self.filename)) 124 (tag, previous_tag, self.filename))
117 125
126 if tag in self.tag_required_blacklist:
127 self.__in_dangerous_but_required_tag -= 1
128
118 if self.__in_dangerous_tag == 0: 129 if self.__in_dangerous_tag == 0:
119 if self.__in_dangerous_but_required_tag <= 1: 130 if self.__in_dangerous_but_required_tag == 0:
120 # There is no `get_endtag_text()` method :/ 131 # There is no `get_endtag_text()` method :/
121 self.__textrepr += '</' + previous_tag + '>' 132 self.__textrepr += '</' + previous_tag + '>'
122 133
123 if tag in self.tag_required_blacklist: 134 if tag in self.tag_blacklist:
124 self.__in_dangerous_but_required_tag -= 1
125 elif tag in self.tag_blacklist:
126 self.__in_dangerous_tag -= 1 135 self.__in_dangerous_tag -= 1
127 136
128 def handle_data(self, data: str): 137 def handle_data(self, data: str):
@@ -138,14 +147,13 @@ class _HTMLParser(parser.HTMLParser):
138 content = meta.get('content', 'harmful data') 147 content = meta.get('content', 'harmful data')
139 self.__meta[name] = content 148 self.__meta[name] = content
140 149
141 if self.__in_dangerous_tag != 0: 150 if self.__in_dangerous_tag == 0:
151 if tag in self.tag_required_blacklist:
152 self.__textrepr += '<' + tag + ' />'
142 return 153 return
143 elif tag in self.tag_required_blacklist:
144 self.__textrepr += '<' + tag + ' />'
145 return
146 154
147 if self.__in_dangerous_but_required_tag == 0: 155 if self.__in_dangerous_tag == 0:
148 if self.__in_dangerous_tag == 0: 156 if self.__in_dangerous_but_required_tag == 0:
149 self.__textrepr += self.get_starttag_text() 157 self.__textrepr += self.get_starttag_text()
150 158
151 def remove_all(self, output_filename: str) -> bool: 159 def remove_all(self, output_filename: str) -> bool: