summary | refs | log | tree | commit | diff
path: root/libmat2/web.py
diff options
context:
space:
mode:
author: Brolf, 2019-02-20 00:45:27 +0100
committer: georg, 2019-03-05 23:13:42 +0000
commit: 5ac91cd4f94a822c81bd0bc55a2f7034b31eee7a (patch)
tree: b8e4e9ebb15757d5a6779eb5c224a6f98ba01ead /libmat2/web.py
parent: c3f097a82ba6fd8549aabc9c5427ab4e337fec0e (diff)
Refactor {black,white}list into {block,allow}list
Closes #96
Diffstat (limited to 'libmat2/web.py')
-rw-r--r-- libmat2/web.py 36
1 files changed, 18 insertions, 18 deletions
diff --git a/libmat2/web.py b/libmat2/web.py
index 34426b8..0a61908 100644
--- a/libmat2/web.py
+++ b/libmat2/web.py
@@ -37,15 +37,15 @@ class CSSParser(abstract.AbstractParser):
37 37
38 38
39class AbstractHTMLParser(abstract.AbstractParser): 39class AbstractHTMLParser(abstract.AbstractParser):
40 tags_blacklist = set() # type: Set[str] 40 tags_blocklist = set() # type: Set[str]
41 # In some html/xml-based formats some tags are mandatory, 41 # In some html/xml-based formats some tags are mandatory,
42 # so we're keeping them, but are discarding their content 42 # so we're keeping them, but are discarding their content
43 tags_required_blacklist = set() # type: Set[str] 43 tags_required_blocklist = set() # type: Set[str]
44 44
45 def __init__(self, filename): 45 def __init__(self, filename):
46 super().__init__(filename) 46 super().__init__(filename)
47 self.__parser = _HTMLParser(self.filename, self.tags_blacklist, 47 self.__parser = _HTMLParser(self.filename, self.tags_blocklist,
48 self.tags_required_blacklist) 48 self.tags_required_blocklist)
49 with open(filename, encoding='utf-8') as f: 49 with open(filename, encoding='utf-8') as f:
50 self.__parser.feed(f.read()) 50 self.__parser.feed(f.read())
51 self.__parser.close() 51 self.__parser.close()
@@ -59,13 +59,13 @@ class AbstractHTMLParser(abstract.AbstractParser):
59 59
60class HTMLParser(AbstractHTMLParser): 60class HTMLParser(AbstractHTMLParser):
61 mimetypes = {'text/html', } 61 mimetypes = {'text/html', }
62 tags_blacklist = {'meta', } 62 tags_blocklist = {'meta', }
63 tags_required_blacklist = {'title', } 63 tags_required_blocklist = {'title', }
64 64
65 65
66class DTBNCXParser(AbstractHTMLParser): 66class DTBNCXParser(AbstractHTMLParser):
67 mimetypes = {'application/x-dtbncx+xml', } 67 mimetypes = {'application/x-dtbncx+xml', }
68 tags_required_blacklist = {'title', 'doctitle', 'meta'} 68 tags_required_blocklist = {'title', 'doctitle', 'meta'}
69 69
70 70
71class _HTMLParser(parser.HTMLParser): 71class _HTMLParser(parser.HTMLParser):
@@ -79,7 +79,7 @@ class _HTMLParser(parser.HTMLParser):
79 79
80 Also, gotcha: the `tag` parameters are always in lowercase. 80 Also, gotcha: the `tag` parameters are always in lowercase.
81 """ 81 """
82 def __init__(self, filename, blacklisted_tags, required_blacklisted_tags): 82 def __init__(self, filename, blocklisted_tags, required_blocklisted_tags):
83 super().__init__() 83 super().__init__()
84 self.filename = filename 84 self.filename = filename
85 self.__textrepr = '' 85 self.__textrepr = ''
@@ -90,24 +90,24 @@ class _HTMLParser(parser.HTMLParser):
90 self.__in_dangerous_but_required_tag = 0 90 self.__in_dangerous_but_required_tag = 0
91 self.__in_dangerous_tag = 0 91 self.__in_dangerous_tag = 0
92 92
93 if required_blacklisted_tags & blacklisted_tags: # pragma: nocover 93 if required_blocklisted_tags & blocklisted_tags: # pragma: nocover
94 raise ValueError("There is an overlap between %s and %s" % ( 94 raise ValueError("There is an overlap between %s and %s" % (
95 required_blacklisted_tags, blacklisted_tags)) 95 required_blocklisted_tags, blocklisted_tags))
96 self.tag_required_blacklist = required_blacklisted_tags 96 self.tag_required_blocklist = required_blocklisted_tags
97 self.tag_blacklist = blacklisted_tags 97 self.tag_blocklist = blocklisted_tags
98 98
99 def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]): 99 def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
100 original_tag = self.get_starttag_text() 100 original_tag = self.get_starttag_text()
101 self.__validation_queue.append(original_tag) 101 self.__validation_queue.append(original_tag)
102 102
103 if tag in self.tag_blacklist: 103 if tag in self.tag_blocklist:
104 self.__in_dangerous_tag += 1 104 self.__in_dangerous_tag += 1
105 105
106 if self.__in_dangerous_tag == 0: 106 if self.__in_dangerous_tag == 0:
107 if self.__in_dangerous_but_required_tag == 0: 107 if self.__in_dangerous_but_required_tag == 0:
108 self.__textrepr += original_tag 108 self.__textrepr += original_tag
109 109
110 if tag in self.tag_required_blacklist: 110 if tag in self.tag_required_blocklist:
111 self.__in_dangerous_but_required_tag += 1 111 self.__in_dangerous_but_required_tag += 1
112 112
113 def handle_endtag(self, tag: str): 113 def handle_endtag(self, tag: str):
@@ -123,7 +123,7 @@ class _HTMLParser(parser.HTMLParser):
123 "tag %s in %s" % 123 "tag %s in %s" %
124 (tag, previous_tag, self.filename)) 124 (tag, previous_tag, self.filename))
125 125
126 if tag in self.tag_required_blacklist: 126 if tag in self.tag_required_blocklist:
127 self.__in_dangerous_but_required_tag -= 1 127 self.__in_dangerous_but_required_tag -= 1
128 128
129 if self.__in_dangerous_tag == 0: 129 if self.__in_dangerous_tag == 0:
@@ -131,7 +131,7 @@ class _HTMLParser(parser.HTMLParser):
131 # There is no `get_endtag_text()` method :/ 131 # There is no `get_endtag_text()` method :/
132 self.__textrepr += '</' + previous_tag + '>' 132 self.__textrepr += '</' + previous_tag + '>'
133 133
134 if tag in self.tag_blacklist: 134 if tag in self.tag_blocklist:
135 self.__in_dangerous_tag -= 1 135 self.__in_dangerous_tag -= 1
136 136
137 def handle_data(self, data: str): 137 def handle_data(self, data: str):
@@ -141,14 +141,14 @@ class _HTMLParser(parser.HTMLParser):
141 self.__textrepr += escape(data) 141 self.__textrepr += escape(data)
142 142
143 def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]): 143 def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
144 if tag in self.tag_required_blacklist | self.tag_blacklist: 144 if tag in self.tag_required_blocklist | self.tag_blocklist:
145 meta = {k:v for k, v in attrs} 145 meta = {k:v for k, v in attrs}
146 name = meta.get('name', 'harmful metadata') 146 name = meta.get('name', 'harmful metadata')
147 content = meta.get('content', 'harmful data') 147 content = meta.get('content', 'harmful data')
148 self.__meta[name] = content 148 self.__meta[name] = content
149 149
150 if self.__in_dangerous_tag == 0: 150 if self.__in_dangerous_tag == 0:
151 if tag in self.tag_required_blacklist: 151 if tag in self.tag_required_blocklist:
152 self.__textrepr += '<' + tag + ' />' 152 self.__textrepr += '<' + tag + ' />'
153 return 153 return
154 154