diff options
| author | Brolf | 2019-02-20 00:45:27 +0100 |
|---|---|---|
| committer | georg | 2019-03-05 23:13:42 +0000 |
| commit | 5ac91cd4f94a822c81bd0bc55a2f7034b31eee7a (patch) | |
| tree | b8e4e9ebb15757d5a6779eb5c224a6f98ba01ead /libmat2/web.py | |
| parent | c3f097a82ba6fd8549aabc9c5427ab4e337fec0e (diff) | |
Refactor {black,white}list into {block,allow}list
Closes #96
Diffstat (limited to 'libmat2/web.py')
| -rw-r--r-- | libmat2/web.py | 36 |
1 file changed, 18 insertions, 18 deletions
diff --git a/libmat2/web.py b/libmat2/web.py
index 34426b8..0a61908 100644
--- a/libmat2/web.py
+++ b/libmat2/web.py
| @@ -37,15 +37,15 @@ class CSSParser(abstract.AbstractParser): | |||
| 37 | 37 | ||
| 38 | 38 | ||
| 39 | class AbstractHTMLParser(abstract.AbstractParser): | 39 | class AbstractHTMLParser(abstract.AbstractParser): |
| 40 | tags_blacklist = set() # type: Set[str] | 40 | tags_blocklist = set() # type: Set[str] |
| 41 | # In some html/xml-based formats some tags are mandatory, | 41 | # In some html/xml-based formats some tags are mandatory, |
| 42 | # so we're keeping them, but are discarding their content | 42 | # so we're keeping them, but are discarding their content |
| 43 | tags_required_blacklist = set() # type: Set[str] | 43 | tags_required_blocklist = set() # type: Set[str] |
| 44 | 44 | ||
| 45 | def __init__(self, filename): | 45 | def __init__(self, filename): |
| 46 | super().__init__(filename) | 46 | super().__init__(filename) |
| 47 | self.__parser = _HTMLParser(self.filename, self.tags_blacklist, | 47 | self.__parser = _HTMLParser(self.filename, self.tags_blocklist, |
| 48 | self.tags_required_blacklist) | 48 | self.tags_required_blocklist) |
| 49 | with open(filename, encoding='utf-8') as f: | 49 | with open(filename, encoding='utf-8') as f: |
| 50 | self.__parser.feed(f.read()) | 50 | self.__parser.feed(f.read()) |
| 51 | self.__parser.close() | 51 | self.__parser.close() |
| @@ -59,13 +59,13 @@ class AbstractHTMLParser(abstract.AbstractParser): | |||
| 59 | 59 | ||
| 60 | class HTMLParser(AbstractHTMLParser): | 60 | class HTMLParser(AbstractHTMLParser): |
| 61 | mimetypes = {'text/html', } | 61 | mimetypes = {'text/html', } |
| 62 | tags_blacklist = {'meta', } | 62 | tags_blocklist = {'meta', } |
| 63 | tags_required_blacklist = {'title', } | 63 | tags_required_blocklist = {'title', } |
| 64 | 64 | ||
| 65 | 65 | ||
| 66 | class DTBNCXParser(AbstractHTMLParser): | 66 | class DTBNCXParser(AbstractHTMLParser): |
| 67 | mimetypes = {'application/x-dtbncx+xml', } | 67 | mimetypes = {'application/x-dtbncx+xml', } |
| 68 | tags_required_blacklist = {'title', 'doctitle', 'meta'} | 68 | tags_required_blocklist = {'title', 'doctitle', 'meta'} |
| 69 | 69 | ||
| 70 | 70 | ||
| 71 | class _HTMLParser(parser.HTMLParser): | 71 | class _HTMLParser(parser.HTMLParser): |
| @@ -79,7 +79,7 @@ class _HTMLParser(parser.HTMLParser): | |||
| 79 | 79 | ||
| 80 | Also, gotcha: the `tag` parameters are always in lowercase. | 80 | Also, gotcha: the `tag` parameters are always in lowercase. |
| 81 | """ | 81 | """ |
| 82 | def __init__(self, filename, blacklisted_tags, required_blacklisted_tags): | 82 | def __init__(self, filename, blocklisted_tags, required_blocklisted_tags): |
| 83 | super().__init__() | 83 | super().__init__() |
| 84 | self.filename = filename | 84 | self.filename = filename |
| 85 | self.__textrepr = '' | 85 | self.__textrepr = '' |
| @@ -90,24 +90,24 @@ class _HTMLParser(parser.HTMLParser): | |||
| 90 | self.__in_dangerous_but_required_tag = 0 | 90 | self.__in_dangerous_but_required_tag = 0 |
| 91 | self.__in_dangerous_tag = 0 | 91 | self.__in_dangerous_tag = 0 |
| 92 | 92 | ||
| 93 | if required_blacklisted_tags & blacklisted_tags: # pragma: nocover | 93 | if required_blocklisted_tags & blocklisted_tags: # pragma: nocover |
| 94 | raise ValueError("There is an overlap between %s and %s" % ( | 94 | raise ValueError("There is an overlap between %s and %s" % ( |
| 95 | required_blacklisted_tags, blacklisted_tags)) | 95 | required_blocklisted_tags, blocklisted_tags)) |
| 96 | self.tag_required_blacklist = required_blacklisted_tags | 96 | self.tag_required_blocklist = required_blocklisted_tags |
| 97 | self.tag_blacklist = blacklisted_tags | 97 | self.tag_blocklist = blocklisted_tags |
| 98 | 98 | ||
| 99 | def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]): | 99 | def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]): |
| 100 | original_tag = self.get_starttag_text() | 100 | original_tag = self.get_starttag_text() |
| 101 | self.__validation_queue.append(original_tag) | 101 | self.__validation_queue.append(original_tag) |
| 102 | 102 | ||
| 103 | if tag in self.tag_blacklist: | 103 | if tag in self.tag_blocklist: |
| 104 | self.__in_dangerous_tag += 1 | 104 | self.__in_dangerous_tag += 1 |
| 105 | 105 | ||
| 106 | if self.__in_dangerous_tag == 0: | 106 | if self.__in_dangerous_tag == 0: |
| 107 | if self.__in_dangerous_but_required_tag == 0: | 107 | if self.__in_dangerous_but_required_tag == 0: |
| 108 | self.__textrepr += original_tag | 108 | self.__textrepr += original_tag |
| 109 | 109 | ||
| 110 | if tag in self.tag_required_blacklist: | 110 | if tag in self.tag_required_blocklist: |
| 111 | self.__in_dangerous_but_required_tag += 1 | 111 | self.__in_dangerous_but_required_tag += 1 |
| 112 | 112 | ||
| 113 | def handle_endtag(self, tag: str): | 113 | def handle_endtag(self, tag: str): |
| @@ -123,7 +123,7 @@ class _HTMLParser(parser.HTMLParser): | |||
| 123 | "tag %s in %s" % | 123 | "tag %s in %s" % |
| 124 | (tag, previous_tag, self.filename)) | 124 | (tag, previous_tag, self.filename)) |
| 125 | 125 | ||
| 126 | if tag in self.tag_required_blacklist: | 126 | if tag in self.tag_required_blocklist: |
| 127 | self.__in_dangerous_but_required_tag -= 1 | 127 | self.__in_dangerous_but_required_tag -= 1 |
| 128 | 128 | ||
| 129 | if self.__in_dangerous_tag == 0: | 129 | if self.__in_dangerous_tag == 0: |
| @@ -131,7 +131,7 @@ class _HTMLParser(parser.HTMLParser): | |||
| 131 | # There is no `get_endtag_text()` method :/ | 131 | # There is no `get_endtag_text()` method :/ |
| 132 | self.__textrepr += '</' + previous_tag + '>' | 132 | self.__textrepr += '</' + previous_tag + '>' |
| 133 | 133 | ||
| 134 | if tag in self.tag_blacklist: | 134 | if tag in self.tag_blocklist: |
| 135 | self.__in_dangerous_tag -= 1 | 135 | self.__in_dangerous_tag -= 1 |
| 136 | 136 | ||
| 137 | def handle_data(self, data: str): | 137 | def handle_data(self, data: str): |
| @@ -141,14 +141,14 @@ class _HTMLParser(parser.HTMLParser): | |||
| 141 | self.__textrepr += escape(data) | 141 | self.__textrepr += escape(data) |
| 142 | 142 | ||
| 143 | def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]): | 143 | def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]): |
| 144 | if tag in self.tag_required_blacklist | self.tag_blacklist: | 144 | if tag in self.tag_required_blocklist | self.tag_blocklist: |
| 145 | meta = {k:v for k, v in attrs} | 145 | meta = {k:v for k, v in attrs} |
| 146 | name = meta.get('name', 'harmful metadata') | 146 | name = meta.get('name', 'harmful metadata') |
| 147 | content = meta.get('content', 'harmful data') | 147 | content = meta.get('content', 'harmful data') |
| 148 | self.__meta[name] = content | 148 | self.__meta[name] = content |
| 149 | 149 | ||
| 150 | if self.__in_dangerous_tag == 0: | 150 | if self.__in_dangerous_tag == 0: |
| 151 | if tag in self.tag_required_blacklist: | 151 | if tag in self.tag_required_blocklist: |
| 152 | self.__textrepr += '<' + tag + ' />' | 152 | self.__textrepr += '<' + tag + ' />' |
| 153 | return | 153 | return |
| 154 | 154 | ||
