diff options
Diffstat (limited to 'libmat2/web.py')
| -rw-r--r-- | libmat2/web.py | 87 |
1 files changed, 65 insertions, 22 deletions
diff --git a/libmat2/web.py b/libmat2/web.py index c11b47d..067f5f9 100644 --- a/libmat2/web.py +++ b/libmat2/web.py | |||
| @@ -1,10 +1,13 @@ | |||
| 1 | from html import parser | 1 | from html import parser, escape |
| 2 | from typing import Dict, Any, List, Tuple | 2 | from typing import Dict, Any, List, Tuple, Set |
| 3 | import re | 3 | import re |
| 4 | import string | 4 | import string |
| 5 | 5 | ||
| 6 | from . import abstract | 6 | from . import abstract |
| 7 | 7 | ||
| 8 | assert Set | ||
| 9 | |||
| 10 | # pylint: disable=too-many-instance-attributes | ||
| 8 | 11 | ||
| 9 | class CSSParser(abstract.AbstractParser): | 12 | class CSSParser(abstract.AbstractParser): |
| 10 | """There is no such thing as metadata in CSS files, | 13 | """There is no such thing as metadata in CSS files, |
| @@ -33,11 +36,16 @@ class CSSParser(abstract.AbstractParser): | |||
| 33 | return metadata | 36 | return metadata |
| 34 | 37 | ||
| 35 | 38 | ||
| 36 | class HTMLParser(abstract.AbstractParser): | 39 | class AbstractHTMLParser(abstract.AbstractParser): |
| 37 | mimetypes = {'text/html', 'application/x-dtbncx+xml', } | 40 | tags_blacklist = set() # type: Set[str] |
| 41 | # In some html/xml based formats some tags are mandatory, | ||
| 42 | # so we're keeping them, but are discarding their contents | |
| 43 | tags_required_blacklist = set() # type: Set[str] | ||
| 44 | |||
| 38 | def __init__(self, filename): | 45 | def __init__(self, filename): |
| 39 | super().__init__(filename) | 46 | super().__init__(filename) |
| 40 | self.__parser = _HTMLParser(self.filename) | 47 | self.__parser = _HTMLParser(self.filename, self.tags_blacklist, |
| 48 | self.tags_required_blacklist) | ||
| 41 | with open(filename, encoding='utf-8') as f: | 49 | with open(filename, encoding='utf-8') as f: |
| 42 | self.__parser.feed(f.read()) | 50 | self.__parser.feed(f.read()) |
| 43 | self.__parser.close() | 51 | self.__parser.close() |
| @@ -49,29 +57,50 @@ class HTMLParser(abstract.AbstractParser): | |||
| 49 | return self.__parser.remove_all(self.output_filename) | 57 | return self.__parser.remove_all(self.output_filename) |
| 50 | 58 | ||
| 51 | 59 | ||
| 60 | class HTMLParser(AbstractHTMLParser): | ||
| 61 | mimetypes = {'text/html', } | ||
| 62 | tags_blacklist = {'meta', } | ||
| 63 | tags_required_blacklist = {'title', } | ||
| 64 | |||
| 65 | |||
| 66 | class DTBNCXParser(AbstractHTMLParser): | ||
| 67 | mimetypes = {'application/x-dtbncx+xml', } | ||
| 68 | tags_required_blacklist = {'title', 'doctitle', 'meta'} | ||
| 69 | |||
| 70 | |||
| 52 | class _HTMLParser(parser.HTMLParser): | 71 | class _HTMLParser(parser.HTMLParser): |
| 53 | """Python doesn't have a validating html parser in its stdlib, so | 72 | """Python doesn't have a validating html parser in its stdlib, so |
| 54 | we're using an internal queue to track all the opening/closing tags, | 73 | we're using an internal queue to track all the opening/closing tags, |
| 55 | and hoping for the best. | 74 | and hoping for the best. |
| 56 | """ | 75 | """ |
| 57 | tag_blacklist = {'doctitle', 'meta', 'title'} # everything is lowercase | 76 | def __init__(self, filename, blacklisted_tags, required_blacklisted_tags): |
| 58 | def __init__(self, filename): | ||
| 59 | super().__init__() | 77 | super().__init__() |
| 60 | self.filename = filename | 78 | self.filename = filename |
| 61 | self.__textrepr = '' | 79 | self.__textrepr = '' |
| 62 | self.__meta = {} | 80 | self.__meta = {} |
| 63 | self.__validation_queue = [] | 81 | self.__validation_queue = [] # type: List[str] |
| 64 | # We're using a counter instead of a boolean to handle nested tags | 82 | # We're using counters instead of booleans, to handle nested tags |
| 83 | self.__in_dangerous_but_required_tag = 0 | ||
| 65 | self.__in_dangerous_tag = 0 | 84 | self.__in_dangerous_tag = 0 |
| 66 | 85 | ||
| 86 | if required_blacklisted_tags & blacklisted_tags: # pragma: nocover | ||
| 87 | raise ValueError("There is an overlap between %s and %s" % ( | ||
| 88 | required_blacklisted_tags, blacklisted_tags)) | ||
| 89 | self.tag_required_blacklist = required_blacklisted_tags | ||
| 90 | self.tag_blacklist = blacklisted_tags | ||
| 91 | |||
| 67 | def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]): | 92 | def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]): |
| 68 | self.__validation_queue.append(tag) | 93 | original_tag = self.get_starttag_text() |
| 94 | self.__validation_queue.append(original_tag) | ||
| 95 | |||
| 96 | if tag in self.tag_required_blacklist: | ||
| 97 | self.__in_dangerous_but_required_tag += 1 | ||
| 69 | if tag in self.tag_blacklist: | 98 | if tag in self.tag_blacklist: |
| 70 | self.__in_dangerous_tag += 1 | 99 | self.__in_dangerous_tag += 1 |
| 71 | return | ||
| 72 | 100 | ||
| 73 | if self.__in_dangerous_tag == 0: | 101 | if self.__in_dangerous_tag == 0: |
| 74 | self.__textrepr += self.get_starttag_text() | 102 | if self.__in_dangerous_but_required_tag <= 1: |
| 103 | self.__textrepr += original_tag | ||
| 75 | 104 | ||
| 76 | def handle_endtag(self, tag: str): | 105 | def handle_endtag(self, tag: str): |
| 77 | if not self.__validation_queue: | 106 | if not self.__validation_queue: |
| @@ -79,29 +108,43 @@ class _HTMLParser(parser.HTMLParser): | |||
| 79 | "opening one in %s." % (tag, self.filename)) | 108 | "opening one in %s." % (tag, self.filename)) |
| 80 | 109 | ||
| 81 | previous_tag = self.__validation_queue.pop() | 110 | previous_tag = self.__validation_queue.pop() |
| 82 | if tag != previous_tag: | 111 | previous_tag = previous_tag[1:-1] # remove < and > |
| 112 | previous_tag = previous_tag.split(' ')[0] # remove attributes | ||
| 113 | if tag != previous_tag.lower(): | ||
| 83 | raise ValueError("The closing tag %s doesn't match the previous " | 114 | raise ValueError("The closing tag %s doesn't match the previous " |
| 84 | "tag %s in %s" % | 115 | "tag %s in %s" % |
| 85 | (tag, previous_tag, self.filename)) | 116 | (tag, previous_tag, self.filename)) |
| 86 | elif tag in self.tag_blacklist: | ||
| 87 | self.__in_dangerous_tag -= 1 | ||
| 88 | return | ||
| 89 | 117 | ||
| 90 | if self.__in_dangerous_tag == 0: | 118 | if self.__in_dangerous_tag == 0: |
| 91 | # There is no `get_endtag_text()` method :/ | 119 | if self.__in_dangerous_but_required_tag <= 1: |
| 92 | self.__textrepr += '</' + tag + '>\n' | 120 | # There is no `get_endtag_text()` method :/ |
| 121 | self.__textrepr += '</' + previous_tag + '>' | ||
| 122 | |||
| 123 | if tag in self.tag_required_blacklist: | ||
| 124 | self.__in_dangerous_but_required_tag -= 1 | ||
| 125 | elif tag in self.tag_blacklist: | ||
| 126 | self.__in_dangerous_tag -= 1 | ||
| 93 | 127 | ||
| 94 | def handle_data(self, data: str): | 128 | def handle_data(self, data: str): |
| 95 | if self.__in_dangerous_tag == 0 and data.strip(): | 129 | if self.__in_dangerous_but_required_tag == 0: |
| 96 | self.__textrepr += data | 130 | if self.__in_dangerous_tag == 0: |
| 131 | if data.strip(): | ||
| 132 | self.__textrepr += escape(data) | ||
| 97 | 133 | ||
| 98 | def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]): | 134 | def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]): |
| 99 | if tag in self.tag_blacklist: | 135 | if tag in self.tag_required_blacklist | self.tag_blacklist: |
| 100 | meta = {k:v for k, v in attrs} | 136 | meta = {k:v for k, v in attrs} |
| 101 | name = meta.get('name', 'harmful metadata') | 137 | name = meta.get('name', 'harmful metadata') |
| 102 | content = meta.get('content', 'harmful data') | 138 | content = meta.get('content', 'harmful data') |
| 103 | self.__meta[name] = content | 139 | self.__meta[name] = content |
| 104 | else: | 140 | |
| 141 | if self.__in_dangerous_tag != 0: | ||
| 142 | return | ||
| 143 | elif tag in self.tag_required_blacklist: | ||
| 144 | self.__textrepr += '<' + tag + ' />' | ||
| 145 | return | ||
| 146 | |||
| 147 | if self.__in_dangerous_but_required_tag == 0: | ||
| 105 | if self.__in_dangerous_tag == 0: | 148 | if self.__in_dangerous_tag == 0: |
| 106 | self.__textrepr += self.get_starttag_text() | 149 | self.__textrepr += self.get_starttag_text() |
| 107 | 150 | ||
