libmat2/web.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173

from html import parser, escape
from typing import Dict, Any, List, Tuple, Set
import re
import string

from . import abstract

# `Set` is only referenced from `# type:` comments in this module; this
# no-op statement presumably exists to keep linters from reporting the
# import as unused.
assert Set

# pylint: disable=too-many-instance-attributes

class CSSParser(abstract.AbstractParser):
    """There is no such thing as metadata in CSS files,
    only comments of the form `/* … */`, so we're removing the latter."""
    mimetypes = {'text/css', }
    # DOTALL lets `.` span newlines, so that multi-line comments are matched.
    flags = re.MULTILINE | re.DOTALL

    def remove_all(self) -> bool:
        """Write the content of `self.filename`, stripped of all its
        comments, into `self.output_filename`. Always returns `True`."""
        with open(self.filename, encoding='utf-8') as f:
            # `count` and `flags` are passed by keyword, since passing
            # them positionally is deprecated as of Python 3.13.
            cleaned = re.sub(r'/\*.*?\*/', '', f.read(), count=0,
                             flags=self.flags)
        with open(self.output_filename, 'w', encoding='utf-8') as f:
            f.write(cleaned)
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Return what is found in the comments of `self.filename`.

        Every line of every comment is inspected: lines of the form
        `key: value` are stored under `key`, and anything else is stored
        under the `harmful data` key, the last such line winning.
        """
        metadata = {}
        with open(self.filename, encoding='utf-8') as f:
            cssdoc = re.findall(r'/\*(.*?)\*/', f.read(), self.flags)
        for match in cssdoc:
            for line in match.splitlines():
                # Only the first colon separates the key from its value,
                # so that values like urls or timestamps are kept whole
                # instead of being flagged as free-form harmful data.
                key, separator, value = line.partition(':')
                if separator:
                    metadata[key.strip(string.whitespace + '*')] = value.strip()
                else:
                    metadata['harmful data'] = line.strip()
        return metadata


class AbstractHTMLParser(abstract.AbstractParser):
    """Common base of the html/xml-flavoured parsers.

    The whole file is run through a `_HTMLParser` when the object is
    built; `get_meta` and `remove_all` simply hand over to that instance.
    Subclasses tune the behaviour via the two class-level sets below.
    """
    # Tags handed to `_HTMLParser` as its `blacklisted_tags`.
    tags_blacklist = set()  # type: Set[str]
    # Some html/xml-based formats have mandatory tags: those are kept,
    # but their content is thrown away.
    tags_required_blacklist = set()  # type: Set[str]

    def __init__(self, filename):
        super().__init__(filename)
        html_parser = _HTMLParser(
            self.filename,
            self.tags_blacklist,
            self.tags_required_blacklist,
        )
        self.__parser = html_parser
        with open(filename, encoding='utf-8') as source:
            markup = source.read()
            html_parser.feed(markup)
        # Flush whatever the parser is still buffering.
        html_parser.close()

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata collected by the underlying parser."""
        collected = self.__parser.get_meta()
        return collected

    def remove_all(self) -> bool:
        """Have the underlying parser write the cleaned document to
        `self.output_filename`, and return its verdict."""
        destination = self.output_filename
        return self.__parser.remove_all(destination)


class HTMLParser(AbstractHTMLParser):
    """Parser for `text/html` files: `meta` tags are blacklisted, and
    `title` tags are kept, but without their content."""
    mimetypes = {'text/html'}
    tags_required_blacklist = {'title'}
    tags_blacklist = {'meta'}


class DTBNCXParser(AbstractHTMLParser):
    """Parser for `application/x-dtbncx+xml` files: the `title`,
    `doctitle` and `meta` tags are kept, but without their content."""
    mimetypes = {'application/x-dtbncx+xml'}
    tags_required_blacklist = {'meta', 'doctitle', 'title'}


class _HTMLParser(parser.HTMLParser):
    """Python doesn't have a validating html parser in its stdlib, so
    we're using an internal queue to track all the opening/closing tags,
    and hoping for the best.

    Moreover, the parser.HTMLParser class doesn't provide a get_endtag_text
    method, so we have to use get_starttag_text instead, put its result in a
    LIFO, and transform it into a closing tag when needed.

    Every opening tag has to be explicitly closed, in order: an element
    written without a trailing slash and never closed (like `<br>`) ends
    up raising a ValueError.

    Also, gotcha: the `tag` parameters are always in lowercase.
    """
    def __init__(self, filename, blacklisted_tags, required_blacklisted_tags):
        """
        :param filename: only used to build error messages.
        :param blacklisted_tags: tags that are dropped along with
            everything they contain.
        :param required_blacklisted_tags: tags that are kept, but whose
            content is dropped.
        :raises ValueError: if the two sets of tags overlap.
        """
        super().__init__()
        self.filename = filename
        # Pieces of the cleaned document, joined only once when writing it:
        # repeatedly doing `+=` on a string attribute is quadratic.
        self.__chunks = []  # type: List[str]
        self.__meta = {}  # type: Dict[str, Any]
        self.__validation_queue = []  # type: List[str]

        # We're using counters instead of booleans, to handle nested tags
        self.__in_dangerous_but_required_tag = 0
        self.__in_dangerous_tag = 0

        if required_blacklisted_tags & blacklisted_tags:  # pragma: nocover
            raise ValueError("There is an overlap between %s and %s" % (
                required_blacklisted_tags, blacklisted_tags))
        self.tag_required_blacklist = required_blacklisted_tags
        self.tag_blacklist = blacklisted_tags

    @staticmethod
    def __tag_name(starttag_text: str) -> str:
        """Return the tag name, in its original case, out of the raw text
        of an opening tag like `<A href="…">`."""
        inner = starttag_text[1:-1]  # remove < and >
        for position, char in enumerate(inner):
            # The name isn't necessarily followed by a plain space: tags
            # are commonly broken over several lines, or aligned with tabs.
            if char in ' \t\n\r\f':
                return inner[:position]  # remove attributes
        return inner

    def __emit(self, text: str) -> None:
        """Add `text` to the output, unless we're currently inside of a
        tag whose content has to be discarded."""
        if self.__in_dangerous_tag == 0:
            if self.__in_dangerous_but_required_tag == 0:
                self.__chunks.append(text)

    def __ensure_everything_is_closed(self) -> None:
        """Raise a ValueError if some opening tags were never closed."""
        if self.__validation_queue:
            raise ValueError("Some tags (%s) were left unclosed in %s" % (
                ', '.join(self.__validation_queue),
                self.filename))

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Record the opening tag for later validation, and copy it to the
        output if we're not inside of a discarded section."""
        original_tag = self.get_starttag_text()
        self.__validation_queue.append(original_tag)

        # A blacklisted tag is itself discarded, hence the counter being
        # incremented before emitting…
        if tag in self.tag_blacklist:
            self.__in_dangerous_tag += 1

        self.__emit(original_tag)

        # …while a required one is kept, hence the counter being
        # incremented afterwards, to discard only its content.
        if tag in self.tag_required_blacklist:
            self.__in_dangerous_but_required_tag += 1

    def handle_endtag(self, tag: str):
        """Check that the closing tag matches the last opened one, and
        copy it to the output if we're not inside of a discarded section.

        :raises ValueError: if there is no opening tag left to close, or
            if the last opened tag isn't a `tag` one.
        """
        if not self.__validation_queue:
            raise ValueError("The closing tag %s doesn't have a corresponding "
                             "opening one in %s." % (tag, self.filename))

        previous_tag = self.__tag_name(self.__validation_queue.pop())
        if tag != previous_tag.lower():
            raise ValueError("The closing tag %s doesn't match the previous "
                             "tag %s in %s" %
                             (tag, previous_tag, self.filename))

        # Mirror of handle_starttag: the closing tag of a required tag is
        # kept, the one of a blacklisted tag is discarded.
        if tag in self.tag_required_blacklist:
            self.__in_dangerous_but_required_tag -= 1

        # There is no `get_endtag_text()` method :/
        self.__emit('</' + previous_tag + '>')

        if tag in self.tag_blacklist:
            self.__in_dangerous_tag -= 1

    def handle_data(self, data: str):
        """Copy the (escaped) text to the output, unless it is made only
        of whitespace, or we're inside of a discarded section."""
        if data.strip():
            self.__emit(escape(data))

    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Handle self-closing tags, like `<meta name="…" content="…" />`.

        The `name` and `content` attributes of blacklisted tags are stored
        as metadata; other tags are copied verbatim to the output if we're
        not inside of a discarded section.
        """
        if tag in self.tag_required_blacklist or tag in self.tag_blacklist:
            meta = dict(attrs)
            name = meta.get('name', 'harmful metadata')
            content = meta.get('content', 'harmful data')
            self.__meta[name] = content

            if self.__in_dangerous_tag == 0:
                if tag in self.tag_required_blacklist:
                    # The tag is mandatory: keep it, without its attributes
                    self.__chunks.append('<' + tag + ' />')
                return

        self.__emit(self.get_starttag_text())

    def remove_all(self, output_filename: str) -> bool:
        """Write the cleaned document into `output_filename`.

        :returns: always `True`.
        :raises ValueError: if some tags were left unclosed.
        """
        self.__ensure_everything_is_closed()
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(''.join(self.__chunks))
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata found in the document.

        :raises ValueError: if some tags were left unclosed.
        """
        self.__ensure_everything_is_closed()
        return self.__meta