summaryrefslogtreecommitdiff
path: root/libmat2/html.py
diff options
context:
space:
mode:
author: jvoisin, 2019-02-08 00:26:47 +0100
committer: jvoisin, 2019-02-08 23:05:18 +0100
commit: 6cc034e81bd0cea98dffe4d7311f3bd16178b63e (patch)
tree: 319ec5a697a1a1c49089084c09b0a30cbd4983f7 /libmat2/html.py
parent: e1dd439fc86ba15816e2331e8bed67dd7147e368 (diff)
Add support for html files
Diffstat (limited to 'libmat2/html.py')
-rw-r--r-- libmat2/html.py 69
1 file changed, 69 insertions, 0 deletions
diff --git a/libmat2/html.py b/libmat2/html.py
new file mode 100644
index 0000000..d0e9a2b
--- /dev/null
+++ b/libmat2/html.py
@@ -0,0 +1,69 @@
1from html import parser
2from typing import Dict, Any, List, Tuple
3
4from . import abstract
5
6
class HTMLParser(abstract.AbstractParser):
    """Parser for `text/html` files.

    The whole file is parsed once, at construction time, by a private
    `_HTMLParser`; `get_meta` and `remove_all` simply forward to it.
    """
    mimetypes = {'text/html', }

    def __init__(self, filename):
        """Read `filename` and feed its whole content to the html parser.

        Any exception raised while opening, reading or parsing the file
        is propagated to the caller.
        """
        super().__init__(filename)
        # NOTE: no explicit encoding is given, so the file is decoded with
        # the platform's default text encoding.
        with open(filename) as source:
            content = source.read()
        html_parser = _HTMLParser()
        html_parser.feed(content)
        html_parser.close()
        self.__parser = html_parser

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata collected by the underlying parser."""
        collected = self.__parser.get_meta()
        return collected

    def remove_all(self) -> bool:
        """Write the cleaned document to `self.output_filename`.

        `output_filename` is not set in this class; it is presumably
        provided by `abstract.AbstractParser` -- verify against the base.
        """
        destination = self.output_filename
        return self.__parser.remove_all(destination)
21
22
23class _HTMLParser(parser.HTMLParser):
24 """Python doesn't have a validating html parser in its stdlib, so
25 we're using an internal queue to track all the opening/closing tags,
26 and hoping for the best.
27 """
28 def __init__(self):
29 super().__init__()
30 self.__textrepr = ''
31 self.__meta = {}
32 self.__validation_queue = []
33
34 def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
35 self.__textrepr += self.get_starttag_text()
36 self.__validation_queue.append(tag)
37
38 def handle_endtag(self, tag: str):
39 if not self.__validation_queue:
40 raise ValueError
41 elif tag != self.__validation_queue.pop():
42 raise ValueError
43 # There is no `get_endtag_text()` method :/
44 self.__textrepr += '</' + tag + '>\n'
45
46 def handle_data(self, data: str):
47 if data.strip():
48 self.__textrepr += data
49
50 def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
51 if tag == 'meta':
52 meta = {k:v for k, v in attrs}
53 name = meta.get('name', 'harmful metadata')
54 content = meta.get('content', 'harmful data')
55 self.__meta[name] = content
56 else:
57 self.__textrepr += self.get_starttag_text()
58
59 def remove_all(self, output_filename: str) -> bool:
60 if self.__validation_queue:
61 raise ValueError
62 with open(output_filename, 'w') as f:
63 f.write(self.__textrepr)
64 return True
65
66 def get_meta(self) -> Dict[str, Any]:
67 if self.__validation_queue:
68 raise ValueError
69 return self.__meta