summaryrefslogtreecommitdiff
path: root/libmat2/web.py
diff options
context:
space:
mode:
Diffstat (limited to 'libmat2/web.py')
-rw-r--r--libmat2/web.py122
1 files changed, 122 insertions, 0 deletions
diff --git a/libmat2/web.py b/libmat2/web.py
new file mode 100644
index 0000000..13d5fc8
--- /dev/null
+++ b/libmat2/web.py
@@ -0,0 +1,122 @@
1from html import parser
2from typing import Dict, Any, List, Tuple
3import re
4import string
5
6from . import abstract
7
8
class CSSParser(abstract.AbstractParser):
    """Report and strip the comments of a CSS file.

    There is no such thing as metadata in CSS files, only comments of the
    form `/* … */`, so the latter are what gets reported and removed.
    """
    mimetypes = {'text/css', }
    # DOTALL lets `.` cross line breaks, so multi-line comments are matched.
    flags = re.MULTILINE | re.DOTALL

    def remove_all(self) -> bool:
        """Write the file's content, minus every comment, to the output file.

        Reads `self.filename` and writes to `self.output_filename`, both as
        UTF-8.

        Returns:
            Always `True`.
        """
        with open(self.filename, encoding='utf-8') as f:
            content = f.read()
        # `.*?` rather than `.+?`: an empty comment (`/**/`) must be matched
        # on its own. With `.+?` the match would run on until the *next*
        # `*/`, swallowing every rule in between.
        # `flags` is passed by keyword: passing count/flags positionally to
        # re.sub is deprecated.
        cleaned = re.sub(r'/\*.*?\*/', '', content, flags=self.flags)
        with open(self.output_filename, 'w', encoding='utf-8') as f:
            f.write(cleaned)
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Collect what is written inside the comments of the file.

        Every comment line shaped like `key: value` (exactly one colon) is
        stored under `key`, stripped of surrounding whitespace and `*`.
        Any other line is stored under the single key 'harmful data', so
        only the last such line is kept.

        Returns:
            The mapping described above; empty when there are no comments.
        """
        metadata = {}
        with open(self.filename, encoding='utf-8') as f:
            content = f.read()
        # Same pattern as in remove_all, so that what is reported here is
        # exactly what gets removed there.
        for comment in re.findall(r'/\*(.*?)\*/', content, self.flags):
            for line in comment.splitlines():
                try:
                    # Unpacking raises ValueError unless there is exactly
                    # one colon on the line.
                    k, v = line.split(':')
                    metadata[k.strip(string.whitespace + '*')] = v.strip()
                except ValueError:
                    metadata['harmful data'] = line.strip()
        return metadata
34
35
class HTMLParser(abstract.AbstractParser):
    """Front-end for HTML-like files; the work is delegated to `_HTMLParser`.

    The whole file is read and parsed when the object is constructed, so
    any error raised by the underlying parser while feeding it surfaces
    from the constructor.
    """
    mimetypes = {'text/html', 'application/x-dtbncx+xml', }

    def __init__(self, filename):
        super().__init__(filename)
        self.__parser = _HTMLParser(self.filename)
        # Read everything first, then hand the markup over in one go.
        with open(filename, encoding='utf-8') as f:
            markup = f.read()
        self.__parser.feed(markup)
        self.__parser.close()

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata gathered by the underlying parser."""
        collected = self.__parser.get_meta()
        return collected

    def remove_all(self) -> bool:
        """Have the underlying parser write the cleaned file.

        The destination is `self.output_filename`.
        """
        destination = self.output_filename
        return self.__parser.remove_all(destination)
50
51
class _HTMLParser(parser.HTMLParser):
    """Rebuild an HTML document without its blacklisted tags.

    Python doesn't have a validating html parser in its stdlib, so
    we're using an internal queue to track all the opening/closing tags,
    and hoping for the best.

    The document is re-serialised piece by piece while it is parsed:
    start tags are copied verbatim, end tags are regenerated, text is
    re-escaped, and everything inside a blacklisted tag is dropped.
    """
    tag_blacklist = {'doctitle', 'meta'}  # everything is lowercase
    # The stdlib parser passes the content of these elements through
    # verbatim (no character reference decoding), so it must be written
    # back verbatim as well.
    _raw_text_tags = {'script', 'style'}

    def __init__(self, filename):
        """
        Args:
            filename: only used to make error messages more helpful; the
                file itself is never opened here.
        """
        super().__init__()
        self.filename = filename
        # Pieces of the cleaned document, joined once in remove_all().
        self.__chunks = []
        self.__meta = {}
        self.__validation_queue = []
        # We're using a counter instead of a boolean to handle nested tags
        self.__in_dangerous_tag = 0

    def __ensure_everything_is_closed(self):
        """Raise ValueError if some opened tags were never closed."""
        if self.__validation_queue:
            raise ValueError("Some tags (%s) were left unclosed in %s" % (
                ', '.join(self.__validation_queue),
                self.filename))

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Track the opening tag, and copy it unless it has to be dropped."""
        self.__validation_queue.append(tag)
        if tag in self.tag_blacklist:
            self.__in_dangerous_tag += 1
            return

        if self.__in_dangerous_tag == 0:
            self.__chunks.append(self.get_starttag_text())

    def handle_endtag(self, tag: str):
        """Check the closing tag against the last opened one, then emit it.

        Raises:
            ValueError: if there is no opened tag left, or if the last
                opened tag isn't `tag`.
        """
        if not self.__validation_queue:
            raise ValueError("The closing tag %s doesn't have a corresponding "
                             "opening one in %s." % (tag, self.filename))

        previous_tag = self.__validation_queue.pop()
        if tag != previous_tag:
            raise ValueError("The closing tag %s doesn't match the previous "
                             "tag %s in %s" %
                             (tag, previous_tag, self.filename))
        elif tag in self.tag_blacklist:
            self.__in_dangerous_tag -= 1
            return

        if self.__in_dangerous_tag == 0:
            # There is no `get_endtag_text()` method :/
            self.__chunks.append('</' + tag + '>\n')

    def handle_data(self, data: str):
        """Copy text content, skipping whitespace-only runs and dropped tags.

        The parser hands over text with its character references already
        decoded (`&lt;` arrives as `<`). Writing that back untouched would
        turn inert text into markup, so it is escaped again on the way out.
        """
        # Local import: the module only imports `parser` from `html`.
        from html import escape

        if self.__in_dangerous_tag != 0 or not data.strip():
            return

        opened = self.__validation_queue
        if opened and opened[-1] in self._raw_text_tags:
            # Raw text (script/style) was not decoded, so escaping it
            # would corrupt it.
            self.__chunks.append(data)
        else:
            # quote=False: quotes are harmless in text content, so only
            # `&`, `<` and `>` are escaped.
            self.__chunks.append(escape(data, quote=False))

    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Handle self-closing tags (`<tag … />`).

        A blacklisted one is dropped and recorded as metadata, using its
        `name` and `content` attributes when present; any other is copied
        verbatim.
        """
        if tag in self.tag_blacklist:
            meta = dict(attrs)
            name = meta.get('name', 'harmful metadata')
            content = meta.get('content', 'harmful data')
            self.__meta[name] = content
        elif self.__in_dangerous_tag == 0:
            self.__chunks.append(self.get_starttag_text())

    def remove_all(self, output_filename: str) -> bool:
        """Write the cleaned document to `output_filename`, as UTF-8.

        Returns:
            Always `True`.

        Raises:
            ValueError: if some tags were left unclosed.
        """
        self.__ensure_everything_is_closed()
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(''.join(self.__chunks))
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata found in self-closing blacklisted tags.

        Raises:
            ValueError: if some tags were left unclosed.
        """
        self.__ensure_everything_is_closed()
        return self.__meta