diff options
| author | jvoisin | 2019-02-20 16:28:11 -0800 |
|---|---|---|
| committer | jvoisin | 2019-02-20 16:28:11 -0800 |
| commit | 02ff21b158c76fcd355a74ddb940e1c54fc2d7ed (patch) | |
| tree | 701c6f5e316265e5a95a162356965ecf2fb8d6b2 /libmat2/web.py | |
| parent | 6b45064c784d03bb21ffaf7e50c9ba684e6985a9 (diff) | |
Implement epub support
Diffstat (limited to 'libmat2/web.py')
| -rw-r--r-- | libmat2/web.py | 122 |
1 files changed, 122 insertions, 0 deletions
diff --git a/libmat2/web.py b/libmat2/web.py new file mode 100644 index 0000000..13d5fc8 --- /dev/null +++ b/libmat2/web.py | |||
| @@ -0,0 +1,122 @@ | |||
| 1 | from html import parser | ||
| 2 | from typing import Dict, Any, List, Tuple | ||
| 3 | import re | ||
| 4 | import string | ||
| 5 | |||
| 6 | from . import abstract | ||
| 7 | |||
| 8 | |||
class CSSParser(abstract.AbstractParser):
    """There is no such thing as metadata in CSS files,
    only comments of the form `/* … */`, so we're removing the latter."""
    mimetypes = {'text/css', }
    flags = re.MULTILINE | re.DOTALL

    def remove_all(self) -> bool:
        """Write a copy of the file to `self.output_filename`
        with every `/* … */` comment stripped out."""
        with open(self.filename, encoding='utf-8') as f:
            cleaned = re.sub(r'/\*.+?\*/', '', f.read(), 0, self.flags)
        with open(self.output_filename, 'w', encoding='utf-8') as f:
            f.write(cleaned)
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Collect every `key: value` pair found inside comments;
        comment lines that aren't such a pair are surfaced under
        the generic 'harmful data' key."""
        metadata = {}  # type: Dict[str, Any]
        with open(self.filename, encoding='utf-8') as f:
            cssdoc = re.findall(r'/\*(.+?)\*/', f.read(), self.flags)
        for match in cssdoc:
            for line in match.splitlines():
                try:
                    # Split on the FIRST colon only: values frequently
                    # contain colons themselves (urls, timestamps, …) and
                    # a bare split(':') would wrongly reject those lines.
                    k, v = line.split(':', 1)
                    metadata[k.strip(string.whitespace + '*')] = v.strip()
                except ValueError:
                    metadata['harmful data'] = line.strip()
        return metadata
| 34 | |||
| 35 | |||
class HTMLParser(abstract.AbstractParser):
    """Thin facade over `_HTMLParser`: the whole document is fed to the
    internal parser at construction time, and both public operations
    simply delegate to it."""
    mimetypes = {'text/html', 'application/x-dtbncx+xml', }

    def __init__(self, filename):
        super().__init__(filename)
        self.__parser = _HTMLParser(self.filename)
        with open(filename, encoding='utf-8') as f:
            content = f.read()
        self.__parser.feed(content)
        self.__parser.close()

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata harvested while parsing."""
        return self.__parser.get_meta()

    def remove_all(self) -> bool:
        """Write a metadata-free version of the document to
        `self.output_filename`."""
        return self.__parser.remove_all(self.output_filename)
| 50 | |||
| 51 | |||
class _HTMLParser(parser.HTMLParser):
    """Python doesn't ship a *validating* html parser in its stdlib, so
    we keep our own stack of currently-open tags and hope for the best:
    mismatched or unclosed tags are reported as errors.
    """
    tag_blacklist = {'doctitle', 'meta'}  # everything is lowercase

    def __init__(self, filename):
        super().__init__()
        self.filename = filename
        self.__cleaned = ''          # metadata-free reconstruction of the doc
        self.__harvested = {}        # metadata found in blacklisted tags
        self.__tag_stack = []        # currently-open tags, innermost last
        # A counter rather than a boolean, so nested blacklisted tags work.
        self.__blacklist_depth = 0

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        self.__tag_stack.append(tag)
        if tag in self.tag_blacklist:
            self.__blacklist_depth += 1
        elif self.__blacklist_depth == 0:
            self.__cleaned += self.get_starttag_text()

    def handle_endtag(self, tag: str):
        if not self.__tag_stack:
            raise ValueError("The closing tag %s doesn't have a corresponding "
                             "opening one in %s." % (tag, self.filename))

        expected = self.__tag_stack.pop()
        if tag != expected:
            raise ValueError("The closing tag %s doesn't match the previous "
                             "tag %s in %s" %
                             (tag, expected, self.filename))

        if tag in self.tag_blacklist:
            self.__blacklist_depth -= 1
        elif self.__blacklist_depth == 0:
            # There is no `get_endtag_text()` method :/
            self.__cleaned += '</' + tag + '>\n'

    def handle_data(self, data: str):
        if data.strip() and self.__blacklist_depth == 0:
            self.__cleaned += data

    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
        if tag not in self.tag_blacklist:
            if self.__blacklist_depth == 0:
                self.__cleaned += self.get_starttag_text()
            return
        # A self-closing blacklisted tag (typically <meta …/>): harvest
        # its name/content pair instead of copying it through.
        attributes = dict(attrs)
        name = attributes.get('name', 'harmful metadata')
        self.__harvested[name] = attributes.get('content', 'harmful data')

    def remove_all(self, output_filename: str) -> bool:
        """Write the tag-scrubbed document to `output_filename`."""
        self.__assert_balanced()
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(self.__cleaned)
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata harvested from blacklisted tags."""
        self.__assert_balanced()
        return self.__harvested

    def __assert_balanced(self):
        # Refuse to produce any output for a document with unclosed tags.
        if self.__tag_stack:
            raise ValueError("Some tags (%s) were left unclosed in %s" % (
                ', '.join(self.__tag_stack),
                self.filename))
