From 02ff21b158c76fcd355a74ddb940e1c54fc2d7ed Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Wed, 20 Feb 2019 16:28:11 -0800
Subject: Implement epub support

---
 libmat2/web.py | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)
 create mode 100644 libmat2/web.py

(limited to 'libmat2/web.py')

diff --git a/libmat2/web.py b/libmat2/web.py
new file mode 100644
index 0000000..13d5fc8
--- /dev/null
+++ b/libmat2/web.py
@@ -0,0 +1,122 @@
+from html import parser
+from typing import Dict, Any, List, Tuple
+import re
+import string
+
+from . import abstract
+
+
+class CSSParser(abstract.AbstractParser):
+    """There is no such thing as metadata in CSS files,
+    only comments of the form `/* … */`, so we're removing the latter."""
+    mimetypes = {'text/css', }
+    flags = re.MULTILINE | re.DOTALL
+
+    def remove_all(self) -> bool:
+        with open(self.filename, encoding='utf-8') as f:
+            # `.*?` and not `.+?`, which lets `/**/` eat up to the next `*/`
+            cleaned = re.sub(r'/\*.*?\*/', '', f.read(), flags=self.flags)
+        with open(self.output_filename, 'w', encoding='utf-8') as f:
+            f.write(cleaned)
+        return True
+
+    def get_meta(self) -> Dict[str, Any]:
+        """Map comments' `key: value` lines; the others are 'harmful data'."""
+        metadata = {}
+        with open(self.filename, encoding='utf-8') as f:
+            cssdoc = re.findall(r'/\*(.*?)\*/', f.read(), self.flags)
+        for line in (ln for match in cssdoc for ln in match.splitlines()):
+            key, colon, value = line.partition(':')  # values may hold colons
+            if not colon:  # not a `key: value` line, report it as a whole
+                key, value = 'harmful data', line
+            metadata[key.strip(string.whitespace + '*')] = value.strip()
+        return metadata
+
+
+class HTMLParser(abstract.AbstractParser):  # a facade for `_HTMLParser`
+    mimetypes = {'application/x-dtbncx+xml', 'text/html'}
+    def __init__(self, filename):
+        super().__init__(filename)
+        with open(filename, encoding='utf-8') as markup:
+            self.__parser = _HTMLParser(self.filename)
+            self.__parser.feed(markup.read())  # the only parsing pass
+            self.__parser.close()
+
+    def get_meta(self) -> Dict[str, Any]:
+        return self.__parser.get_meta()  # delegated to the inner parser
+
+    def remove_all(self) -> bool:
+        return self.__parser.remove_all(self.output_filename)
+
+
+class _HTMLParser(parser.HTMLParser):
+    """Python doesn't have a validating html parser in its stdlib, so
+    we're using an internal queue to track all the opening/closing tags,
+    and hoping for the best; tags not properly closed raise `ValueError`."""
+    tag_blacklist = {'doctitle', 'meta'}  # everything is lowercase
+    def __init__(self, filename):
+        super().__init__()
+        self.filename = filename
+        self.__textrepr = ''
+        self.__meta = {}
+        self.__validation_queue = []  # (lowercased, as written) tag names
+        self.__in_dangerous_tag = 0  # a counter, to handle nested tags
+
+    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
+        raw_tag = self.get_starttag_text()
+        self.__validation_queue.append((tag, raw_tag[1:len(tag) + 1]))
+        if tag in self.tag_blacklist:
+            self.__in_dangerous_tag += 1
+        elif self.__in_dangerous_tag == 0:
+            self.__textrepr += raw_tag
+
+    def handle_endtag(self, tag: str):
+        if not self.__validation_queue:
+            raise ValueError("The closing tag %s doesn't have a corresponding "
+                             "opening one in %s." % (tag, self.filename))
+        previous_tag, written_tag = self.__validation_queue.pop()
+        if tag != previous_tag:
+            raise ValueError("The closing tag %s doesn't match the previous "
+                             "tag %s in %s" %
+                             (tag, previous_tag, self.filename))
+        elif tag in self.tag_blacklist:
+            self.__in_dangerous_tag -= 1
+        elif self.__in_dangerous_tag == 0:
+            # There is no `get_endtag_text()` method :/ The original case is
+            # reused, since `tag` is lowercased and XML is case-sensitive.
+            self.__textrepr += '</' + written_tag + '>\n'
+
+    def handle_data(self, data: str):
+        if self.__in_dangerous_tag or not data.strip():
+            return
+        opened = self.__validation_queue
+        if not opened or opened[-1][0] not in ('script', 'style'):
+            # References got decoded (except in script/style): re-escape
+            # them, or `&lt;b&gt;` would be written as a real `<b>` tag.
+            for char, ref in ('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;'):
+                data = data.replace(char, ref)
+        self.__textrepr += data
+
+    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
+        if tag in self.tag_blacklist:
+            meta = dict(attrs)
+            name = meta.get('name', 'harmful metadata')
+            self.__meta[name] = meta.get('content', 'harmful data')
+        elif self.__in_dangerous_tag == 0:
+            self.__textrepr += self.get_starttag_text()
+
+    def __check_closed(self):
+        unclosed = ', '.join(tag for tag, _ in self.__validation_queue)
+        if unclosed:
+            raise ValueError("Some tags (%s) were left unclosed in %s" %
+                             (unclosed, self.filename))
+
+    def remove_all(self, output_filename: str) -> bool:
+        self.__check_closed()
+        with open(output_filename, 'w', encoding='utf-8') as f:
+            f.write(self.__textrepr)
+        return True
+
+    def get_meta(self) -> Dict[str, Any]:
+        self.__check_closed()
+        return self.__meta
-- 
cgit v1.3