diff options
| author | jvoisin | 2019-02-20 16:28:11 -0800 |
|---|---|---|
| committer | jvoisin | 2019-02-20 16:28:11 -0800 |
| commit | 02ff21b158c76fcd355a74ddb940e1c54fc2d7ed (patch) | |
| tree | 701c6f5e316265e5a95a162356965ecf2fb8d6b2 /libmat2/web.py | |
| parent | 6b45064c784d03bb21ffaf7e50c9ba684e6985a9 (diff) | |
Implement epub support
Diffstat (limited to 'libmat2/web.py')
| -rw-r--r-- | libmat2/web.py | 122 |
1 files changed, 122 insertions, 0 deletions
diff --git a/libmat2/web.py b/libmat2/web.py new file mode 100644 index 0000000..13d5fc8 --- /dev/null +++ b/libmat2/web.py | |||
| @@ -0,0 +1,122 @@ | |||
| 1 | from html import parser | ||
| 2 | from typing import Dict, Any, List, Tuple | ||
| 3 | import re | ||
| 4 | import string | ||
| 5 | |||
| 6 | from . import abstract | ||
| 7 | |||
| 8 | |||
class CSSParser(abstract.AbstractParser):
    """There is no such thing as metadata in CSS files,
    only comments of the form `/* … */`, so we're removing the latter."""
    mimetypes = {'text/css', }
    flags = re.MULTILINE | re.DOTALL

    def remove_all(self) -> bool:
        """Write a copy of the file to `self.output_filename`
        with every `/* … */` comment stripped out."""
        with open(self.filename, encoding='utf-8') as f:
            cleaned = re.sub(r'/\*.+?\*/', '', f.read(), 0, self.flags)
        with open(self.output_filename, 'w', encoding='utf-8') as f:
            f.write(cleaned)
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Collect every `key: value` pair found inside comments;
        comment lines that aren't such a pair are surfaced under
        the generic 'harmful data' key."""
        metadata = {}  # type: Dict[str, Any]
        with open(self.filename, encoding='utf-8') as f:
            cssdoc = re.findall(r'/\*(.+?)\*/', f.read(), self.flags)
        for match in cssdoc:
            for line in match.splitlines():
                try:
                    # Split on the FIRST colon only: values frequently
                    # contain colons themselves (urls, timestamps, …) and
                    # a bare split(':') would wrongly reject those lines.
                    k, v = line.split(':', 1)
                    metadata[k.strip(string.whitespace + '*')] = v.strip()
                except ValueError:
                    metadata['harmful data'] = line.strip()
        return metadata
| 34 | |||
| 35 | |||
class HTMLParser(abstract.AbstractParser):
    """Thin facade over `_HTMLParser`: the whole document is fed to the
    internal parser at construction time, and both public operations
    simply delegate to it."""
    mimetypes = {'text/html', 'application/x-dtbncx+xml', }

    def __init__(self, filename):
        super().__init__(filename)
        self.__parser = _HTMLParser(self.filename)
        with open(filename, encoding='utf-8') as f:
            content = f.read()
        self.__parser.feed(content)
        self.__parser.close()

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata harvested while parsing."""
        return self.__parser.get_meta()

    def remove_all(self) -> bool:
        """Write a metadata-free version of the document to
        `self.output_filename`."""
        return self.__parser.remove_all(self.output_filename)
| 50 | |||
| 51 | |||
class _HTMLParser(parser.HTMLParser):
    """Python doesn't ship a *validating* html parser in its stdlib, so
    we keep our own stack of currently-open tags and hope for the best:
    mismatched or unclosed tags are reported as errors.
    """
    tag_blacklist = {'doctitle', 'meta'}  # everything is lowercase

    def __init__(self, filename):
        super().__init__()
        self.filename = filename
        self.__cleaned = ''          # metadata-free reconstruction of the doc
        self.__harvested = {}        # metadata found in blacklisted tags
        self.__tag_stack = []        # currently-open tags, innermost last
        # A counter rather than a boolean, so nested blacklisted tags work.
        self.__blacklist_depth = 0

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        self.__tag_stack.append(tag)
        if tag in self.tag_blacklist:
            self.__blacklist_depth += 1
        elif self.__blacklist_depth == 0:
            self.__cleaned += self.get_starttag_text()

    def handle_endtag(self, tag: str):
        if not self.__tag_stack:
            raise ValueError("The closing tag %s doesn't have a corresponding "
                             "opening one in %s." % (tag, self.filename))

        expected = self.__tag_stack.pop()
        if tag != expected:
            raise ValueError("The closing tag %s doesn't match the previous "
                             "tag %s in %s" %
                             (tag, expected, self.filename))

        if tag in self.tag_blacklist:
            self.__blacklist_depth -= 1
        elif self.__blacklist_depth == 0:
            # There is no `get_endtag_text()` method :/
            self.__cleaned += '</' + tag + '>\n'

    def handle_data(self, data: str):
        if data.strip() and self.__blacklist_depth == 0:
            self.__cleaned += data

    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
        if tag not in self.tag_blacklist:
            if self.__blacklist_depth == 0:
                self.__cleaned += self.get_starttag_text()
            return
        # A self-closing blacklisted tag (typically <meta …/>): harvest
        # its name/content pair instead of copying it through.
        attributes = dict(attrs)
        name = attributes.get('name', 'harmful metadata')
        self.__harvested[name] = attributes.get('content', 'harmful data')

    def remove_all(self, output_filename: str) -> bool:
        """Write the tag-scrubbed document to `output_filename`."""
        self.__assert_balanced()
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(self.__cleaned)
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata harvested from blacklisted tags."""
        self.__assert_balanced()
        return self.__harvested

    def __assert_balanced(self):
        # Refuse to produce any output for a document with unclosed tags.
        if self.__tag_stack:
            raise ValueError("Some tags (%s) were left unclosed in %s" % (
                ', '.join(self.__tag_stack),
                self.filename))
