summaryrefslogtreecommitdiff
path: root/libmat2
diff options
context:
space:
mode:
authorjvoisin2019-02-20 16:28:11 -0800
committerjvoisin2019-02-20 16:28:11 -0800
commit02ff21b158c76fcd355a74ddb940e1c54fc2d7ed (patch)
tree701c6f5e316265e5a95a162356965ecf2fb8d6b2 /libmat2
parent6b45064c784d03bb21ffaf7e50c9ba684e6985a9 (diff)
Implement epub support
Diffstat (limited to 'libmat2')
-rw-r--r--libmat2/epub.py47
-rw-r--r--libmat2/html.py69
-rw-r--r--libmat2/parser_factory.py9
-rw-r--r--libmat2/web.py122
4 files changed, 177 insertions, 70 deletions
diff --git a/libmat2/epub.py b/libmat2/epub.py
new file mode 100644
index 0000000..09b7937
--- /dev/null
+++ b/libmat2/epub.py
@@ -0,0 +1,47 @@
1import logging
2import re
3import xml.etree.ElementTree as ET # type: ignore
4
5from . import archive, office
6
class EPUBParser(archive.ArchiveBasedAbstractParser):
    """Parser for .epub files, handled as zip archives whose
    metadata lives in the OPF manifest (`OEBPS/content.opf`)."""
    mimetypes = {'application/epub+zip', }

    def __init__(self, filename):
        super().__init__(filename)
        # These members are structural parts of a valid epub,
        # so the cleaning pass must not drop them.
        self.files_to_keep = set(map(re.compile, {  # type: ignore
            'META-INF/container.xml',
            'mimetype',
            'OEBPS/content.opf',
            }))

    def _specific_get_meta(self, full_path, file_path):
        # Only the OPF manifest carries document-level metadata.
        if file_path != 'OEBPS/content.opf':
            return {}

        with open(full_path, encoding='utf-8') as f:
            try:
                pairs = re.findall(r"<((?:meta|dc|cp).+?)[^>]*>(.+)</\1>",
                                   f.read(), re.I|re.M)
                return dict(pairs)
            except (TypeError, UnicodeDecodeError):
                # We didn't manage to parse the xml file
                return {file_path: 'harmful content', }

    def _specific_cleanup(self, full_path: str):
        # Members other than the OPF manifest need no scrubbing.
        if not full_path.endswith('OEBPS/content.opf'):
            return True

        try:
            tree, namespace = office._parse_xml(full_path)
        except ET.ParseError:
            logging.error("Unable to parse %s in %s.", full_path, self.filename)
            return False

        # ElementTree has no parent pointers: build a child -> parent map
        # so the <metadata> node can be detached from its parent.
        parents = {child: parent for parent in tree.iter() for child in parent}

        for node in tree.iterfind('.//', namespace):
            if node.tag.strip().lower().endswith('metadata'):
                parents[node].remove(node)
                break  # there is only a single <metadata> block
        tree.write(full_path, xml_declaration=True)
        return True
diff --git a/libmat2/html.py b/libmat2/html.py
deleted file mode 100644
index d0e9a2b..0000000
--- a/libmat2/html.py
+++ /dev/null
@@ -1,69 +0,0 @@
1from html import parser
2from typing import Dict, Any, List, Tuple
3
4from . import abstract
5
6
class HTMLParser(abstract.AbstractParser):
    """Thin wrapper: all the work is delegated to a validating
    `_HTMLParser` instance fed with the whole file at construction time."""
    mimetypes = {'text/html', }
    def __init__(self, filename):
        super().__init__(filename)
        self.__parser = _HTMLParser()
        with open(filename) as f:
            content = f.read()
        self.__parser.feed(content)
        self.__parser.close()

    def get_meta(self) -> Dict[str, Any]:
        # Metadata collected from <meta/> tags during the initial feed.
        return self.__parser.get_meta()

    def remove_all(self) -> bool:
        # Writes the metadata-free rendition to the output file.
        return self.__parser.remove_all(self.output_filename)
21
22
23class _HTMLParser(parser.HTMLParser):
24 """Python doesn't have a validating html parser in its stdlib, so
25 we're using an internal queue to track all the opening/closing tags,
26 and hoping for the best.
27 """
28 def __init__(self):
29 super().__init__()
30 self.__textrepr = ''
31 self.__meta = {}
32 self.__validation_queue = []
33
34 def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
35 self.__textrepr += self.get_starttag_text()
36 self.__validation_queue.append(tag)
37
38 def handle_endtag(self, tag: str):
39 if not self.__validation_queue:
40 raise ValueError
41 elif tag != self.__validation_queue.pop():
42 raise ValueError
43 # There is no `get_endtag_text()` method :/
44 self.__textrepr += '</' + tag + '>\n'
45
46 def handle_data(self, data: str):
47 if data.strip():
48 self.__textrepr += data
49
50 def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
51 if tag == 'meta':
52 meta = {k:v for k, v in attrs}
53 name = meta.get('name', 'harmful metadata')
54 content = meta.get('content', 'harmful data')
55 self.__meta[name] = content
56 else:
57 self.__textrepr += self.get_starttag_text()
58
59 def remove_all(self, output_filename: str) -> bool:
60 if self.__validation_queue:
61 raise ValueError
62 with open(output_filename, 'w') as f:
63 f.write(self.__textrepr)
64 return True
65
66 def get_meta(self) -> Dict[str, Any]:
67 if self.__validation_queue:
68 raise ValueError
69 return self.__meta
diff --git a/libmat2/parser_factory.py b/libmat2/parser_factory.py
index 30c3b52..e93ee4f 100644
--- a/libmat2/parser_factory.py
+++ b/libmat2/parser_factory.py
@@ -1,3 +1,4 @@
1import logging
1import glob 2import glob
2import os 3import os
3import mimetypes 4import mimetypes
@@ -10,6 +11,10 @@ assert Tuple # make pyflakes happy
10 11
11T = TypeVar('T', bound='abstract.AbstractParser') 12T = TypeVar('T', bound='abstract.AbstractParser')
12 13
14mimetypes.add_type('application/epub+zip', '.epub')
15# EPUB Navigation Control XML File
16mimetypes.add_type('application/x-dtbncx+xml', '.ncx')
17
13 18
14def __load_all_parsers(): 19def __load_all_parsers():
15 """ Loads every parser in a dynamic way """ 20 """ Loads every parser in a dynamic way """
@@ -49,6 +54,8 @@ def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]:
49 if mtype in parser_class.mimetypes: 54 if mtype in parser_class.mimetypes:
50 try: 55 try:
51 return parser_class(filename), mtype 56 return parser_class(filename), mtype
52 except ValueError: 57 except ValueError as e:
58 logging.info("Got an exception when trying to instanciate "
59 "%s for %s: %s", parser_class, filename, e)
53 return None, mtype 60 return None, mtype
54 return None, mtype 61 return None, mtype
diff --git a/libmat2/web.py b/libmat2/web.py
new file mode 100644
index 0000000..13d5fc8
--- /dev/null
+++ b/libmat2/web.py
@@ -0,0 +1,122 @@
1from html import parser
2from typing import Dict, Any, List, Tuple
3import re
4import string
5
6from . import abstract
7
8
class CSSParser(abstract.AbstractParser):
    """There is no such thing as metadata in CSS files,
    only comments of the form `/* … */`, so we're removing the latter."""
    mimetypes = {'text/css', }
    flags = re.MULTILINE | re.DOTALL

    def remove_all(self) -> bool:
        """Strip every `/* … */` comment from the stylesheet and write
        the cleaned content to `self.output_filename`."""
        with open(self.filename, encoding='utf-8') as f:
            cleaned = re.sub(r'/\*.+?\*/', '', f.read(), 0, self.flags)
        with open(self.output_filename, 'w', encoding='utf-8') as f:
            f.write(cleaned)
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Return the `key: value` pairs found inside comments;
        colon-less lines are reported under the 'harmful data' key."""
        metadata = {}
        with open(self.filename, encoding='utf-8') as f:
            cssdoc = re.findall(r'/\*(.+?)\*/', f.read(), self.flags)
        for match in cssdoc:
            for line in match.splitlines():
                try:
                    # Split on the first colon only, so values that
                    # themselves contain colons (urls, timestamps, …)
                    # are kept intact instead of raising ValueError.
                    k, v = line.split(':', 1)
                    metadata[k.strip(string.whitespace + '*')] = v.strip()
                except ValueError:
                    metadata['harmful data'] = line.strip()
        return metadata
34
35
class HTMLParser(abstract.AbstractParser):
    """Thin wrapper around `_HTMLParser`, which does the validation,
    text accumulation and metadata harvesting during the initial feed."""
    mimetypes = {'text/html', 'application/x-dtbncx+xml', }
    def __init__(self, filename):
        super().__init__(filename)
        self.__parser = _HTMLParser(self.filename)
        with open(filename, encoding='utf-8') as f:
            content = f.read()
        self.__parser.feed(content)
        self.__parser.close()

    def get_meta(self) -> Dict[str, Any]:
        # Metadata gathered from blacklisted tags while feeding.
        return self.__parser.get_meta()

    def remove_all(self) -> bool:
        # Writes the metadata-free rendition to the output file.
        return self.__parser.remove_all(self.output_filename)
50
51
52class _HTMLParser(parser.HTMLParser):
53 """Python doesn't have a validating html parser in its stdlib, so
54 we're using an internal queue to track all the opening/closing tags,
55 and hoping for the best.
56 """
57 tag_blacklist = {'doctitle', 'meta'} # everything is lowercase
58 def __init__(self, filename):
59 super().__init__()
60 self.filename = filename
61 self.__textrepr = ''
62 self.__meta = {}
63 self.__validation_queue = []
64 # We're using a counter instead of a boolean to handle nested tags
65 self.__in_dangerous_tag = 0
66
67 def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
68 self.__validation_queue.append(tag)
69 if tag in self.tag_blacklist:
70 self.__in_dangerous_tag += 1
71 return
72
73 if self.__in_dangerous_tag == 0:
74 self.__textrepr += self.get_starttag_text()
75
76 def handle_endtag(self, tag: str):
77 if not self.__validation_queue:
78 raise ValueError("The closing tag %s doesn't have a corresponding "
79 "opening one in %s." % (tag, self.filename))
80
81 previous_tag = self.__validation_queue.pop()
82 if tag != previous_tag:
83 raise ValueError("The closing tag %s doesn't match the previous "
84 "tag %s in %s" %
85 (tag, previous_tag, self.filename))
86 elif tag in self.tag_blacklist:
87 self.__in_dangerous_tag -= 1
88 return
89
90 if self.__in_dangerous_tag == 0:
91 # There is no `get_endtag_text()` method :/
92 self.__textrepr += '</' + tag + '>\n'
93
94 def handle_data(self, data: str):
95 if self.__in_dangerous_tag == 0 and data.strip():
96 self.__textrepr += data
97
98 def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
99 if tag in self.tag_blacklist:
100 meta = {k:v for k, v in attrs}
101 name = meta.get('name', 'harmful metadata')
102 content = meta.get('content', 'harmful data')
103 self.__meta[name] = content
104 else:
105 if self.__in_dangerous_tag == 0:
106 self.__textrepr += self.get_starttag_text()
107
108 def remove_all(self, output_filename: str) -> bool:
109 if self.__validation_queue:
110 raise ValueError("Some tags (%s) were left unclosed in %s" % (
111 ', '.join(self.__validation_queue),
112 self.filename))
113 with open(output_filename, 'w', encoding='utf-8') as f:
114 f.write(self.__textrepr)
115 return True
116
117 def get_meta(self) -> Dict[str, Any]:
118 if self.__validation_queue:
119 raise ValueError("Some tags (%s) were left unclosed in %s" % (
120 ', '.join(self.__validation_queue),
121 self.filename))
122 return self.__meta