summaryrefslogtreecommitdiff
path: root/libmat2
diff options
context:
space:
mode:
Diffstat (limited to 'libmat2')
-rw-r--r--libmat2/epub.py46
-rw-r--r--libmat2/web.py87
2 files changed, 104 insertions, 29 deletions
diff --git a/libmat2/epub.py b/libmat2/epub.py
index 09b7937..d385465 100644
--- a/libmat2/epub.py
+++ b/libmat2/epub.py
@@ -1,11 +1,13 @@
1import logging 1import logging
2import re 2import re
3import uuid
3import xml.etree.ElementTree as ET # type: ignore 4import xml.etree.ElementTree as ET # type: ignore
4 5
5from . import archive, office 6from . import archive, office
6 7
7class EPUBParser(archive.ArchiveBasedAbstractParser): 8class EPUBParser(archive.ArchiveBasedAbstractParser):
8 mimetypes = {'application/epub+zip', } 9 mimetypes = {'application/epub+zip', }
10 metadata_namespace = '{http://purl.org/dc/elements/1.1/}'
9 11
10 def __init__(self, filename): 12 def __init__(self, filename):
11 super().__init__(filename) 13 super().__init__(filename)
@@ -14,6 +16,7 @@ class EPUBParser(archive.ArchiveBasedAbstractParser):
14 'mimetype', 16 'mimetype',
15 'OEBPS/content.opf', 17 'OEBPS/content.opf',
16 })) 18 }))
19 self.uniqid = uuid.uuid4()
17 20
18 def _specific_get_meta(self, full_path, file_path): 21 def _specific_get_meta(self, full_path, file_path):
19 if file_path != 'OEBPS/content.opf': 22 if file_path != 'OEBPS/content.opf':
@@ -25,23 +28,52 @@ class EPUBParser(archive.ArchiveBasedAbstractParser):
25 f.read(), re.I|re.M) 28 f.read(), re.I|re.M)
26 return {k:v for (k, v) in results} 29 return {k:v for (k, v) in results}
27 except (TypeError, UnicodeDecodeError): 30 except (TypeError, UnicodeDecodeError):
28 # We didn't manage to parse the xml file
29 return {file_path: 'harmful content', } 31 return {file_path: 'harmful content', }
30 32
31 def _specific_cleanup(self, full_path: str): 33 def _specific_cleanup(self, full_path: str):
32 if not full_path.endswith('OEBPS/content.opf'): 34 if full_path.endswith('OEBPS/content.opf'):
33 return True 35 return self.__handle_contentopf(full_path)
36 elif full_path.endswith('OEBPS/toc.ncx'):
37 return self.__handle_tocncx(full_path)
38 return True
39
40 def __handle_tocncx(self, full_path: str):
41 try:
42 tree, namespace = office._parse_xml(full_path)
43 except ET.ParseError: # pragma: nocover
44 logging.error("Unable to parse %s in %s.", full_path, self.filename)
45 return False
46
47 for item in tree.iterfind('.//', namespace): # pragma: nocover
48 if item.tag.strip().lower().endswith('head'):
49 item.clear()
50 ET.SubElement(item, 'meta', attrib={'name': '', 'content': ''})
51 break
52 tree.write(full_path, xml_declaration=True, encoding='utf-8',
53 short_empty_elements=False)
54 return True
34 55
56 def __handle_contentopf(self, full_path: str):
35 try: 57 try:
36 tree, namespace = office._parse_xml(full_path) 58 tree, namespace = office._parse_xml(full_path)
37 except ET.ParseError: 59 except ET.ParseError:
38 logging.error("Unable to parse %s in %s.", full_path, self.filename) 60 logging.error("Unable to parse %s in %s.", full_path, self.filename)
39 return False 61 return False
40 parent_map = {c:p for p in tree.iter() for c in p}
41 62
42 for item in tree.iterfind('.//', namespace): 63 for item in tree.iterfind('.//', namespace): # pragma: nocover
43 if item.tag.strip().lower().endswith('metadata'): 64 if item.tag.strip().lower().endswith('metadata'):
44 parent_map[item].remove(item) 65 item.clear()
66
67 # item with mandatory content
68 uniqid = ET.Element(self.metadata_namespace + 'identifier')
69 uniqid.text = str(self.uniqid)
70 uniqid.set('id', 'id')
71 item.append(uniqid)
72
73 # items without mandatory content
74 for name in {'language', 'title'}:
75 uniqid = ET.Element(self.metadata_namespace + name)
76 item.append(uniqid)
45 break # there is only a single <metadata> block 77 break # there is only a single <metadata> block
46 tree.write(full_path, xml_declaration=True) 78 tree.write(full_path, xml_declaration=True, encoding='utf-8')
47 return True 79 return True
diff --git a/libmat2/web.py b/libmat2/web.py
index c11b47d..067f5f9 100644
--- a/libmat2/web.py
+++ b/libmat2/web.py
@@ -1,10 +1,13 @@
1from html import parser 1from html import parser, escape
2from typing import Dict, Any, List, Tuple 2from typing import Dict, Any, List, Tuple, Set
3import re 3import re
4import string 4import string
5 5
6from . import abstract 6from . import abstract
7 7
8assert Set
9
10# pylint: disable=too-many-instance-attributes
8 11
9class CSSParser(abstract.AbstractParser): 12class CSSParser(abstract.AbstractParser):
10 """There is no such thing as metadata in CSS files, 13 """There is no such thing as metadata in CSS files,
@@ -33,11 +36,16 @@ class CSSParser(abstract.AbstractParser):
33 return metadata 36 return metadata
34 37
35 38
36class HTMLParser(abstract.AbstractParser): 39class AbstractHTMLParser(abstract.AbstractParser):
37 mimetypes = {'text/html', 'application/x-dtbncx+xml', } 40 tags_blacklist = set() # type: Set[str]
41 # In some html/xml based formats some tags are mandatory,
42 # so we're keeping them, but are discarding their contents
43 tags_required_blacklist = set() # type: Set[str]
44
38 def __init__(self, filename): 45 def __init__(self, filename):
39 super().__init__(filename) 46 super().__init__(filename)
40 self.__parser = _HTMLParser(self.filename) 47 self.__parser = _HTMLParser(self.filename, self.tags_blacklist,
48 self.tags_required_blacklist)
41 with open(filename, encoding='utf-8') as f: 49 with open(filename, encoding='utf-8') as f:
42 self.__parser.feed(f.read()) 50 self.__parser.feed(f.read())
43 self.__parser.close() 51 self.__parser.close()
@@ -49,29 +57,50 @@ class HTMLParser(abstract.AbstractParser):
49 return self.__parser.remove_all(self.output_filename) 57 return self.__parser.remove_all(self.output_filename)
50 58
51 59
60class HTMLParser(AbstractHTMLParser):
61 mimetypes = {'text/html', }
62 tags_blacklist = {'meta', }
63 tags_required_blacklist = {'title', }
64
65
66class DTBNCXParser(AbstractHTMLParser):
67 mimetypes = {'application/x-dtbncx+xml', }
68 tags_required_blacklist = {'title', 'doctitle', 'meta'}
69
70
52class _HTMLParser(parser.HTMLParser): 71class _HTMLParser(parser.HTMLParser):
53 """Python doesn't have a validating html parser in its stdlib, so 72 """Python doesn't have a validating html parser in its stdlib, so
54 we're using an internal queue to track all the opening/closing tags, 73 we're using an internal queue to track all the opening/closing tags,
55 and hoping for the best. 74 and hoping for the best.
56 """ 75 """
57 tag_blacklist = {'doctitle', 'meta', 'title'} # everything is lowercase 76 def __init__(self, filename, blacklisted_tags, required_blacklisted_tags):
58 def __init__(self, filename):
59 super().__init__() 77 super().__init__()
60 self.filename = filename 78 self.filename = filename
61 self.__textrepr = '' 79 self.__textrepr = ''
62 self.__meta = {} 80 self.__meta = {}
63 self.__validation_queue = [] 81 self.__validation_queue = [] # type: List[str]
64 # We're using a counter instead of a boolean to handle nested tags 82 # We're using counters instead of booleans, to handle nested tags
83 self.__in_dangerous_but_required_tag = 0
65 self.__in_dangerous_tag = 0 84 self.__in_dangerous_tag = 0
66 85
86 if required_blacklisted_tags & blacklisted_tags: # pragma: nocover
87 raise ValueError("There is an overlap between %s and %s" % (
88 required_blacklisted_tags, blacklisted_tags))
89 self.tag_required_blacklist = required_blacklisted_tags
90 self.tag_blacklist = blacklisted_tags
91
67 def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]): 92 def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
68 self.__validation_queue.append(tag) 93 original_tag = self.get_starttag_text()
94 self.__validation_queue.append(original_tag)
95
96 if tag in self.tag_required_blacklist:
97 self.__in_dangerous_but_required_tag += 1
69 if tag in self.tag_blacklist: 98 if tag in self.tag_blacklist:
70 self.__in_dangerous_tag += 1 99 self.__in_dangerous_tag += 1
71 return
72 100
73 if self.__in_dangerous_tag == 0: 101 if self.__in_dangerous_tag == 0:
74 self.__textrepr += self.get_starttag_text() 102 if self.__in_dangerous_but_required_tag <= 1:
103 self.__textrepr += original_tag
75 104
76 def handle_endtag(self, tag: str): 105 def handle_endtag(self, tag: str):
77 if not self.__validation_queue: 106 if not self.__validation_queue:
@@ -79,29 +108,43 @@ class _HTMLParser(parser.HTMLParser):
79 "opening one in %s." % (tag, self.filename)) 108 "opening one in %s." % (tag, self.filename))
80 109
81 previous_tag = self.__validation_queue.pop() 110 previous_tag = self.__validation_queue.pop()
82 if tag != previous_tag: 111 previous_tag = previous_tag[1:-1] # remove < and >
112 previous_tag = previous_tag.split(' ')[0] # remove attributes
113 if tag != previous_tag.lower():
83 raise ValueError("The closing tag %s doesn't match the previous " 114 raise ValueError("The closing tag %s doesn't match the previous "
84 "tag %s in %s" % 115 "tag %s in %s" %
85 (tag, previous_tag, self.filename)) 116 (tag, previous_tag, self.filename))
86 elif tag in self.tag_blacklist:
87 self.__in_dangerous_tag -= 1
88 return
89 117
90 if self.__in_dangerous_tag == 0: 118 if self.__in_dangerous_tag == 0:
91 # There is no `get_endtag_text()` method :/ 119 if self.__in_dangerous_but_required_tag <= 1:
92 self.__textrepr += '</' + tag + '>\n' 120 # There is no `get_endtag_text()` method :/
121 self.__textrepr += '</' + previous_tag + '>'
122
123 if tag in self.tag_required_blacklist:
124 self.__in_dangerous_but_required_tag -= 1
125 elif tag in self.tag_blacklist:
126 self.__in_dangerous_tag -= 1
93 127
94 def handle_data(self, data: str): 128 def handle_data(self, data: str):
95 if self.__in_dangerous_tag == 0 and data.strip(): 129 if self.__in_dangerous_but_required_tag == 0:
96 self.__textrepr += data 130 if self.__in_dangerous_tag == 0:
131 if data.strip():
132 self.__textrepr += escape(data)
97 133
98 def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]): 134 def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
99 if tag in self.tag_blacklist: 135 if tag in self.tag_required_blacklist | self.tag_blacklist:
100 meta = {k:v for k, v in attrs} 136 meta = {k:v for k, v in attrs}
101 name = meta.get('name', 'harmful metadata') 137 name = meta.get('name', 'harmful metadata')
102 content = meta.get('content', 'harmful data') 138 content = meta.get('content', 'harmful data')
103 self.__meta[name] = content 139 self.__meta[name] = content
104 else: 140
141 if self.__in_dangerous_tag != 0:
142 return
143 elif tag in self.tag_required_blacklist:
144 self.__textrepr += '<' + tag + ' />'
145 return
146
147 if self.__in_dangerous_but_required_tag == 0:
105 if self.__in_dangerous_tag == 0: 148 if self.__in_dangerous_tag == 0:
106 self.__textrepr += self.get_starttag_text() 149 self.__textrepr += self.get_starttag_text()
107 150