summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- libmat2/epub.py 46
-rw-r--r-- libmat2/web.py 87
-rw-r--r-- tests/test_corrupted_files.py 7
-rw-r--r-- tests/test_libmat2.py 6
4 files changed, 114 insertions, 32 deletions
diff --git a/libmat2/epub.py b/libmat2/epub.py
index 09b7937..d385465 100644
--- a/libmat2/epub.py
+++ b/libmat2/epub.py
@@ -1,11 +1,13 @@
1import logging 1import logging
2import re 2import re
3import uuid
3import xml.etree.ElementTree as ET # type: ignore 4import xml.etree.ElementTree as ET # type: ignore
4 5
5from . import archive, office 6from . import archive, office
6 7
7class EPUBParser(archive.ArchiveBasedAbstractParser): 8class EPUBParser(archive.ArchiveBasedAbstractParser):
8 mimetypes = {'application/epub+zip', } 9 mimetypes = {'application/epub+zip', }
10 metadata_namespace = '{http://purl.org/dc/elements/1.1/}'
9 11
10 def __init__(self, filename): 12 def __init__(self, filename):
11 super().__init__(filename) 13 super().__init__(filename)
@@ -14,6 +16,7 @@ class EPUBParser(archive.ArchiveBasedAbstractParser):
14 'mimetype', 16 'mimetype',
15 'OEBPS/content.opf', 17 'OEBPS/content.opf',
16 })) 18 }))
19 self.uniqid = uuid.uuid4()
17 20
18 def _specific_get_meta(self, full_path, file_path): 21 def _specific_get_meta(self, full_path, file_path):
19 if file_path != 'OEBPS/content.opf': 22 if file_path != 'OEBPS/content.opf':
@@ -25,23 +28,52 @@ class EPUBParser(archive.ArchiveBasedAbstractParser):
25 f.read(), re.I|re.M) 28 f.read(), re.I|re.M)
26 return {k:v for (k, v) in results} 29 return {k:v for (k, v) in results}
27 except (TypeError, UnicodeDecodeError): 30 except (TypeError, UnicodeDecodeError):
28 # We didn't manage to parse the xml file
29 return {file_path: 'harmful content', } 31 return {file_path: 'harmful content', }
30 32
31 def _specific_cleanup(self, full_path: str): 33 def _specific_cleanup(self, full_path: str):
32 if not full_path.endswith('OEBPS/content.opf'): 34 if full_path.endswith('OEBPS/content.opf'):
33 return True 35 return self.__handle_contentopf(full_path)
36 elif full_path.endswith('OEBPS/toc.ncx'):
37 return self.__handle_tocncx(full_path)
38 return True
39
40 def __handle_tocncx(self, full_path: str):
41 try:
42 tree, namespace = office._parse_xml(full_path)
43 except ET.ParseError: # pragma: nocover
44 logging.error("Unable to parse %s in %s.", full_path, self.filename)
45 return False
46
47 for item in tree.iterfind('.//', namespace): # pragma: nocover
48 if item.tag.strip().lower().endswith('head'):
49 item.clear()
50 ET.SubElement(item, 'meta', attrib={'name': '', 'content': ''})
51 break
52 tree.write(full_path, xml_declaration=True, encoding='utf-8',
53 short_empty_elements=False)
54 return True
34 55
56 def __handle_contentopf(self, full_path: str):
35 try: 57 try:
36 tree, namespace = office._parse_xml(full_path) 58 tree, namespace = office._parse_xml(full_path)
37 except ET.ParseError: 59 except ET.ParseError:
38 logging.error("Unable to parse %s in %s.", full_path, self.filename) 60 logging.error("Unable to parse %s in %s.", full_path, self.filename)
39 return False 61 return False
40 parent_map = {c:p for p in tree.iter() for c in p}
41 62
42 for item in tree.iterfind('.//', namespace): 63 for item in tree.iterfind('.//', namespace): # pragma: nocover
43 if item.tag.strip().lower().endswith('metadata'): 64 if item.tag.strip().lower().endswith('metadata'):
44 parent_map[item].remove(item) 65 item.clear()
66
67 # item with mandatory content
68 uniqid = ET.Element(self.metadata_namespace + 'identifier')
69 uniqid.text = str(self.uniqid)
70 uniqid.set('id', 'id')
71 item.append(uniqid)
72
73 # items without mandatory content
74 for name in {'language', 'title'}:
75 uniqid = ET.Element(self.metadata_namespace + name)
76 item.append(uniqid)
45 break # there is only a single <metadata> block 77 break # there is only a single <metadata> block
46 tree.write(full_path, xml_declaration=True) 78 tree.write(full_path, xml_declaration=True, encoding='utf-8')
47 return True 79 return True
diff --git a/libmat2/web.py b/libmat2/web.py
index c11b47d..067f5f9 100644
--- a/libmat2/web.py
+++ b/libmat2/web.py
@@ -1,10 +1,13 @@
1from html import parser 1from html import parser, escape
2from typing import Dict, Any, List, Tuple 2from typing import Dict, Any, List, Tuple, Set
3import re 3import re
4import string 4import string
5 5
6from . import abstract 6from . import abstract
7 7
8assert Set
9
10# pylint: disable=too-many-instance-attributes
8 11
9class CSSParser(abstract.AbstractParser): 12class CSSParser(abstract.AbstractParser):
10 """There is no such thing as metadata in CSS files, 13 """There is no such thing as metadata in CSS files,
@@ -33,11 +36,16 @@ class CSSParser(abstract.AbstractParser):
33 return metadata 36 return metadata
34 37
35 38
36class HTMLParser(abstract.AbstractParser): 39class AbstractHTMLParser(abstract.AbstractParser):
37 mimetypes = {'text/html', 'application/x-dtbncx+xml', } 40 tags_blacklist = set() # type: Set[str]
41 # In some html/xml based formats some tags are mandatory,
42 # so we're keeping them, but are discarding their contents
43 tags_required_blacklist = set() # type: Set[str]
44
38 def __init__(self, filename): 45 def __init__(self, filename):
39 super().__init__(filename) 46 super().__init__(filename)
40 self.__parser = _HTMLParser(self.filename) 47 self.__parser = _HTMLParser(self.filename, self.tags_blacklist,
48 self.tags_required_blacklist)
41 with open(filename, encoding='utf-8') as f: 49 with open(filename, encoding='utf-8') as f:
42 self.__parser.feed(f.read()) 50 self.__parser.feed(f.read())
43 self.__parser.close() 51 self.__parser.close()
@@ -49,29 +57,50 @@ class HTMLParser(abstract.AbstractParser):
49 return self.__parser.remove_all(self.output_filename) 57 return self.__parser.remove_all(self.output_filename)
50 58
51 59
60class HTMLParser(AbstractHTMLParser):
61 mimetypes = {'text/html', }
62 tags_blacklist = {'meta', }
63 tags_required_blacklist = {'title', }
64
65
66class DTBNCXParser(AbstractHTMLParser):
67 mimetypes = {'application/x-dtbncx+xml', }
68 tags_required_blacklist = {'title', 'doctitle', 'meta'}
69
70
52class _HTMLParser(parser.HTMLParser): 71class _HTMLParser(parser.HTMLParser):
53 """Python doesn't have a validating html parser in its stdlib, so 72 """Python doesn't have a validating html parser in its stdlib, so
54 we're using an internal queue to track all the opening/closing tags, 73 we're using an internal queue to track all the opening/closing tags,
55 and hoping for the best. 74 and hoping for the best.
56 """ 75 """
57 tag_blacklist = {'doctitle', 'meta', 'title'} # everything is lowercase 76 def __init__(self, filename, blacklisted_tags, required_blacklisted_tags):
58 def __init__(self, filename):
59 super().__init__() 77 super().__init__()
60 self.filename = filename 78 self.filename = filename
61 self.__textrepr = '' 79 self.__textrepr = ''
62 self.__meta = {} 80 self.__meta = {}
63 self.__validation_queue = [] 81 self.__validation_queue = [] # type: List[str]
64 # We're using a counter instead of a boolean to handle nested tags 82 # We're using counters instead of booleans, to handle nested tags
83 self.__in_dangerous_but_required_tag = 0
65 self.__in_dangerous_tag = 0 84 self.__in_dangerous_tag = 0
66 85
86 if required_blacklisted_tags & blacklisted_tags: # pragma: nocover
87 raise ValueError("There is an overlap between %s and %s" % (
88 required_blacklisted_tags, blacklisted_tags))
89 self.tag_required_blacklist = required_blacklisted_tags
90 self.tag_blacklist = blacklisted_tags
91
67 def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]): 92 def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
68 self.__validation_queue.append(tag) 93 original_tag = self.get_starttag_text()
94 self.__validation_queue.append(original_tag)
95
96 if tag in self.tag_required_blacklist:
97 self.__in_dangerous_but_required_tag += 1
69 if tag in self.tag_blacklist: 98 if tag in self.tag_blacklist:
70 self.__in_dangerous_tag += 1 99 self.__in_dangerous_tag += 1
71 return
72 100
73 if self.__in_dangerous_tag == 0: 101 if self.__in_dangerous_tag == 0:
74 self.__textrepr += self.get_starttag_text() 102 if self.__in_dangerous_but_required_tag <= 1:
103 self.__textrepr += original_tag
75 104
76 def handle_endtag(self, tag: str): 105 def handle_endtag(self, tag: str):
77 if not self.__validation_queue: 106 if not self.__validation_queue:
@@ -79,29 +108,43 @@ class _HTMLParser(parser.HTMLParser):
79 "opening one in %s." % (tag, self.filename)) 108 "opening one in %s." % (tag, self.filename))
80 109
81 previous_tag = self.__validation_queue.pop() 110 previous_tag = self.__validation_queue.pop()
82 if tag != previous_tag: 111 previous_tag = previous_tag[1:-1] # remove < and >
112 previous_tag = previous_tag.split(' ')[0] # remove attributes
113 if tag != previous_tag.lower():
83 raise ValueError("The closing tag %s doesn't match the previous " 114 raise ValueError("The closing tag %s doesn't match the previous "
84 "tag %s in %s" % 115 "tag %s in %s" %
85 (tag, previous_tag, self.filename)) 116 (tag, previous_tag, self.filename))
86 elif tag in self.tag_blacklist:
87 self.__in_dangerous_tag -= 1
88 return
89 117
90 if self.__in_dangerous_tag == 0: 118 if self.__in_dangerous_tag == 0:
91 # There is no `get_endtag_text()` method :/ 119 if self.__in_dangerous_but_required_tag <= 1:
92 self.__textrepr += '</' + tag + '>\n' 120 # There is no `get_endtag_text()` method :/
121 self.__textrepr += '</' + previous_tag + '>'
122
123 if tag in self.tag_required_blacklist:
124 self.__in_dangerous_but_required_tag -= 1
125 elif tag in self.tag_blacklist:
126 self.__in_dangerous_tag -= 1
93 127
94 def handle_data(self, data: str): 128 def handle_data(self, data: str):
95 if self.__in_dangerous_tag == 0 and data.strip(): 129 if self.__in_dangerous_but_required_tag == 0:
96 self.__textrepr += data 130 if self.__in_dangerous_tag == 0:
131 if data.strip():
132 self.__textrepr += escape(data)
97 133
98 def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]): 134 def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
99 if tag in self.tag_blacklist: 135 if tag in self.tag_required_blacklist | self.tag_blacklist:
100 meta = {k:v for k, v in attrs} 136 meta = {k:v for k, v in attrs}
101 name = meta.get('name', 'harmful metadata') 137 name = meta.get('name', 'harmful metadata')
102 content = meta.get('content', 'harmful data') 138 content = meta.get('content', 'harmful data')
103 self.__meta[name] = content 139 self.__meta[name] = content
104 else: 140
141 if self.__in_dangerous_tag != 0:
142 return
143 elif tag in self.tag_required_blacklist:
144 self.__textrepr += '<' + tag + ' />'
145 return
146
147 if self.__in_dangerous_but_required_tag == 0:
105 if self.__in_dangerous_tag == 0: 148 if self.__in_dangerous_tag == 0:
106 self.__textrepr += self.get_starttag_text() 149 self.__textrepr += self.get_starttag_text()
107 150
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py
index 53c856a..b2cec00 100644
--- a/tests/test_corrupted_files.py
+++ b/tests/test_corrupted_files.py
@@ -253,13 +253,13 @@ class TestCorruptedFiles(unittest.TestCase):
253 os.remove('./tests/data/clean.cleaned.html') 253 os.remove('./tests/data/clean.cleaned.html')
254 254
255 with open('./tests/data/clean.html', 'w') as f: 255 with open('./tests/data/clean.html', 'w') as f:
256 f.write('</close>') 256 f.write('</meta>')
257 with self.assertRaises(ValueError): 257 with self.assertRaises(ValueError):
258 web.HTMLParser('./tests/data/clean.html') 258 web.HTMLParser('./tests/data/clean.html')
259 os.remove('./tests/data/clean.html') 259 os.remove('./tests/data/clean.html')
260 260
261 with open('./tests/data/clean.html', 'w') as f: 261 with open('./tests/data/clean.html', 'w') as f:
262 f.write('<notclosed>') 262 f.write('<meta><a>test</a><set/></meta><title></title><meta>')
263 p = web.HTMLParser('./tests/data/clean.html') 263 p = web.HTMLParser('./tests/data/clean.html')
264 with self.assertRaises(ValueError): 264 with self.assertRaises(ValueError):
265 p.get_meta() 265 p.get_meta()
@@ -269,6 +269,9 @@ class TestCorruptedFiles(unittest.TestCase):
269 os.remove('./tests/data/clean.html') 269 os.remove('./tests/data/clean.html')
270 270
271 with open('./tests/data/clean.html', 'w') as f: 271 with open('./tests/data/clean.html', 'w') as f:
272 f.write('<meta><meta/></meta>')
273 f.write('<title><title>pouet</title></title>')
274 f.write('<title><mysupertag/></title>')
272 f.write('<doctitle><br/></doctitle><br/><notclosed>') 275 f.write('<doctitle><br/></doctitle><br/><notclosed>')
273 p = web.HTMLParser('./tests/data/clean.html') 276 p = web.HTMLParser('./tests/data/clean.html')
274 with self.assertRaises(ValueError): 277 with self.assertRaises(ValueError):
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 249c56d..f4b1890 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -3,6 +3,7 @@
3import unittest 3import unittest
4import shutil 4import shutil
5import os 5import os
6import re
6import zipfile 7import zipfile
7 8
8from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless 9from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
@@ -644,7 +645,10 @@ class TestCleaning(unittest.TestCase):
644 self.assertTrue(ret) 645 self.assertTrue(ret)
645 646
646 p = epub.EPUBParser('./tests/data/clean.cleaned.epub') 647 p = epub.EPUBParser('./tests/data/clean.cleaned.epub')
647 self.assertEqual(p.get_meta(), {}) 648 meta = p.get_meta()
649 res = re.match(meta['OEBPS/content.opf']['metadata'], '^<dc:identifier>[0-9a-f-]+</dc:identifier><dc:title /><dc:language />$')
650 self.assertNotEqual(res, False)
651
648 self.assertTrue(p.remove_all()) 652 self.assertTrue(p.remove_all())
649 653
650 os.remove('./tests/data/clean.epub') 654 os.remove('./tests/data/clean.epub')