summary | refs | log | tree | commit | diff
path: root/libmat/office.py
diff options
context:
space:
mode:
Diffstat (limited to 'libmat/office.py')
-rw-r--r-- libmat/office.py 55
1 files changed, 29 insertions, 26 deletions
diff --git a/libmat/office.py b/libmat/office.py
index d020c46..bd4bd97 100644
--- a/libmat/office.py
+++ b/libmat/office.py
@@ -1,6 +1,6 @@
1''' Care about office's formats 1""" Care about office's formats
2 2
3''' 3"""
4 4
5import logging 5import logging
6import os 6import os
@@ -21,14 +21,14 @@ import archive
21 21
22 22
23class OpenDocumentStripper(archive.TerminalZipStripper): 23class OpenDocumentStripper(archive.TerminalZipStripper):
24 ''' An open document file is a zip, with xml file into. 24 """ An open document file is a zip, with xml file into.
25 The one that interest us is meta.xml 25 The one that interest us is meta.xml
26 ''' 26 """
27 27
28 def get_meta(self): 28 def get_meta(self):
29 ''' Return a dict with all the meta of the file by 29 """ Return a dict with all the meta of the file by
30 trying to read the meta.xml file. 30 trying to read the meta.xml file.
31 ''' 31 """
32 metadata = super(OpenDocumentStripper, self).get_meta() 32 metadata = super(OpenDocumentStripper, self).get_meta()
33 zipin = zipfile.ZipFile(self.filename, 'r') 33 zipin = zipfile.ZipFile(self.filename, 'r')
34 try: 34 try:
@@ -49,13 +49,13 @@ class OpenDocumentStripper(archive.TerminalZipStripper):
49 return metadata 49 return metadata
50 50
51 def remove_all(self): 51 def remove_all(self):
52 ''' Removes metadata 52 """ Removes metadata
53 ''' 53 """
54 return super(OpenDocumentStripper, self).remove_all(ending_blacklist=['meta.xml']) 54 return super(OpenDocumentStripper, self).remove_all(ending_blacklist=['meta.xml'])
55 55
56 def is_clean(self): 56 def is_clean(self):
57 ''' Check if the file is clean from harmful metadatas 57 """ Check if the file is clean from harmful metadatas
58 ''' 58 """
59 clean_super = super(OpenDocumentStripper, self).is_clean() 59 clean_super = super(OpenDocumentStripper, self).is_clean()
60 if clean_super is False: 60 if clean_super is False:
61 return False 61 return False
@@ -70,20 +70,21 @@ class OpenDocumentStripper(archive.TerminalZipStripper):
70 70
71 71
72class OpenXmlStripper(archive.TerminalZipStripper): 72class OpenXmlStripper(archive.TerminalZipStripper):
73 ''' Represent an office openxml document, which is like 73 """ Represent an office openxml document, which is like
74 an opendocument format, with some tricky stuff added. 74 an opendocument format, with some tricky stuff added.
75 It contains mostly xml, but can have media blobs, crap, ... 75 It contains mostly xml, but can have media blobs, crap, ...
76 (I don't like this format.) 76 (I don't like this format.)
77 ''' 77 """
78
78 def remove_all(self): 79 def remove_all(self):
79 return super(OpenXmlStripper, self).remove_all( 80 return super(OpenXmlStripper, self).remove_all(
80 beginning_blacklist=('docProps/'), whitelist=('.rels')) 81 beginning_blacklist='docProps/', whitelist='.rels')
81 82
82 def is_clean(self): 83 def is_clean(self):
83 ''' Check if the file is clean from harmful metadatas. 84 """ Check if the file is clean from harmful metadatas.
84 This implementation is faster than something like 85 This implementation is faster than something like
85 "return this.get_meta() == {}". 86 "return this.get_meta() == {}".
86 ''' 87 """
87 clean_super = super(OpenXmlStripper, self).is_clean() 88 clean_super = super(OpenXmlStripper, self).is_clean()
88 if clean_super is False: 89 if clean_super is False:
89 return False 90 return False
@@ -96,8 +97,8 @@ class OpenXmlStripper(archive.TerminalZipStripper):
96 return True 97 return True
97 98
98 def get_meta(self): 99 def get_meta(self):
99 ''' Return a dict with all the meta of the file 100 """ Return a dict with all the meta of the file
100 ''' 101 """
101 metadata = super(OpenXmlStripper, self).get_meta() 102 metadata = super(OpenXmlStripper, self).get_meta()
102 103
103 zipin = zipfile.ZipFile(self.filename, 'r') 104 zipin = zipfile.ZipFile(self.filename, 'r')
@@ -109,8 +110,9 @@ class OpenXmlStripper(archive.TerminalZipStripper):
109 110
110 111
111class PdfStripper(parser.GenericParser): 112class PdfStripper(parser.GenericParser):
112 ''' Represent a PDF file 113 """ Represent a PDF file
113 ''' 114 """
115
114 def __init__(self, filename, parser, mime, backup, is_writable, **kwargs): 116 def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
115 super(PdfStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs) 117 super(PdfStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
116 self.uri = 'file://' + os.path.abspath(self.filename) 118 self.uri = 'file://' + os.path.abspath(self.filename)
@@ -121,16 +123,16 @@ class PdfStripper(parser.GenericParser):
121 self.pdf_quality = False 123 self.pdf_quality = False
122 124
123 self.meta_list = frozenset(['title', 'author', 'subject', 125 self.meta_list = frozenset(['title', 'author', 'subject',
124 'keywords', 'creator', 'producer', 'metadata']) 126 'keywords', 'creator', 'producer', 'metadata'])
125 127
126 def is_clean(self): 128 def is_clean(self):
127 ''' Check if the file is clean from harmful metadatas 129 """ Check if the file is clean from harmful metadatas
128 ''' 130 """
129 document = Poppler.Document.new_from_file(self.uri, self.password) 131 document = Poppler.Document.new_from_file(self.uri, self.password)
130 return not any(document.get_property(key) for key in self.meta_list) 132 return not any(document.get_property(key) for key in self.meta_list)
131 133
132 def remove_all(self): 134 def remove_all(self):
133 ''' Opening the PDF with poppler, then doing a render 135 """ Opening the PDF with poppler, then doing a render
134 on a cairo pdfsurface for each pages. 136 on a cairo pdfsurface for each pages.
135 137
136 http://cairographics.org/documentation/pycairo/2/ 138 http://cairographics.org/documentation/pycairo/2/
@@ -138,7 +140,7 @@ class PdfStripper(parser.GenericParser):
138 The use of an intermediate tempfile is necessary because 140 The use of an intermediate tempfile is necessary because
139 python-cairo segfaults on unicode. 141 python-cairo segfaults on unicode.
140 See http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=699457 142 See http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=699457
141 ''' 143 """
142 document = Poppler.Document.new_from_file(self.uri, self.password) 144 document = Poppler.Document.new_from_file(self.uri, self.password)
143 try: 145 try:
144 output = tempfile.mkstemp()[1] 146 output = tempfile.mkstemp()[1]
@@ -169,6 +171,7 @@ class PdfStripper(parser.GenericParser):
169 171
170 try: 172 try:
171 import pdfrw # For now, poppler cannot write meta, so we must use pdfrw 173 import pdfrw # For now, poppler cannot write meta, so we must use pdfrw
174
172 logging.debug('Removing %s\'s superficial metadata' % self.filename) 175 logging.debug('Removing %s\'s superficial metadata' % self.filename)
173 trailer = pdfrw.PdfReader(self.output) 176 trailer = pdfrw.PdfReader(self.output)
174 trailer.Info.Producer = None 177 trailer.Info.Producer = None
@@ -183,8 +186,8 @@ class PdfStripper(parser.GenericParser):
183 return True 186 return True
184 187
185 def get_meta(self): 188 def get_meta(self):
186 ''' Return a dict with all the meta of the file 189 """ Return a dict with all the meta of the file
187 ''' 190 """
188 document = Poppler.Document.new_from_file(self.uri, self.password) 191 document = Poppler.Document.new_from_file(self.uri, self.password)
189 metadata = {} 192 metadata = {}
190 for key in self.meta_list: 193 for key in self.meta_list: