summaryrefslogtreecommitdiff
path: root/libmat/office.py
diff options
context:
space:
mode:
authorjvoisin2014-06-08 13:39:18 +0200
committerjvoisin2014-06-08 13:39:18 +0200
commitaf36529554c39a2eefcc2c8723715e2d25b401b8 (patch)
treef54b964520bab44d1dfac725086211eaf22d3763 /libmat/office.py
parentef5a32cfd3c0555ffe5ddf413eeaae61622ebb4b (diff)
Rename the MAT folder to libmat.
This commit fixes some issues for dumb operating systems that don't handle capitalization.
Diffstat (limited to 'libmat/office.py')
-rw-r--r--libmat/office.py191
1 files changed, 191 insertions, 0 deletions
diff --git a/libmat/office.py b/libmat/office.py
new file mode 100644
index 0000000..0ca1ff1
--- /dev/null
+++ b/libmat/office.py
@@ -0,0 +1,191 @@
1''' Care about office's formats
2
3'''
4
5import logging
6import os
7import shutil
8import tempfile
9import xml.dom.minidom as minidom
10import zipfile
11
12try:
13 import cairo
14 from gi.repository import Poppler
15except ImportError:
16 logging.info('office.py loaded without PDF support')
17 pass
18
19import parser
20import archive
21
22
class OpenDocumentStripper(archive.TerminalZipStripper):
    ''' An OpenDocument file is a zip archive containing xml files.
        The one that interests us is meta.xml.
    '''

    def get_meta(self):
        ''' Return a dict with all the meta of the file by
            trying to read the meta.xml file.

            The dict returned by the parent class is extended with one
            entry per child element of the first <office:meta> node:
            the key is the element name without its namespace prefix,
            the value is the concatenation of its text content.
            If the archive has no meta.xml, the parent's dict is
            returned unchanged.
        '''
        metadata = super(OpenDocumentStripper, self).get_meta()

        # The context manager closes the archive on every exit path,
        # including when reading raises.
        with zipfile.ZipFile(self.filename, 'r') as zipin:
            try:
                content = zipin.read('meta.xml')
            except KeyError:  # no meta.xml file found
                logging.debug('%s has no opendocument metadata' % self.filename)
                return metadata

        dom1 = minidom.parseString(content)
        # Slicing instead of indexing: a meta.xml without any
        # <office:meta> element simply yields no extra metadata.
        for meta in dom1.getElementsByTagName('office:meta')[:1]:
            for node in meta.childNodes:
                # Whitespace between elements shows up as text nodes,
                # which have no tagName: skip everything but elements.
                if node.nodeType != node.ELEMENT_NODE:
                    continue
                # The statistics are stored as attributes, not as text,
                # and are not reported.
                if node.tagName == 'meta:document-statistic':
                    continue
                nodename = ''.join(node.nodeName.split(':')[1:])
                # Only character data carries a .data attribute; nested
                # elements are ignored instead of raising.
                metadata[nodename] = ''.join(
                    [child.data for child in node.childNodes
                     if child.nodeType in (child.TEXT_NODE,
                                           child.CDATA_SECTION_NODE)])
        return metadata

    def remove_all(self):
        ''' Remove the metadata by delegating to the parent class,
            asking it to blacklist every member ending with meta.xml.

            Returns whatever the parent's remove_all returns.
        '''
        return super(OpenDocumentStripper, self).remove_all(ending_blacklist=['meta.xml'])

    def is_clean(self):
        ''' Check if the file is clean from harmful metadata.

            Returns False if the parent class reports the file as not
            clean or if the archive still contains a meta.xml member,
            True otherwise.
        '''
        if super(OpenDocumentStripper, self).is_clean() is False:
            return False

        # The archive is closed on both outcomes.
        with zipfile.ZipFile(self.filename, 'r') as zipin:
            return 'meta.xml' not in zipin.namelist()
70
71
class OpenXmlStripper(archive.TerminalZipStripper):
    ''' Represent an office openxml document, which is like
        an opendocument format, with some tricky stuff added.
        It contains mostly xml, but can also embed media blobs
        and other binary content.
    '''
    def remove_all(self):
        ''' Remove the metadata by delegating to the parent class,
            blacklisting members that start with docProps/ and
            whitelisting .rels.

            Returns whatever the parent's remove_all returns.
        '''
        # ('docProps/') and ('.rels') are plain strings, not tuples:
        # parentheses alone do not build a tuple. One-element lists are
        # passed instead, like OpenDocumentStripper does for
        # ending_blacklist.
        # NOTE(review): the parent's remove_all is not visible here;
        # presumably it iterates over these arguments - confirm.
        return super(OpenXmlStripper, self).remove_all(
            beginning_blacklist=['docProps/'], whitelist=['.rels'])

    def is_clean(self):
        ''' Check if the file is clean from harmful metadata.
            This implementation is faster than something like
            "return this.get_meta() == {}".

            Returns False if the parent class reports the file as not
            clean or if any member of the archive lives under
            docProps/, True otherwise.
        '''
        if super(OpenXmlStripper, self).is_clean() is False:
            return False

        # The context manager closes the archive even when a docProps/
        # member is found.
        with zipfile.ZipFile(self.filename, 'r') as zipin:
            return not any(item.startswith('docProps/')
                           for item in zipin.namelist())

    def get_meta(self):
        ''' Return a dict with all the meta of the file.

            The parent's dict is extended with one entry per archive
            member located under docProps/, each flagged with the
            value 'harmful content'.
        '''
        metadata = super(OpenXmlStripper, self).get_meta()

        with zipfile.ZipFile(self.filename, 'r') as zipin:
            for item in zipin.namelist():
                if item.startswith('docProps/'):
                    metadata[item] = 'harmful content'
        return metadata
109
110
class PdfStripper(parser.GenericParser):
    ''' Represent a PDF file
    '''
    def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
        ''' Forward every argument to the parent constructor, then set
            up the PDF-specific state.

            The optional low_pdf_quality keyword argument (False when
            absent) selects the rendering method used by remove_all.
        '''
        super(PdfStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
        # Poppler opens documents by URI, not by path.
        self.uri = 'file://' + os.path.abspath(self.filename)
        self.password = None
        self.pdf_quality = kwargs.get('low_pdf_quality', False)

        # Document properties that are treated as metadata.
        self.meta_list = frozenset(['title', 'author', 'subject',
                                    'keywords', 'creator', 'producer', 'metadata'])

    def is_clean(self):
        ''' Check if the file is clean from harmful metadata.

            Returns False as soon as one property of meta_list has a
            truthy value in the document, True otherwise.
        '''
        document = Poppler.Document.new_from_file(self.uri, self.password)
        return not any(document.get_property(key) for key in self.meta_list)

    def remove_all(self):
        ''' Opening the PDF with poppler, then doing a render
            on a cairo pdfsurface for each page.

            http://cairographics.org/documentation/pycairo/2/

            The use of an intermediate tempfile is necessary because
            python-cairo segfaults on unicode.
            See http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=699457

            Once rendered, pdfrw is used to blank the Producer and
            Creator fields of the output file.

            Returns True on success, False if the rendering or the
            pdfrw pass failed.
        '''
        document = Poppler.Document.new_from_file(self.uri, self.password)
        output = None
        try:
            # mkstemp() hands back an open descriptor along with the
            # path; only the path is needed, so close it right away
            # instead of leaking it.
            handle, output = tempfile.mkstemp()
            os.close(handle)
            page = document.get_page(0)
            # assume that all pages are the same size
            page_width, page_height = page.get_size()
            surface = cairo.PDFSurface(output, page_width, page_height)
            context = cairo.Context(surface)  # context draws on the surface
            logging.debug('PDF rendering of %s' % self.filename)
            for pagenum in range(document.get_n_pages()):
                page = document.get_page(pagenum)
                if self.pdf_quality:
                    page.render(context)  # render the page on context
                else:
                    page.render_for_printing(context)  # render the page on context
                context.show_page()  # draw context on surface
            surface.finish()
            shutil.move(output, self.output)
        except Exception:
            # Exception rather than a bare except, so that
            # KeyboardInterrupt and SystemExit still propagate.
            logging.error('Something went wrong when cleaning %s.' % self.filename)
            # Best-effort removal of the temporary file.
            if output is not None and os.path.exists(output):
                try:
                    os.remove(output)
                except OSError:
                    logging.debug('Unable to remove the temporary file %s' % output)
            return False

        try:
            import pdfrw  # For now, poppler cannot write meta, so we must use pdfrw
        except ImportError:
            logging.error('Unable to remove all metadata from %s, please install pdfrw' % self.output)
            return False

        try:
            logging.debug('Removing %s\'s superficial metadata' % self.filename)
            trailer = pdfrw.PdfReader(self.output)
            # Guard against a file without an Info dictionary.
            if trailer.Info is not None:
                trailer.Info.Producer = None
                trailer.Info.Creator = None
            writer = pdfrw.PdfWriter()
            writer.trailer = trailer
            writer.write(self.output)
            self.do_backup()
        except Exception:
            # pdfrw is installed at this point, so do not blame its absence.
            logging.error('Unable to remove all metadata from %s' % self.output)
            return False
        return True

    def get_meta(self):
        ''' Return a dict with all the meta of the file.

            Only the properties of meta_list that have a truthy value
            in the document are included.
        '''
        document = Poppler.Document.new_from_file(self.uri, self.password)
        metadata = {}
        for key in self.meta_list:
            value = document.get_property(key)
            if value:
                metadata[key] = value
        return metadata