1 files changed, 29 insertions, 26 deletions
diff --git a/libmat/office.py b/libmat/office.py
index d020c46..bd4bd97 100644
--- a/libmat/office.py
+++ b/libmat/office.py
@@ -1,6 +1,6 @@
-''' Care about office's formats
+""" Care about office's formats
-'''
+"""
 import logging
 import os
@@ -21,14 +21,14 @@ import archive
 class OpenDocumentStripper(archive.TerminalZipStripper):
-    ''' An open document file is a zip, with xml file into.
+    """ An open document file is a zip, with xml file into.
        The one that interest us is meta.xml
-    '''
+    """
    def get_meta(self):
-        ''' Return a dict with all the meta of the file by
+        """ Return a dict with all the meta of the file by
            trying to read the meta.xml file.
-        '''
+        """
        metadata = super(OpenDocumentStripper, self).get_meta()
        zipin = zipfile.ZipFile(self.filename, 'r')
        try:
@@ -49,13 +49,13 @@ class OpenDocumentStripper(archive.TerminalZipStripper):
        return metadata
    def remove_all(self):
-        ''' Removes metadata
+        """ Removes metadata
-        '''
+        """
        return super(OpenDocumentStripper, self).remove_all(ending_blacklist=['meta.xml'])
    def is_clean(self):
-        ''' Check if the file is clean from harmful metadatas
+        """ Check if the file is clean from harmful metadatas
-        '''
+        """
        clean_super = super(OpenDocumentStripper, self).is_clean()
        if clean_super is False:
            return False
@@ -70,20 +70,21 @@ class OpenDocumentStripper(archive.TerminalZipStripper):
 class OpenXmlStripper(archive.TerminalZipStripper):
-    ''' Represent an office openxml document, which is like
+    """ Represent an office openxml document, which is like
        an opendocument format, with some tricky stuff added.
        It contains mostly xml, but can have media blobs, crap, ...
        (I don't like this format.)
-    '''
+    """
    def remove_all(self):
        return super(OpenXmlStripper, self).remove_all(
-                beginning_blacklist=('docProps/'), whitelist=('.rels'))
+            beginning_blacklist='docProps/', whitelist='.rels')
    def is_clean(self):
-        ''' Check if the file is clean from harmful metadatas.
+        """ Check if the file is clean from harmful metadatas.
            This implementation is faster than something like
            "return this.get_meta() == {}".
-        '''
+        """
        clean_super = super(OpenXmlStripper, self).is_clean()
        if clean_super is False:
            return False
@@ -96,8 +97,8 @@ class OpenXmlStripper(archive.TerminalZipStripper):
        return True
    def get_meta(self):
-        ''' Return a dict with all the meta of the file
+        """ Return a dict with all the meta of the file
-        '''
+        """
        metadata = super(OpenXmlStripper, self).get_meta()
        zipin = zipfile.ZipFile(self.filename, 'r')
@@ -109,8 +110,9 @@ class OpenXmlStripper(archive.TerminalZipStripper):
 class PdfStripper(parser.GenericParser):
-    ''' Represent a PDF file
+    """ Represent a PDF file
-    '''
+    """
    def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
        super(PdfStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
        self.uri = 'file://' + os.path.abspath(self.filename)
@@ -121,16 +123,16 @@ class PdfStripper(parser.GenericParser):
            self.pdf_quality = False
        self.meta_list = frozenset(['title', 'author', 'subject',
-            'keywords', 'creator', 'producer', 'metadata'])
+                                    'keywords', 'creator', 'producer', 'metadata'])
    def is_clean(self):
-        ''' Check if the file is clean from harmful metadatas
+        """ Check if the file is clean from harmful metadatas
-        '''
+        """
        document = Poppler.Document.new_from_file(self.uri, self.password)
        return not any(document.get_property(key) for key in self.meta_list)
    def remove_all(self):
-        ''' Opening the PDF with poppler, then doing a render
+        """ Opening the PDF with poppler, then doing a render
            on a cairo pdfsurface for each pages.
            http://cairographics.org/documentation/pycairo/2/
@@ -138,7 +140,7 @@ class PdfStripper(parser.GenericParser):
            The use of an intermediate tempfile is necessary because
            python-cairo segfaults on unicode.
            See http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=699457
-        '''
+        """
        document = Poppler.Document.new_from_file(self.uri, self.password)
        try:
            output = tempfile.mkstemp()[1]
@@ -169,6 +171,7 @@ class PdfStripper(parser.GenericParser):
        try:
            import pdfrw  # For now, poppler cannot write meta, so we must use pdfrw
            logging.debug('Removing %s\'s superficial metadata' % self.filename)
            trailer = pdfrw.PdfReader(self.output)
            trailer.Info.Producer = None
@@ -183,8 +186,8 @@ class PdfStripper(parser.GenericParser):
        return True
    def get_meta(self):
-        ''' Return a dict with all the meta of the file
+        """ Return a dict with all the meta of the file
-        '''
+        """
        document = Poppler.Document.new_from_file(self.uri, self.password)
        metadata = {}
        for key in self.meta_list:

diff --git a/libmat/office.py b/libmat/office.py index d020c46..bd4bd97 100644 --- a/libmat/office.py +++ b/libmat/office.py
@@ -1,6 +1,6 @@
1	''' Care about office's formats	1	""" Care about office's formats
2		2
3	'''	3	"""
4		4
5	import logging	5	import logging
6	import os	6	import os
@@ -21,14 +21,14 @@ import archive
21		21
22		22
23	class OpenDocumentStripper(archive.TerminalZipStripper):	23	class OpenDocumentStripper(archive.TerminalZipStripper):
24	''' An open document file is a zip, with xml file into.	24	""" An open document file is a zip, with xml file into.
25	The one that interest us is meta.xml	25	The one that interest us is meta.xml
26	'''	26	"""
27		27
28	def get_meta(self):	28	def get_meta(self):
29	''' Return a dict with all the meta of the file by	29	""" Return a dict with all the meta of the file by
30	trying to read the meta.xml file.	30	trying to read the meta.xml file.
31	'''	31	"""
32	metadata = super(OpenDocumentStripper, self).get_meta()	32	metadata = super(OpenDocumentStripper, self).get_meta()
33	zipin = zipfile.ZipFile(self.filename, 'r')	33	zipin = zipfile.ZipFile(self.filename, 'r')
34	try:	34	try:
@@ -49,13 +49,13 @@ class OpenDocumentStripper(archive.TerminalZipStripper):
49	return metadata	49	return metadata
50		50
51	def remove_all(self):	51	def remove_all(self):
52	''' Removes metadata	52	""" Removes metadata
53	'''	53	"""
54	return super(OpenDocumentStripper, self).remove_all(ending_blacklist=['meta.xml'])	54	return super(OpenDocumentStripper, self).remove_all(ending_blacklist=['meta.xml'])
55		55
56	def is_clean(self):	56	def is_clean(self):
57	''' Check if the file is clean from harmful metadatas	57	""" Check if the file is clean from harmful metadatas
58	'''	58	"""
59	clean_super = super(OpenDocumentStripper, self).is_clean()	59	clean_super = super(OpenDocumentStripper, self).is_clean()
60	if clean_super is False:	60	if clean_super is False:
61	return False	61	return False
@@ -70,20 +70,21 @@ class OpenDocumentStripper(archive.TerminalZipStripper):
70		70
71		71
72	class OpenXmlStripper(archive.TerminalZipStripper):	72	class OpenXmlStripper(archive.TerminalZipStripper):
73	''' Represent an office openxml document, which is like	73	""" Represent an office openxml document, which is like
74	an opendocument format, with some tricky stuff added.	74	an opendocument format, with some tricky stuff added.
75	It contains mostly xml, but can have media blobs, crap, ...	75	It contains mostly xml, but can have media blobs, crap, ...
76	(I don't like this format.)	76	(I don't like this format.)
77	'''	77	"""
		78
78	def remove_all(self):	79	def remove_all(self):
79	return super(OpenXmlStripper, self).remove_all(	80	return super(OpenXmlStripper, self).remove_all(
80	beginning_blacklist=('docProps/'), whitelist=('.rels'))	81	beginning_blacklist='docProps/', whitelist='.rels')
81		82
82	def is_clean(self):	83	def is_clean(self):
83	''' Check if the file is clean from harmful metadatas.	84	""" Check if the file is clean from harmful metadatas.
84	This implementation is faster than something like	85	This implementation is faster than something like
85	"return this.get_meta() == {}".	86	"return this.get_meta() == {}".
86	'''	87	"""
87	clean_super = super(OpenXmlStripper, self).is_clean()	88	clean_super = super(OpenXmlStripper, self).is_clean()
88	if clean_super is False:	89	if clean_super is False:
89	return False	90	return False
@@ -96,8 +97,8 @@ class OpenXmlStripper(archive.TerminalZipStripper):
96	return True	97	return True
97		98
98	def get_meta(self):	99	def get_meta(self):
99	''' Return a dict with all the meta of the file	100	""" Return a dict with all the meta of the file
100	'''	101	"""
101	metadata = super(OpenXmlStripper, self).get_meta()	102	metadata = super(OpenXmlStripper, self).get_meta()
102		103
103	zipin = zipfile.ZipFile(self.filename, 'r')	104	zipin = zipfile.ZipFile(self.filename, 'r')
@@ -109,8 +110,9 @@ class OpenXmlStripper(archive.TerminalZipStripper):
109		110
110		111
111	class PdfStripper(parser.GenericParser):	112	class PdfStripper(parser.GenericParser):
112	''' Represent a PDF file	113	""" Represent a PDF file
113	'''	114	"""
		115
114	def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):	116	def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
115	super(PdfStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)	117	super(PdfStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
116	self.uri = 'file://' + os.path.abspath(self.filename)	118	self.uri = 'file://' + os.path.abspath(self.filename)
@@ -121,16 +123,16 @@ class PdfStripper(parser.GenericParser):
121	self.pdf_quality = False	123	self.pdf_quality = False
122		124
123	self.meta_list = frozenset(['title', 'author', 'subject',	125	self.meta_list = frozenset(['title', 'author', 'subject',
124	'keywords', 'creator', 'producer', 'metadata'])	126	'keywords', 'creator', 'producer', 'metadata'])
125		127
126	def is_clean(self):	128	def is_clean(self):
127	''' Check if the file is clean from harmful metadatas	129	""" Check if the file is clean from harmful metadatas
128	'''	130	"""
129	document = Poppler.Document.new_from_file(self.uri, self.password)	131	document = Poppler.Document.new_from_file(self.uri, self.password)
130	return not any(document.get_property(key) for key in self.meta_list)	132	return not any(document.get_property(key) for key in self.meta_list)
131		133
132	def remove_all(self):	134	def remove_all(self):
133	''' Opening the PDF with poppler, then doing a render	135	""" Opening the PDF with poppler, then doing a render
134	on a cairo pdfsurface for each pages.	136	on a cairo pdfsurface for each pages.
135		137
136	http://cairographics.org/documentation/pycairo/2/	138	http://cairographics.org/documentation/pycairo/2/
@@ -138,7 +140,7 @@ class PdfStripper(parser.GenericParser):
138	The use of an intermediate tempfile is necessary because	140	The use of an intermediate tempfile is necessary because
139	python-cairo segfaults on unicode.	141	python-cairo segfaults on unicode.
140	See http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=699457	142	See http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=699457
141	'''	143	"""
142	document = Poppler.Document.new_from_file(self.uri, self.password)	144	document = Poppler.Document.new_from_file(self.uri, self.password)
143	try:	145	try:
144	output = tempfile.mkstemp()[1]	146	output = tempfile.mkstemp()[1]
@@ -169,6 +171,7 @@ class PdfStripper(parser.GenericParser):
169		171
170	try:	172	try:
171	import pdfrw # For now, poppler cannot write meta, so we must use pdfrw	173	import pdfrw # For now, poppler cannot write meta, so we must use pdfrw
		174
172	logging.debug('Removing %s\'s superficial metadata' % self.filename)	175	logging.debug('Removing %s\'s superficial metadata' % self.filename)
173	trailer = pdfrw.PdfReader(self.output)	176	trailer = pdfrw.PdfReader(self.output)
174	trailer.Info.Producer = None	177	trailer.Info.Producer = None
@@ -183,8 +186,8 @@ class PdfStripper(parser.GenericParser):
183	return True	186	return True
184		187
185	def get_meta(self):	188	def get_meta(self):
186	''' Return a dict with all the meta of the file	189	""" Return a dict with all the meta of the file
187	'''	190	"""
188	document = Poppler.Document.new_from_file(self.uri, self.password)	191	document = Poppler.Document.new_from_file(self.uri, self.password)
189	metadata = {}	192	metadata = {}
190	for key in self.meta_list:	193	for key in self.meta_list: