diff options
| author | jvoisin | 2020-02-08 16:08:32 +0100 |
|---|---|---|
| committer | jvoisin | 2020-02-08 17:00:37 +0100 |
| commit | 5270071b94ba3a1353ed30323df5802222d2b277 (patch) | |
| tree | 16d2b7ca1c452005ea39f79af8181b087e19f922 /libmat2/pdf.py | |
| parent | 5312603a88411d8e65cc25387fd0e783a8878e5c (diff) | |
Remove a couple of residual metadata fields in PDF
This commit takes care of removing the residual metadata
that mat2 itself adds while cleaning PDF files.
Diffstat (limited to 'libmat2/pdf.py')
| -rw-r--r-- | libmat2/pdf.py | 11 |
1 files changed, 11 insertions, 0 deletions
diff --git a/libmat2/pdf.py b/libmat2/pdf.py index 547e071..2fe2b6b 100644 --- a/libmat2/pdf.py +++ b/libmat2/pdf.py | |||
| @@ -122,6 +122,17 @@ class PDFParser(abstract.AbstractParser): | |||
| 122 | document.set_creator('') | 122 | document.set_creator('') |
| 123 | document.set_creation_date(-1) | 123 | document.set_creation_date(-1) |
| 124 | document.save('file://' + os.path.abspath(out_file)) | 124 | document.save('file://' + os.path.abspath(out_file)) |
| 125 | |||
| 126 | # Cairo adds "/Producer" and "/CreationDate", and Poppler sometimes | ||
| 127 | # fails to remove them, we have to use this terrible regex. | ||
| 128 | # It should(tm) be alright though, because cairo's output format | ||
| 129 | # for metadata is fixed. | ||
| 130 | with open(out_file, 'rb') as f: | ||
| 131 | out = re.sub(rb'<<[\s\n]*/Producer.*?>>', b' << >>', f.read(), 0, | ||
| 132 | re.DOTALL | re.IGNORECASE) | ||
| 133 | with open(out_file, 'wb') as f: | ||
| 134 | f.write(out) | ||
| 135 | |||
| 125 | return True | 136 | return True |
| 126 | 137 | ||
| 127 | @staticmethod | 138 | @staticmethod |
