summaryrefslogtreecommitdiff
path: root/modules/libfuzzy.py
blob: a0d3f15a3e2329def7539ba9c3e3511e3c6c2461 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
''' Ugly-pseudo-bindings to libfuzzy (used by ssdeep) to check
if a file is similar to a given list of samples
'''

import os
import ctypes
import pickle
import sys
import logging
logging.basicConfig(level=logging.DEBUG)

import scanmodule


# Base length from which the hash buffer size below is derived.
# NOTE(review): presumably mirrors SPAMSUM_LENGTH from libfuzzy's fuzzy.h --
# confirm against the installed header.
SPAMSUM_LENGTH = 64
# Sizes the output buffer that receives the fuzzy hash computed by
# fuzzy_hash_buf (see FuzzyMatcher.__hash_from_file).
FUZZY_MAX_RESULT = SPAMSUM_LENGTH + SPAMSUM_LENGTH // 2 + 20

def main():
    ''' Module entry point: build the scanner defined in this file.
    @ret A new FuzzyMatcher created with its default arguments
    '''
    matcher = FuzzyMatcher()
    return matcher

class FuzzyMatcher(scanmodule.ScanModule):
    ''' Scan module comparing the fuzzy hash of a file against the fuzzy
    hashes of a set of samples.

    Hashing and comparison are delegated to the libfuzzy shared library
    through ctypes. Sample hashes live in self.samples, a dict of the
    form {path: fuzzy_hash}.
    NOTE(review): self.samples is never created in this class; it is
    presumably set up by scanmodule.ScanModule -- confirm.
    '''
    name = 'libfuzzy'

    def __init__(self, samples_path=None, persistence_path=None):
        ''' Bind to libfuzzy, then run the ScanModule initialisation.
        @param samples_path Currently unused by this class
        @param persistence_path Currently unused by this class
        NOTE(review): both parameters are accepted but ignored here and
        are not forwarded to the parent constructor -- confirm whether
        that is intended.
        '''
        self.__initialize_libfuzzy()
        super(FuzzyMatcher, self).__init__()

    def __initialize_libfuzzy(self):
        ''' Bind to libfuzzy thanks to ctypes.
        This will create the "fuzzy_hash_buf" and
        the "fuzzy_compare" methods.
        The process exits with status 1 if the library cannot be loaded.
        '''
        try:
            fuzzy = ctypes.CDLL('libfuzzy.so')
        except OSError:
            print('[-] Please check that you installed libfuzzy')
            sys.exit(1)

        self.__fuzzy_hash_buf = fuzzy.fuzzy_hash_buf
        self.__fuzzy_hash_buf.restype = ctypes.c_int
        self.__fuzzy_hash_buf.argtypes = [
            ctypes.c_char_p,  # buf
            ctypes.c_uint32,  # buf_len
            ctypes.c_char_p,  # result
        ]
        self.__fuzzy_compare = fuzzy.fuzzy_compare
        self.__fuzzy_compare.restype = ctypes.c_int
        self.__fuzzy_compare.argtypes = [
            ctypes.c_char_p,  # sig1
            ctypes.c_char_p,  # sig2
        ]

    def populate(self, path):
        ''' Computes fuzzy hashes of files under the given path,
        and store them in the dict self.samples with the form
        dict {name: fuzzy_hash}
        @param path Path containing the samples
        '''
        for root, _, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(root, filename)
                self.samples[full_path] = self.__hash_from_file(full_path)

    def __hash_from_file(self, path):
        ''' Return the hash of the given file
        @param path Path to the file to hash
        @ret Fuzzy hash of the given file, as a byte string (empty if
             libfuzzy left the output buffer untouched)
        '''
        # Binary mode: the raw bytes are what must be hashed. Text mode may
        # decode or alter the content, and hands a str to c_char_p, which
        # Python 3 rejects.
        with open(path, 'rb') as f:
            content = f.read()

        # Passing a size yields a zero-filled buffer on Python 2 and 3 alike.
        out = ctypes.create_string_buffer(FUZZY_MAX_RESULT)
        # fuzzy_hash_buf reports failure with a non-zero return value; keep
        # going (populate is best-effort) but do not hide the failure.
        if self.__fuzzy_hash_buf(content, len(content), out) != 0:
            logging.warning('libfuzzy failed to hash %s', path)
        return out.value

    def __iter_scores(self, path):
        ''' Hash the given file and compare it to every sample.
        @param path Path to the file to compare
        @ret Generator of (sample_name, score) tuples, one per sample
        '''
        fuzzy_hash = self.__hash_from_file(path)
        for sample_name, sample_hash in self.samples.items():
            yield sample_name, self.__fuzzy_compare(fuzzy_hash, sample_hash)

    def evaluate(self, path):
        ''' Compare the hash of the given path to every samples one.
        @param path Path to the file to compare
        @ret A list of [name, match_in_percent_superior_to_zero] entries,
             sorted by decreasing score
        '''
        # "> 0" rather than truthiness: fuzzy_compare signals an error with
        # -1, which must not be reported as a match.
        matches = [
            [sample_name, score]
            for sample_name, score in self.__iter_scores(path)
            if score > 0
        ]
        return sorted(matches, key=lambda entry: entry[1], reverse=True)

    def is_malware(self, path):
        ''' Tell whether the given file closely matches one of the samples.
        @param path Path to the file to check
        @ret True if the best score against the samples is above 90
        '''
        max_score = 0

        for sample_name, score in self.__iter_scores(path):
            if score > max_score:
                # Remember the best score seen so far.
                max_score = score
                logging.info('fuzzy score for %s matches %s at %d%%',
                             path, sample_name, score)

        return max_score > 90