modules/levenshtein.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73

'''
This modules has a super-awful complexity (something along n^4),
so I'm quite sure that you don't want to run it by default ;)

Anyway, this modules computes the Levenshtein distance between samples of malwares
and files to check, to find similarities.
'''
import os

import scanmodule

def main():
    return Levenshtein()

class Levenshtein(scanmodule.ScanModule):
    name = 'levenshtein'
    def populate(self, path):
        ''' We can't really populate the database with Levenshtein scores,
        but we can speedup the calculation by storing files lenghts
        '''
        for root, _, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(root, filename)
                with open(full_path, 'r') as f:
                    self.samples[full_path] = [os.path.getsize(full_path), f.read().lower()]

    def evaluate(self, path):
        ''' Compare the hash of the given path to every samples one.
        @ret A sorted list of the form [name, match_in_percent_superior_to_zero]
        '''
        file_to_test = path
        file_size = os.path.getsize(file_to_test)

        lst = list()
        for sample_name, sample_intel in self.samples.iteritems():
            if sample_name != file_to_test:
                score = self.__levenshtein(file_to_test, sample_intel[1])
                score = score / ((file_size + sample_intel[0]) / 2.0)  # mean value
                if score > 25:  # if the match is under 10%, we don't care
                    lst.append([sample_name, score * 10])
        return sorted(lst, key=lambda lst: lst[1], reverse=True)

    def __levenshtein_file(self, f, b):
        ''' Computes the Levenshtein's distance between a file and a buffer
        @param f1 File
        @param fs2 Buffer
        @return The levenshtein distance
        '''
        with open(f, 'r') as of:
            return self.__levenshtein(of.read().lower(), b)

    def __levenshtein(self, s1, s2):
        ''' Computes the Levenshtein's distance between two strings
        @param s1 First string
        @param s2 Second string
        @return The levenshtein distance
        '''

        if len(s1) < len(s2):  # Minimize computation
            s1, s2 = s2, s1

        previous_row = range(len(s2) + 1)
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        return previous_row[-1]