summaryrefslogtreecommitdiff
path: root/modules/levenshtein.py
diff options
context:
space:
mode:
authorJulien Voisin2015-03-05 15:36:22 +0100
committerJulien Voisin2015-03-05 15:36:22 +0100
commit6beeeebe3c43f0643e521139d3f8b1ff4a7f3059 (patch)
tree72de2c9e6f8eb30b847da44213b8482f98691589 /modules/levenshtein.py
parent1c917ed43a58e1c1c77ccd0815b6e95fbcca54ff (diff)
Yara is cooler than Python
Diffstat (limited to 'modules/levenshtein.py')
-rw-r--r--modules/levenshtein.py73
1 files changed, 0 insertions, 73 deletions
diff --git a/modules/levenshtein.py b/modules/levenshtein.py
deleted file mode 100644
index 2e854e2..0000000
--- a/modules/levenshtein.py
+++ /dev/null
@@ -1,73 +0,0 @@
1'''
2This modules has a super-awful complexity (something along n^4),
3so I'm quite sure that you don't want to run it by default ;)
4
5Anyway, this modules computes the Levenshtein distance between samples of malwares
6and files to check, to find similarities.
7'''
8import os
9
10import scanmodule
11
12def main():
13 return Levenshtein()
14
15class Levenshtein(scanmodule.ScanModule):
16 name = 'levenshtein'
17 def populate(self, path):
18 ''' We can't really populate the database with Levenshtein scores,
19 but we can speedup the calculation by storing files lenghts
20 '''
21 for root, _, filenames in os.walk(path):
22 for filename in filenames:
23 full_path = os.path.join(root, filename)
24 with open(full_path, 'r') as f:
25 self.samples[full_path] = [os.path.getsize(full_path), f.read().lower()]
26
27 def evaluate(self, path):
28 ''' Compare the hash of the given path to every samples one.
29 @ret A sorted list of the form [name, match_in_percent_superior_to_zero]
30 '''
31 file_to_test = path
32 file_size = os.path.getsize(file_to_test)
33
34 lst = list()
35 for sample_name, sample_intel in self.samples.iteritems():
36 if sample_name != file_to_test:
37 score = self.__levenshtein(file_to_test, sample_intel[1])
38 score = score / ((file_size + sample_intel[0]) / 2.0) # mean value
39 if score > 25: # if the match is under 10%, we don't care
40 lst.append([sample_name, score * 10])
41 return sorted(lst, key=lambda lst: lst[1], reverse=True)
42
43 def __levenshtein_file(self, f, b):
44 ''' Computes the Levenshtein's distance between a file and a buffer
45 @param f1 File
46 @param fs2 Buffer
47 @return The levenshtein distance
48 '''
49 with open(f, 'r') as of:
50 return self.__levenshtein(of.read().lower(), b)
51
52 def __levenshtein(self, s1, s2):
53 ''' Computes the Levenshtein's distance between two strings
54 @param s1 First string
55 @param s2 Second string
56 @return The levenshtein distance
57 '''
58
59 if len(s1) < len(s2): # Minimize computation
60 s1, s2 = s2, s1
61
62 previous_row = range(len(s2) + 1)
63 for i, c1 in enumerate(s1):
64 current_row = [i + 1]
65 for j, c2 in enumerate(s2):
66 insertions = previous_row[j + 1] + 1
67 deletions = current_row[j] + 1
68 substitutions = previous_row[j] + (c1 != c2)
69 current_row.append(min(insertions, deletions, substitutions))
70 previous_row = current_row
71
72 return previous_row[-1]
73