# Recovered from the git patch of commit 807248f9343a4cabb48c3be1a512b27f6377e871
# ("First commit!", Julien Voisin, Tue 3 Mar 2015), which created
# modules/levenshtein.py.
'''
This module has a super-awful complexity (something along n^4),
so I'm quite sure that you don't want to run it by default ;)

Anyway, this module computes the Levenshtein distance between samples of malware
and files to check, to find similarities.
'''
import os

import scanmodule


def main():
    ''' Build the scan module implemented in this file.
    @ret A new Levenshtein instance
    '''
    return Levenshtein()


class Levenshtein(scanmodule.ScanModule):
    ''' Scan module that ranks known samples by their Levenshtein similarity
        to a file under test.
    '''
    name = 'levenshtein'

    # Matches at or below this percentage are not reported by evaluate().
    min_match_percent = 10

    def populate(self, path):
        ''' We can't really populate the database with Levenshtein scores,
            but we can speed up the calculation by storing file lengths.

            Every file found under `path` is stored in self.samples as
            [size_in_bytes, lowercased_content], keyed by its full path.
            NOTE(review): self.samples is not created here; presumably the
            ScanModule base class provides it -- confirm.
        @param path Directory walked recursively for sample files
        '''
        for root, _, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(root, filename)
                with open(full_path, 'r') as f:
                    self.samples[full_path] = [os.path.getsize(full_path), f.read().lower()]

    def evaluate(self, path):
        ''' Compare the content of the given path to every sample's content.

            The match is expressed as a percentage derived from the
            Levenshtein distance: 100 means identical content, 0 means that
            nothing is shared. The sample stored under `path` itself is
            skipped.
        @param path File to test
        @ret A sorted list (best match first) of the form
             [name, match_in_percent], limited to matches strictly above
             min_match_percent
        '''
        # Read the file once: re-reading it for every sample is wasted I/O.
        with open(path, 'r') as tested_file:
            content = tested_file.read().lower()
        content_length = len(content)

        matches = list()
        for sample_name, sample_intel in self.samples.items():
            if sample_name == path:
                continue

            sample_content = sample_intel[1]
            sample_length = len(sample_content)
            longest = max(content_length, sample_length)
            shortest = min(content_length, sample_length)
            if longest == 0:
                # Two empty contents: nothing to compare, and the
                # normalisation below would divide by zero.
                continue

            # The distance is at least the difference of the lengths, so the
            # match can never exceed shortest/longest. Skip the (very
            # expensive) distance computation when even this upper bound is
            # not worth reporting.
            if 100.0 * shortest / longest <= self.min_match_percent:
                continue

            # The distance is at most `longest`, hence a match in [0, 100].
            distance = self.__levenshtein(content, sample_content)
            match = 100.0 * (longest - distance) / longest
            if match > self.min_match_percent:
                matches.append([sample_name, match])

        return sorted(matches, key=lambda entry: entry[1], reverse=True)

    def __levenshtein_file(self, f, b):
        ''' Computes the Levenshtein's distance between a file and a buffer
        @param f Path of the file; its content is lowercased before comparison
        @param b Buffer
        @return The levenshtein distance
        '''
        with open(f, 'r') as of:
            return self.__levenshtein(of.read().lower(), b)

    def __levenshtein(self, s1, s2):
        ''' Computes the Levenshtein's distance between two strings
        @param s1 First string
        @param s2 Second string
        @return The levenshtein distance
        '''
        if len(s1) < len(s2):  # Keep the shorter string on the row axis
            s1, s2 = s2, s1

        # Only two rows of the edit-distance matrix are alive at any time.
        # previous_row[j] is the distance between the prefix of s1 consumed
        # so far and the first j characters of s2.
        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        return previous_row[-1]