From 6beeeebe3c43f0643e521139d3f8b1ff4a7f3059 Mon Sep 17 00:00:00 2001 From: Julien Voisin Date: Thu, 5 Mar 2015 15:36:22 +0100 Subject: Yara is cooler than Python --- modules/levenshtein.py | 73 -------------------------------------------------- 1 file changed, 73 deletions(-) delete mode 100644 modules/levenshtein.py (limited to 'modules/levenshtein.py') diff --git a/modules/levenshtein.py b/modules/levenshtein.py deleted file mode 100644 index 2e854e2..0000000 --- a/modules/levenshtein.py +++ /dev/null @@ -1,73 +0,0 @@ -''' -This modules has a super-awful complexity (something along n^4), -so I'm quite sure that you don't want to run it by default ;) - -Anyway, this modules computes the Levenshtein distance between samples of malwares -and files to check, to find similarities. -''' -import os - -import scanmodule - -def main(): - return Levenshtein() - -class Levenshtein(scanmodule.ScanModule): - name = 'levenshtein' - def populate(self, path): - ''' We can't really populate the database with Levenshtein scores, - but we can speedup the calculation by storing files lenghts - ''' - for root, _, filenames in os.walk(path): - for filename in filenames: - full_path = os.path.join(root, filename) - with open(full_path, 'r') as f: - self.samples[full_path] = [os.path.getsize(full_path), f.read().lower()] - - def evaluate(self, path): - ''' Compare the hash of the given path to every samples one. - @ret A sorted list of the form [name, match_in_percent_superior_to_zero] - ''' - file_to_test = path - file_size = os.path.getsize(file_to_test) - - lst = list() - for sample_name, sample_intel in self.samples.iteritems(): - if sample_name != file_to_test: - score = self.__levenshtein(file_to_test, sample_intel[1]) - score = score / ((file_size + sample_intel[0]) / 2.0) # mean value - if score > 25: # if the match is under 10%, we don't care - lst.append([sample_name, score * 10]) - return sorted(lst, key=lambda lst: lst[1], reverse=True) - - def __levenshtein_file(self, f, b): - ''' Computes the Levenshtein's distance between a file and a buffer - @param f1 File - @param fs2 Buffer - @return The levenshtein distance - ''' - with open(f, 'r') as of: - return self.__levenshtein(of.read().lower(), b) - - def __levenshtein(self, s1, s2): - ''' Computes the Levenshtein's distance between two strings - @param s1 First string - @param s2 Second string - @return The levenshtein distance - ''' - - if len(s1) < len(s2): # Minimize computation - s1, s2 = s2, s1 - - previous_row = range(len(s2) + 1) - for i, c1 in enumerate(s1): - current_row = [i + 1] - for j, c2 in enumerate(s2): - insertions = previous_row[j + 1] + 1 - deletions = current_row[j] + 1 - substitutions = previous_row[j] + (c1 != c2) - current_row.append(min(insertions, deletions, substitutions)) - previous_row = current_row - - return previous_row[-1] - -- cgit v1.3