''' This modules has a super-awful complexity (something along n^4), so I'm quite sure that you don't want to run it by default ;) Anyway, this modules computes the Levenshtein distance between samples of malwares and files to check, to find similarities. ''' import os import scanmodule def main(): return Levenshtein() class Levenshtein(scanmodule.ScanModule): name = 'levenshtein' def populate(self, path): ''' We can't really populate the database with Levenshtein scores, but we can speedup the calculation by storing files lenghts ''' for root, _, filenames in os.walk(path): for filename in filenames: full_path = os.path.join(root, filename) with open(full_path, 'r') as f: self.samples[full_path] = [os.path.getsize(full_path), f.read().lower()] def evaluate(self, path): ''' Compare the hash of the given path to every samples one. @ret A sorted list of the form [name, match_in_percent_superior_to_zero] ''' file_to_test = path file_size = os.path.getsize(file_to_test) lst = list() for sample_name, sample_intel in self.samples.iteritems(): if sample_name != file_to_test: score = self.__levenshtein(file_to_test, sample_intel[1]) score = score / ((file_size + sample_intel[0]) / 2.0) # mean value if score > 25: # if the match is under 10%, we don't care lst.append([sample_name, score * 10]) return sorted(lst, key=lambda lst: lst[1], reverse=True) def __levenshtein_file(self, f, b): ''' Computes the Levenshtein's distance between a file and a buffer @param f1 File @param fs2 Buffer @return The levenshtein distance ''' with open(f, 'r') as of: return self.__levenshtein(of.read().lower(), b) def __levenshtein(self, s1, s2): ''' Computes the Levenshtein's distance between two strings @param s1 First string @param s2 Second string @return The levenshtein distance ''' if len(s1) < len(s2): # Minimize computation s1, s2 = s2, s1 previous_row = range(len(s2) + 1) for i, c1 in enumerate(s1): current_row = [i + 1] for j, c2 in enumerate(s2): insertions = previous_row[j + 1] + 1 deletions = current_row[j] + 1 substitutions = previous_row[j] + (c1 != c2) current_row.append(min(insertions, deletions, substitutions)) previous_row = current_row return previous_row[-1]