diff options
Diffstat (limited to 'modules/levenshtein.py')
| -rw-r--r-- | modules/levenshtein.py | 73 |
1 files changed, 0 insertions, 73 deletions
diff --git a/modules/levenshtein.py b/modules/levenshtein.py deleted file mode 100644 index 2e854e2..0000000 --- a/modules/levenshtein.py +++ /dev/null | |||
| @@ -1,73 +0,0 @@ | |||
| 1 | ''' | ||
| 2 | This modules has a super-awful complexity (something along n^4), | ||
| 3 | so I'm quite sure that you don't want to run it by default ;) | ||
| 4 | |||
| 5 | Anyway, this modules computes the Levenshtein distance between samples of malwares | ||
| 6 | and files to check, to find similarities. | ||
| 7 | ''' | ||
| 8 | import os | ||
| 9 | |||
| 10 | import scanmodule | ||
| 11 | |||
| 12 | def main(): | ||
| 13 | return Levenshtein() | ||
| 14 | |||
| 15 | class Levenshtein(scanmodule.ScanModule): | ||
| 16 | name = 'levenshtein' | ||
| 17 | def populate(self, path): | ||
| 18 | ''' We can't really populate the database with Levenshtein scores, | ||
| 19 | but we can speedup the calculation by storing files lenghts | ||
| 20 | ''' | ||
| 21 | for root, _, filenames in os.walk(path): | ||
| 22 | for filename in filenames: | ||
| 23 | full_path = os.path.join(root, filename) | ||
| 24 | with open(full_path, 'r') as f: | ||
| 25 | self.samples[full_path] = [os.path.getsize(full_path), f.read().lower()] | ||
| 26 | |||
| 27 | def evaluate(self, path): | ||
| 28 | ''' Compare the hash of the given path to every samples one. | ||
| 29 | @ret A sorted list of the form [name, match_in_percent_superior_to_zero] | ||
| 30 | ''' | ||
| 31 | file_to_test = path | ||
| 32 | file_size = os.path.getsize(file_to_test) | ||
| 33 | |||
| 34 | lst = list() | ||
| 35 | for sample_name, sample_intel in self.samples.iteritems(): | ||
| 36 | if sample_name != file_to_test: | ||
| 37 | score = self.__levenshtein(file_to_test, sample_intel[1]) | ||
| 38 | score = score / ((file_size + sample_intel[0]) / 2.0) # mean value | ||
| 39 | if score > 25: # if the match is under 10%, we don't care | ||
| 40 | lst.append([sample_name, score * 10]) | ||
| 41 | return sorted(lst, key=lambda lst: lst[1], reverse=True) | ||
| 42 | |||
| 43 | def __levenshtein_file(self, f, b): | ||
| 44 | ''' Computes the Levenshtein's distance between a file and a buffer | ||
| 45 | @param f1 File | ||
| 46 | @param fs2 Buffer | ||
| 47 | @return The levenshtein distance | ||
| 48 | ''' | ||
| 49 | with open(f, 'r') as of: | ||
| 50 | return self.__levenshtein(of.read().lower(), b) | ||
| 51 | |||
| 52 | def __levenshtein(self, s1, s2): | ||
| 53 | ''' Computes the Levenshtein's distance between two strings | ||
| 54 | @param s1 First string | ||
| 55 | @param s2 Second string | ||
| 56 | @return The levenshtein distance | ||
| 57 | ''' | ||
| 58 | |||
| 59 | if len(s1) < len(s2): # Minimize computation | ||
| 60 | s1, s2 = s2, s1 | ||
| 61 | |||
| 62 | previous_row = range(len(s2) + 1) | ||
| 63 | for i, c1 in enumerate(s1): | ||
| 64 | current_row = [i + 1] | ||
| 65 | for j, c2 in enumerate(s2): | ||
| 66 | insertions = previous_row[j + 1] + 1 | ||
| 67 | deletions = current_row[j] + 1 | ||
| 68 | substitutions = previous_row[j] + (c1 != c2) | ||
| 69 | current_row.append(min(insertions, deletions, substitutions)) | ||
| 70 | previous_row = current_row | ||
| 71 | |||
| 72 | return previous_row[-1] | ||
| 73 | |||
