From 807248f9343a4cabb48c3be1a512b27f6377e871 Mon Sep 17 00:00:00 2001
From: Julien Voisin
Date: Tue, 3 Mar 2015 15:58:59 +0100
Subject: First commit!

---
 modules/levenshtein.py | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 modules/levenshtein.py

(limited to 'modules/levenshtein.py')

diff --git a/modules/levenshtein.py b/modules/levenshtein.py
new file mode 100644
index 0000000..2e854e2
--- /dev/null
+++ b/modules/levenshtein.py
@@ -0,0 +1,73 @@
+'''
+This modules has a super-awful complexity (something along n^4),
+so I'm quite sure that you don't want to run it by default ;)
+
+Anyway, this modules computes the Levenshtein distance between samples of malwares
+and files to check, to find similarities.
+'''
+import os
+
+import scanmodule
+
+def main():
+    return Levenshtein()
+
+class Levenshtein(scanmodule.ScanModule):
+    name = 'levenshtein'
+    def populate(self, path):
+        ''' We can't really populate the database with Levenshtein scores,
+        but we can speedup the calculation by storing files lenghts
+        '''
+        for root, _, filenames in os.walk(path):
+            for filename in filenames:
+                full_path = os.path.join(root, filename)
+                with open(full_path, 'r') as f:
+                    self.samples[full_path] = [os.path.getsize(full_path), f.read().lower()]
+
+    def evaluate(self, path):
+        ''' Compare the hash of the given path to every samples one.
+        @ret A sorted list of the form [name, match_in_percent_superior_to_zero]
+        '''
+        file_to_test = path
+        file_size = os.path.getsize(file_to_test)
+
+        lst = list()
+        for sample_name, sample_intel in self.samples.iteritems():
+            if sample_name != file_to_test:
+                score = self.__levenshtein(file_to_test, sample_intel[1])
+                score = score / ((file_size + sample_intel[0]) / 2.0)  # mean value
+                if score > 25:  # if the match is under 10%, we don't care
+                    lst.append([sample_name, score * 10])
+        return sorted(lst, key=lambda lst: lst[1], reverse=True)
+
+    def __levenshtein_file(self, f, b):
+        ''' Computes the Levenshtein's distance between a file and a buffer
+        @param f1 File
+        @param fs2 Buffer
+        @return The levenshtein distance
+        '''
+        with open(f, 'r') as of:
+            return self.__levenshtein(of.read().lower(), b)
+
+    def __levenshtein(self, s1, s2):
+        ''' Computes the Levenshtein's distance between two strings
+        @param s1 First string
+        @param s2 Second string
+        @return The levenshtein distance
+        '''
+
+        if len(s1) < len(s2):  # Minimize computation
+            s1, s2 = s2, s1
+
+        previous_row = range(len(s2) + 1)
+        for i, c1 in enumerate(s1):
+            current_row = [i + 1]
+            for j, c2 in enumerate(s2):
+                insertions = previous_row[j + 1] + 1
+                deletions = current_row[j] + 1
+                substitutions = previous_row[j] + (c1 != c2)
+                current_row.append(min(insertions, deletions, substitutions))
+            previous_row = current_row
+
+        return previous_row[-1]
+
-- 
cgit v1.3