Yara is cooler than Python

author: Julien Voisin 2015-03-05 15:36:22 +0100
committer: Julien Voisin 2015-03-05 15:36:22 +0100
commit: 6beeeebe3c43f0643e521139d3f8b1ff4a7f3059 (patch)
tree: 72de2c9e6f8eb30b847da44213b8482f98691589 /modules/levenshtein.py
parent: 1c917ed43a58e1c1c77ccd0815b6e95fbcca54ff (diff)
1 files changed, 0 insertions, 73 deletions
diff --git a/modules/levenshtein.py b/modules/levenshtein.py
deleted file mode 100644
index 2e854e2..0000000
--- a/modules/levenshtein.py
+++ /dev/null
@@ -1,73 +0,0 @@
-'''
-This modules has a super-awful complexity (something along n^4),
-so I'm quite sure that you don't want to run it by default ;)
-Anyway, this modules computes the Levenshtein distance between samples of malwares
-and files to check, to find similarities.
-'''
-import os
-import scanmodule
-def main():
-    return Levenshtein()
-class Levenshtein(scanmodule.ScanModule):
-    name = 'levenshtein'
-    def populate(self, path):
-        ''' We can't really populate the database with Levenshtein scores,
-        but we can speedup the calculation by storing files lenghts
-        '''
-        for root, _, filenames in os.walk(path):
-            for filename in filenames:
-                full_path = os.path.join(root, filename)
-                with open(full_path, 'r') as f:
-                    self.samples[full_path] = [os.path.getsize(full_path), f.read().lower()]
-    def evaluate(self, path):
-        ''' Compare the hash of the given path to every samples one.
-        @ret A sorted list of the form [name, match_in_percent_superior_to_zero]
-        '''
-        file_to_test = path
-        file_size = os.path.getsize(file_to_test)
-        lst = list()
-        for sample_name, sample_intel in self.samples.iteritems():
-            if sample_name != file_to_test:
-                score = self.__levenshtein(file_to_test, sample_intel[1])
-                score = score / ((file_size + sample_intel[0]) / 2.0)  # mean value
-                if score > 25:  # if the match is under 10%, we don't care
-                    lst.append([sample_name, score * 10])
-        return sorted(lst, key=lambda lst: lst[1], reverse=True)
-    def __levenshtein_file(self, f, b):
-        ''' Computes the Levenshtein's distance between a file and a buffer
-        @param f1 File
-        @param fs2 Buffer
-        @return The levenshtein distance
-        '''
-        with open(f, 'r') as of:
-            return self.__levenshtein(of.read().lower(), b)
-    def __levenshtein(self, s1, s2):
-        ''' Computes the Levenshtein's distance between two strings
-        @param s1 First string
-        @param s2 Second string
-        @return The levenshtein distance
-        '''
-        if len(s1) < len(s2):  # Minimize computation
-            s1, s2 = s2, s1
-        previous_row = range(len(s2) + 1)
-        for i, c1 in enumerate(s1):
-            current_row = [i + 1]
-            for j, c2 in enumerate(s2):
-                insertions = previous_row[j + 1] + 1
-                deletions = current_row[j] + 1
-                substitutions = previous_row[j] + (c1 != c2)
-                current_row.append(min(insertions, deletions, substitutions))
-            previous_row = current_row
-        return previous_row[-1]
author	Julien Voisin	2015-03-05 15:36:22 +0100
committer	Julien Voisin	2015-03-05 15:36:22 +0100
commit	6beeeebe3c43f0643e521139d3f8b1ff4a7f3059 (patch)
tree	72de2c9e6f8eb30b847da44213b8482f98691589 /modules/levenshtein.py
parent	1c917ed43a58e1c1c77ccd0815b6e95fbcca54ff (diff)

diff --git a/modules/levenshtein.py b/modules/levenshtein.py
deleted file mode 100644
index 2e854e2..0000000
--- a/modules/levenshtein.py
+++ /dev/null
@@ -1,73 +0,0 @@
1	'''
2	This module has a super-awful complexity (something along n^4),
3	so I'm quite sure that you don't want to run it by default ;)
4
5	Anyway, this module computes the Levenshtein distance between malware samples
6	and files to check, to find similarities.
7	'''
8	import os
9
10	import scanmodule
11
def main():
    ''' Entry point of the module: build the scanner defined in this file.
    @ret A new Levenshtein instance
    '''
    scanner = Levenshtein()
    return scanner
14
class Levenshtein(scanmodule.ScanModule):
    ''' Scan module ranking known samples by their Levenshtein similarity
    to a given file.

    Samples are kept in self.samples, keyed by their path, as
    [size_in_bytes, lowercased_content]. self.samples itself is not created
    here; it is presumably provided by scanmodule.ScanModule (to confirm).
    '''
    name = 'levenshtein'

    # Matches at or below this percentage are considered noise and dropped.
    min_match_percent = 10

    def populate(self, path):
        ''' We can't really populate the database with Levenshtein scores,
        but we can speed up the calculation by caching, for every sample,
        its size and its lowercased content.
        @param path Directory walked recursively to collect the samples
        '''
        for root, _, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(root, filename)
                self.samples[full_path] = [
                    os.path.getsize(full_path),
                    self.__read_lowered(full_path),
                ]

    def evaluate(self, path):
        ''' Compare the content of the given file to the content of every sample.
        @param path Path of the file to check
        @ret A list of [name, match_in_percent], best match first, holding only
             the samples matching by more than min_match_percent
        '''
        # Read the file once, instead of once per sample.
        content = self.__read_lowered(path)

        matches = list()
        # items() exists on both Python 2 and 3, iteritems() only on Python 2.
        for sample_name, sample_intel in self.samples.items():
            if sample_name == path:  # do not compare a sample with itself
                continue

            sample_content = sample_intel[1]
            longest = max(len(content), len(sample_content))
            if longest == 0:  # two empty files are identical
                percent = 100.0
            else:
                distance = self.__levenshtein(content, sample_content)
                # The distance never exceeds the length of the longest input,
                # so the result always lies between 0 and 100.
                percent = (1.0 - distance / float(longest)) * 100

            if percent > self.min_match_percent:
                matches.append([sample_name, percent])
        return sorted(matches, key=lambda match: match[1], reverse=True)

    def __read_lowered(self, path):
        ''' Reads a file and lowercases its content
        @param path Path of the file to read
        @return The lowercased content of the file, as bytes
        '''
        # Binary mode: scanned files are arbitrary data, and decoding them
        # as text may fail on Python 3.
        with open(path, 'rb') as of:
            return of.read().lower()

    def __levenshtein_file(self, f, b):
        ''' Computes the Levenshtein's distance between a file and a buffer
        @param f Path of the file
        @param b Buffer, lowercased the same way as the samples content
        @return The levenshtein distance
        '''
        return self.__levenshtein(self.__read_lowered(f), b)

    def __levenshtein(self, s1, s2):
        ''' Computes the Levenshtein's distance between two sequences
        @param s1 First sequence
        @param s2 Second sequence
        @return The levenshtein distance
        '''
        if s1 == s2:  # nothing to edit
            return 0

        if len(s1) < len(s2):  # keep the rows as short as the shortest input
            s1, s2 = s2, s1

        # previous_row[j] is the distance between the part of s1 consumed so
        # far and the first j items of s2; only two rows are ever kept.
        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        return previous_row[-1]
73