regex for file extensions
[NeoPI/dkf.git] / neopi.py
blob07aa90840ecc82faa38b725999af6c09f82e5a2b
1 #!/usr/bin/python
2 # Name: neopi.py
3 # Description: Utility to scan a file path for encrypted and obfuscated files
4 # Authors: Ben Hagen (ben.hagen@neohapsis.com)
5 # Scott Behrens (scott.behrens@neohapsis.com)
7 # Date: 11/4/2010
8 # Copyright: Neohapsis Open Source blah Blah
11 # TODO: add try/except handling for bad regular expressions, bad paths and bad filenames
13 # Library imports
14 import math
15 import sys
16 import os
17 import re
18 import zlib
19 import csv
20 from collections import defaultdict
21 from optparse import OptionParser
class LanguageIC:
    """Calculate the Index of Coincidence (IC) of individual files as well as
    the aggregate IC of every file processed by this instance.
    """

    def __init__(self):
        """Initialize the results list as well as the aggregate character counters."""
        # Occurrences of each character, summed over every file seen so far.
        self.char_count = defaultdict(int)
        # Total number of characters counted over every file seen so far.
        self.total_char_count = 0
        # One {"filename": ..., "IC": ...} dict per file that produced an IC.
        self.ic_results = []
        # Aggregate IC; placeholder until caculate_IC() stores a number here.
        self.ic_total_results = ""

    def caculate_char_count(self, data):
        """Add the character counts of 'data' to the aggregate counters.

        Only the 256 characters chr(0) .. chr(255) are counted.
        Returns 0 when 'data' is empty, otherwise None.
        """
        if not data:
            return 0

        for code in range(256):
            char = chr(code)
            occurrences = data.count(char)
            self.char_count[char] += occurrences
            self.total_char_count += occurrences

        return

    def caculate_IC(self):
        """Calculate the aggregate IC and store it in self.ic_total_results.

        The IC is the sum of n * (n - 1) over every character count n, divided
        by N * (N - 1) where N is the total number of characters counted.
        When fewer than two characters have been counted the denominator is
        zero and 0 is stored instead.
        """
        total = 0
        for val in self.char_count.values():
            # Counts of 0 (and 1) contribute nothing to the sum.
            total += val * (val - 1)

        denominator = self.total_char_count * (self.total_char_count - 1)
        if denominator:
            ic_total = float(total) / denominator
        else:
            # Explicit guard instead of a bare except around the division.
            ic_total = 0
        self.ic_total_results = ic_total
        return

    def caculate(self, data, filename):
        """Calculate the IC of 'data', record it under 'filename' and return it.

        The characters of 'data' are also added to the aggregate counters used
        by caculate_IC(). Returns 0 without recording a per-file result when
        'data' is empty or when fewer than two characters were counted, since
        the IC is undefined (division by zero) in that case.
        """
        if not data:
            return 0

        pair_count = 0
        counted = 0
        for code in range(256):
            occurrences = data.count(chr(code))
            pair_count += occurrences * (occurrences - 1)
            counted += occurrences

        # Fold this file into the aggregate counters.
        self.caculate_char_count(data)

        denominator = counted * (counted - 1)
        if not denominator:
            # A single counted character used to raise ZeroDivisionError here.
            return 0

        ic = float(pair_count) / denominator
        self.ic_results.append({"filename": filename, "IC": ic})
        return ic

    def printer(self):
        """Print the aggregate IC and the 10 files with the lowest IC.

        Sorts self.ic_results in place (ascending IC) and returns a list of
        (filename, rank) tuples where the first printed file gets rank 9, the
        next one 8, and so on.
        """
        self.ic_results.sort(key=lambda item: item["IC"])
        top_ten = self.ic_results[0:10]
        # Calculate the total IC for the search.
        self.caculate_IC()

        print("")
        print("[[ Average IC for Search ]]")
        print(self.ic_total_results)
        print("")
        print("[[ Top 10 IC files ]]")

        ic_list = []
        for position, result in enumerate(top_ten):
            print(' {0:>7.4f} {1}'.format(result["IC"], result["filename"]))
            ic_list.append((result["filename"], 9 - position))
        return ic_list
class Entropy:
    """Compute the entropy of file contents and rank files by it."""

    def __init__(self):
        """Start with an empty list of per-file entropy results."""
        self.entropy_results = []

    def caculate(self, data, filename):
        """Return the entropy of 'data' and record it under 'filename'.

        The entropy is the sum of -p * log2(p) over the relative frequency p
        of each of the characters chr(0) .. chr(255) that occurs in 'data'.
        Empty 'data' yields 0 and records nothing.
        """
        if not data:
            return 0

        size = len(data)
        entropy = 0
        for code in range(256):
            frequency = float(data.count(chr(code))) / size
            if frequency > 0:
                entropy -= frequency * math.log(frequency, 2)

        self.entropy_results.append({"filename": filename, "entropy": entropy})
        return entropy

    def printer(self):
        """Print the 10 files with the highest entropy.

        Sorts self.entropy_results in place (ascending entropy) and returns a
        list of (filename, rank) tuples; the first printed file gets rank 9,
        the next one 8, and so on.
        """
        self.entropy_results.sort(key=lambda result: result["entropy"])
        # Last ten entries of the ascending sort, highest entropy first.
        highest_first = self.entropy_results[-10:][::-1]

        print("")
        print("[[ Top 10 entropic files ]]")

        ranked = []
        for position, result in enumerate(highest_first):
            print(' {0:>7.4f} {1}'.format(result["entropy"], result["filename"]))
            ranked.append((result["filename"], 9 - position))
        return ranked
class LongestWord:
    """Class that determines the length of the longest word in a file."""

    def __init__(self):
        """Instantiate the longestword_results list."""
        # One {"filename": ..., "wordlongest": ...} dict per processed file.
        self.longestword_results = []

    def caculate(self, data, filename):
        """Return the length of the longest word in 'data' and record it.

        Words are the pieces obtained by splitting 'data' on whitespace and
        commas. Empty 'data' yields 0 and records nothing, so the return value
        is always a number, like the other tests' caculate() methods.
        """
        if not data:
            return 0

        # Same character set as the historical "[\s,\n,\r]" class: \n and \r
        # are already covered by \s, and the commas were literal members.
        words = re.split(r"[\s,]", data)
        # re.split always returns at least one (possibly empty) piece.
        longest = max(len(word) for word in words)

        self.longestword_results.append({"filename": filename, "wordlongest": longest})
        return longest

    def printer(self):
        """Print the 10 files containing the longest words.

        Sorts self.longestword_results in place (ascending length) and returns
        a list of (filename, rank) tuples; the first printed file gets rank 9,
        the next one 8, and so on.
        """
        self.longestword_results.sort(key=lambda item: item["wordlongest"])
        top_ten = self.longestword_results[-10:]
        top_ten.reverse()

        print("")
        print("[[ Top 10 longest word files ]]")

        longestword_list = []
        for position, result in enumerate(top_ten):
            print(' {0:>7} {1}'.format(result["wordlongest"], result["filename"]))
            longestword_list.append((result["filename"], 9 - position))
        return longestword_list
class SearchFile:
    """Searches a directory tree for files whose name matches a regular
    expression and reads their contents."""

    def search_file_path(self, args, valid_regex):
        """Generate (data, filename) pairs for the matching files under args[0].

        args        -- sequence whose first item is the directory to walk
        valid_regex -- compiled regular expression searched against each bare
                       file name (not the full path)

        Only files larger than 60 bytes are considered. 'data' is the file
        content read in binary mode, or False when the file could not be
        inspected or read (a message is printed in that case). 'filename' is
        the path joined from the walked directory and the file name.
        """
        for root, dirs, files in os.walk(args[0]):
            for name in files:
                if not valid_regex.search(name):
                    continue

                filename = os.path.join(root, name)
                try:
                    # getsize() is inside the try so that a dangling symlink
                    # or a vanished file does not abort the whole walk.
                    if os.path.getsize(filename) <= 60:
                        continue
                    # The context manager closes the handle promptly instead
                    # of leaking one descriptor per scanned file.
                    with open(filename, 'rb') as handle:
                        data = handle.read()
                except (IOError, OSError, MemoryError):
                    # MemoryError keeps the scan best-effort for files too
                    # large to load in one piece.
                    data = False
                    print("Could not read file :: %s/%s" % (root, name))
                yield data, filename
class PrintRank:
    """Combines the rankings of the individual tests into one overall
    ranking and prints it."""

    def print_rank(self, top_ten, max_score=30):
        """Print the 10 files with the highest combined rank as percentages.

        top_ten   -- iterable of rankings, each an iterable of
                     (filename, rank) pairs such as the lists returned by the
                     tests' printer() methods
        max_score -- combined rank that is displayed as 100%; must be
                     non-zero. The default of 30 is the divisor this method
                     has always used.
                     NOTE(review): the printer() methods visible in this file
                     hand out at most rank 9 per test, so three tests can sum
                     to 27 (90%) at best -- confirm whether 30 is intended.

        Returns None.
        """
        scores = defaultdict(int)
        for ranking in top_ten:
            for filename, rank in ranking:
                scores[str(filename)] += int(rank)

        # Highest combined score first; keep the best ten.
        best = sorted(scores.items(), key=lambda entry: entry[1], reverse=True)[0:10]

        print("[[ Highest Rank Files Based on test results ]]")
        for filename, score in best:
            percent = str(int((float(score) / max_score) * 100)) + "%"
            print(' {0:>7} {1}'.format(percent, filename))

        return
if __name__ == "__main__":
    # Parse all the options.
    parser = OptionParser(usage="usage: %prog [options] <start directory> <OPTIONAL: filename regex>",
                          version="%prog 1.0")
    parser.add_option("-C", "--csv",
                      action="store",
                      dest="is_csv",
                      default=False,
                      help="generate CSV outfile",
                      metavar="FILECSV")
    parser.add_option("-a", "--all",
                      action="store_true",
                      dest="is_all",
                      default=False,
                      # The help used to advertise a "Compression" test that
                      # this script does not run; list what -a really enables.
                      help="Run all tests [IC, Entropy, Longest Word]",)
    parser.add_option("-e", "--entropy",
                      action="store_true",
                      dest="is_entropy",
                      default=False,
                      help="Run entropy Test",)
    parser.add_option("-l", "--longestword",
                      action="store_true",
                      dest="is_longest",
                      default=False,
                      help="Run longest word test",)
    parser.add_option("-c", "--ic",
                      action="store_true",
                      dest="is_ic",
                      default=False,
                      help="Run IC test",)
    parser.add_option("-A", "--auto",
                      action="store_true",
                      dest="is_auto",
                      default=False,
                      help="Run auto file extension tests",)

    (options, args) = parser.parse_args()

    # Error on invalid number of arguments
    if len(args) < 1:
        parser.error("wrong number of arguments")

    # Error on an invalid path
    if not os.path.exists(args[0]):
        parser.error("invalid path")

    # Pick the filename filter: --auto wins over a user supplied regex, and a
    # user regex is only honoured when exactly two arguments were given.
    if options.is_auto:
        # ".tsch" is kept for backward compatibility; ".tcsh" (the usual
        # spelling of that shell's extension) is matched as well.
        valid_regex = re.compile(r'(\.php|\.asp|\.aspx|\.sh|\.bash|\.zsh|\.csh|\.tsch|\.tcsh|\.pl|\.py|\.txt|\.cgi|\.cfm)$')
    elif len(args) == 2:
        try:
            valid_regex = re.compile(args[1])
        except re.error as err:
            # Report a bad pattern as a usage error instead of a traceback.
            parser.error("invalid regex: %s" % err)
    else:
        valid_regex = re.compile(r'.*')

    tests = []
    if options.is_all:
        tests.append(LanguageIC())
        tests.append(Entropy())
        tests.append(LongestWord())
    else:
        if options.is_entropy:
            tests.append(Entropy())

        if options.is_longest:
            tests.append(LongestWord())

        if options.is_ic:
            tests.append(LanguageIC())

    # Instantiate the Generator Class used for searching, opening, and reading files
    locator = SearchFile()

    # CSV file output: one header row (filename plus one column per test,
    # named after the test class) followed by one row per scanned file.
    csv_header = ["filename"] + [test.__class__.__name__ for test in tests]
    csv_array = []

    # Grab the file and calculate each test against file
    for data, filename in locator.search_file_path(args, valid_regex):
        if data:
            # a row array for the CSV
            csv_row = [filename]
            for test in tests:
                csv_row.append(test.caculate(data, filename))
            csv_array.append(csv_row)

    if options.is_csv:
        # Binary mode is what the Python 2 csv module expects; the context
        # manager makes sure the output is flushed and closed.
        with open(options.is_csv, "wb") as csv_file:
            file_output = csv.writer(csv_file)
            file_output.writerow(csv_header)
            file_output.writerows(csv_array)

    top_ten = []
    # For each test print the top ten results for that test.
    for test in tests:
        top_ten.append(test.printer())
    # NOTE(review): the indentation of this blank-line print was lost in the
    # listing this was recovered from; it is assumed to run once, after all
    # tests have printed -- confirm against the original file.
    print("")

    printer = PrintRank()

    printer.print_rank(top_ten)