Update neopi.py
[NeoPI/dkf.git] / neopi.py
blobc2cc2510ed65df1c31e5a485e47c852528139beb
1 #!/usr/bin/python
2 # Name: neopi.py
3 # Description: Utility to scan a file path for encrypted and obfuscated files
4 # Authors: Ben Hagen (ben.hagen@neohapsis.com)
5 # Scott Behrens (scott.behrens@neohapsis.com)
7 # Date: 11/4/2010
9 # pep-0008 - Is stupid. TABS FO'EVER! too bad, spaces are back!
14 # Try catch regular expressions/bad path/bad filename/bad regex/
16 # Library imports
17 import math
18 import sys
19 import os
20 import re
21 import csv
22 import zlib
23 import time
24 from collections import defaultdict
25 from optparse import OptionParser
class LanguageIC:
    """Calculate the Index of Coincidence (IC) of single files and of a whole scan.

    For a sequence of N symbols, the IC computed here is the sum of
    c * (c - 1) over every symbol count c, divided by N * (N - 1).
    """

    def __init__(self):
        """Initialize the per-file results list and the scan-wide counters."""
        # Occurrences of each symbol, summed over every file given to calculate().
        self.char_count = defaultdict(int)
        # Total number of symbols seen across those files.
        self.total_char_count = 0
        # One {"filename": ..., "value": ic} dict per file; sort() adds "rank".
        self.results = []
        # Scan-wide IC; set by calculate_IC().
        self.ic_total_results = ""

    @staticmethod
    def _tally(data):
        """Return a mapping of each symbol in data to its number of occurrences."""
        # Iterating works for byte strings and text alike; only the counts
        # matter for the IC, not the type of the keys.
        counts = defaultdict(int)
        for symbol in data:
            counts[symbol] += 1
        return counts

    def _accumulate(self, counts):
        """Fold one file's symbol counts into the scan-wide counters."""
        for symbol, occurrences in counts.items():
            self.char_count[symbol] += occurrences
            self.total_char_count += occurrences

    def calculate_char_count(self, data):
        """Add the symbol counts of data to the scan-wide counters.

        Returns 0 when data is empty, otherwise None.
        """
        if not data:
            return 0
        self._accumulate(self._tally(data))
        return

    def calculate_IC(self):
        """Compute the scan-wide IC and store it in self.ic_total_results.

        The stored value is 0 when fewer than two symbols were counted,
        because the denominator N * (N - 1) would be zero.
        """
        pairs = sum(val * (val - 1) for val in self.char_count.values())
        if self.total_char_count < 2:
            ic_total = 0
        else:
            ic_total = float(pairs) / (self.total_char_count * (self.total_char_count - 1))
        self.ic_total_results = ic_total
        return

    def calculate(self, data, filename):
        """Calculate the IC of data, record it under filename and return it.

        Returns 0 without recording anything when data holds fewer than two
        symbols, since the IC is undefined there (division by zero).
        """
        if not data:
            return 0
        total = len(data)
        if total < 2:
            return 0
        # Count once and reuse the tally for both the per-file IC and the
        # scan-wide counters.
        counts = self._tally(data)
        pairs = sum(val * (val - 1) for val in counts.values())
        ic = float(pairs) / (total * (total - 1))
        self.results.append({"filename": filename, "value": ic})
        self._accumulate(counts)
        return ic

    def sort(self):
        """Order results by ascending IC and attach a rank to each entry."""
        self.results.sort(key=lambda item: item["value"])
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the scan-wide IC followed by the first `count` results."""
        # Calculate the total IC for the search before reporting it.
        self.calculate_IC()
        print("\n[[ Average IC for Search ]]")
        print(self.ic_total_results)
        print("\n[[ Top %i lowest IC files ]]" % (count))
        # Slicing clamps to the number of available results.
        for entry in self.results[:max(count, 0)]:
            print(' {0:>7.4f} {1}'.format(entry["value"], entry["filename"]))
        return
class Entropy:
    """Calculate the Shannon entropy (in bits per symbol) of a file's contents."""

    def __init__(self):
        """Instantiate the results list."""
        # One {"filename": ..., "value": entropy} dict per file; sort() adds "rank".
        self.results = []

    def calculate(self, data, filename):
        """Calculate the entropy of data, record it under filename and return it.

        Space characters are excluded before measuring. Returns 0 without
        recording anything when data is empty. Data made only of spaces is
        recorded with an entropy of 0 instead of dividing by zero.
        """
        if not data:
            return 0
        counts = defaultdict(int)
        for symbol in data:
            counts[symbol] += 1
        # Drop the space symbol: it is ' ' when iterating text and 32 when
        # iterating Python 3 bytes.
        counts.pop(' ', None)
        counts.pop(32, None)
        total = sum(counts.values())
        entropy = 0
        # Visit symbols in ascending order so the floating-point sum is
        # accumulated in a fixed, reproducible order.
        for symbol in sorted(counts):
            p_x = float(counts[symbol]) / total
            entropy += - p_x * math.log(p_x, 2)
        self.results.append({"filename": filename, "value": entropy})
        return entropy

    def sort(self):
        """Order results by descending entropy and attach a rank to each entry."""
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the first `count` results (highest entropy first after sort())."""
        print("\n[[ Top %i entropic files for a given search ]]" % (count))
        # Slicing clamps to the number of available results.
        for entry in self.results[:max(count, 0)]:
            print(' {0:>7.4f} {1}'.format(entry["value"], entry["filename"]))
        return
class LongestWord:
    """Determine the length of the longest word in a file."""

    # Words are delimited by whitespace and by commas. The text and bytes
    # variants allow both kinds of input to be split.
    _TEXT_SEPARATORS = re.compile(r"[\s,]")
    _BYTES_SEPARATORS = re.compile(br"[\s,]")

    def __init__(self):
        """Instantiate the results list."""
        # One {"filename": ..., "value": length} dict per file; sort() adds "rank".
        self.results = []

    def calculate(self, data, filename):
        """Record and return the length of the longest word in data.

        Returns 0 without recording anything when data is empty, so the
        return value is always a number.
        """
        if not data:
            return 0
        if isinstance(data, bytes):
            separators = self._BYTES_SEPARATORS
        else:
            separators = self._TEXT_SEPARATORS
        # split() always yields at least one (possibly empty) piece.
        longest = max(len(word) for word in separators.split(data))
        self.results.append({"filename": filename, "value": longest})
        return longest

    def sort(self):
        """Order results by descending word length and attach a rank to each entry."""
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the first `count` results (longest words first after sort())."""
        print("\n[[ Top %i longest word files ]]" % (count))
        # Slicing clamps to the number of available results.
        for entry in self.results[:max(count, 0)]:
            print(' {0:>7} {1}'.format(entry["value"], entry["filename"]))
        return
class SignatureNasty:
    """Count occurrences of risky function-call signatures in a file."""

    # Lots taken from the wonderful post at
    # http://stackoverflow.com/questions/3115559/exploitable-php-functions
    _PATTERN = r'(eval\(|file_put_contents|base64_decode|python_eval|exec\(|passthru|popen|proc_open|pcntl|assert\(|system\(|shell)'
    # Compiled once at class creation instead of on every calculate() call.
    # The bytes variant lets raw file contents be scanned as well as text.
    _TEXT_REGEX = re.compile(_PATTERN, re.I)
    _BYTES_REGEX = re.compile(_PATTERN.encode("ascii"), re.I)

    def __init__(self):
        """Instantiate the results list."""
        # One {"filename": ..., "value": matches} dict per file; sort() adds "rank".
        self.results = []

    def calculate(self, data, filename):
        """Record and return the number of signature matches found in data.

        Matching is case-insensitive. Returns 0 without recording anything
        when data is empty, so the return value is always a number.
        """
        if not data:
            return 0
        regex = self._BYTES_REGEX if isinstance(data, bytes) else self._TEXT_REGEX
        hits = len(regex.findall(data))
        self.results.append({"filename": filename, "value": hits})
        return hits

    def sort(self):
        """Order results by descending match count and attach a rank to each entry."""
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the first `count` results (most matches first after sort())."""
        print("\n[[ Top %i signature match counts ]]" % (count))
        # Slicing clamps to the number of available results.
        for entry in self.results[:max(count, 0)]:
            print(' {0:>7} {1}'.format(entry["value"], entry["filename"]))
        return
class UsesEval:
    """Count calls to eval() whose argument starts with a `$` variable."""

    # Lots taken from the wonderful post at
    # http://stackoverflow.com/questions/3115559/exploitable-php-functions
    _PATTERN = r'(eval\(\$(\w|\d))'
    # Compiled once at class creation instead of on every calculate() call.
    # The bytes variant lets raw file contents be scanned as well as text.
    _TEXT_REGEX = re.compile(_PATTERN, re.I)
    _BYTES_REGEX = re.compile(_PATTERN.encode("ascii"), re.I)

    def __init__(self):
        """Instantiate the results list."""
        # One {"filename": ..., "value": matches} dict per file; sort() adds "rank".
        self.results = []

    def calculate(self, data, filename):
        """Record and return the number of `eval($...` matches found in data.

        Matching is case-insensitive. Returns 0 without recording anything
        when data is empty, so the return value is always a number.
        """
        if not data:
            return 0
        regex = self._BYTES_REGEX if isinstance(data, bytes) else self._TEXT_REGEX
        hits = len(regex.findall(data))
        self.results.append({"filename": filename, "value": hits})
        return hits

    def sort(self):
        """Order results by descending match count and attach a rank to each entry."""
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the first `count` results (most eval matches first after sort())."""
        print("\n[[ Top %i eval match counts ]]" % (count))
        # Slicing clamps to the number of available results.
        for entry in self.results[:max(count, 0)]:
            print(' {0:>7} {1}'.format(entry["value"], entry["filename"]))
        return
class Compression:
    """Calculate the zlib compression ratio of a file's contents."""

    def __init__(self):
        """Instantiate the results list."""
        # One {"filename": ..., "value": ratio} dict per file; sort() adds "rank".
        self.results = []

    def calculate(self, data, filename):
        """Record and return len(zlib.compress(data)) / len(data).

        Returns 0 without recording anything when data is empty, so the
        return value is always a number.
        """
        if not data:
            return 0
        compressed = zlib.compress(data)
        ratio = float(len(compressed)) / float(len(data))
        self.results.append({"filename": filename, "value": ratio})
        return ratio

    def sort(self):
        """Order results by descending ratio and attach a rank to each entry."""
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the first `count` results (highest ratio first after sort())."""
        print("\n[[ Top %i compression match counts ]]" % (count))
        # Slicing clamps to the number of available results.
        for entry in self.results[:max(count, 0)]:
            print(' {0:>7.4f} {1}'.format(entry["value"], entry["filename"]))
        return
def resultsAddRank(results):
    """Attach a competition-style "rank" to each entry of a sorted result list.

    `results` is a list of dicts that each have a "value" key, already in
    the desired order. Consecutive entries with equal values share a rank,
    and the next distinct value takes its 1-based position as its rank
    (for example 1, 1, 3). The dicts are updated in place and returned in
    a new list in the same order.
    """
    ranked = []
    rank = 1
    previous_value = None
    for position, entry in enumerate(results, 1):
        # Compare by position rather than truthiness, so a previous value of
        # 0 (or 0.0) still lets the rank advance.
        if position > 1 and entry["value"] != previous_value:
            rank = position
        entry["rank"] = rank
        ranked.append(entry)
        previous_value = entry["value"]
    return ranked
class SearchFile:
    """Walk a directory tree and yield the contents of matching files."""

    # Files of this size (in bytes) or smaller are skipped.
    MIN_FILE_SIZE = 60

    def search_file_path(self, args, valid_regex):
        """Yield (data, filename) for each matching file under args[0].

        A file matches when valid_regex.search() succeeds on its base name
        and it is larger than MIN_FILE_SIZE bytes. `data` is the raw file
        contents read in binary mode, or False when the file could not be
        sized or read; a message is printed in that case. `filename` is the
        path joined from the walked directory and the base name.
        """
        for root, dirs, files in os.walk(args[0]):
            for name in files:
                if not valid_regex.search(name):
                    continue
                filename = os.path.join(root, name)
                try:
                    # getsize() is inside the try so a vanished file or a
                    # broken symlink does not abort the whole walk.
                    if os.path.getsize(filename) <= self.MIN_FILE_SIZE:
                        continue
                    with open(filename, 'rb') as handle:
                        data = handle.read()
                except (IOError, OSError):
                    data = False
                    print("Could not read file :: %s/%s" % (root, name))
                yield data, filename
if __name__ == "__main__":
    # Script entry point: parse the options, scan the start directory with the
    # selected tests, optionally write a CSV report, then print per-test and
    # cumulative rankings.

    # time.clock() no longer exists on Python 3.8+; use perf_counter() where
    # it is available and fall back to clock() on older interpreters.
    timer = getattr(time, "perf_counter", None) or time.clock
    timeStart = timer()

    print(r"""
) ( (
( /( )\ ))\ )
)\()) ( (()/(()/(
((_)\ ))\ ( /(_))(_))
_((_)/((_))\(_))(_))
| \| (_)) ((_) _ \_ _|
| .` / -_) _ \ _/| |
|_|\_\___\___/_| |___| Ver. *.USEGIT
""")

    parser = OptionParser(usage="usage: %prog [options] <start directory> <OPTIONAL: filename regex>",
                          version="%prog 1.0")
    parser.add_option("-c", "--csv",
                      action="store",
                      dest="is_csv",
                      default=False,
                      help="generate CSV outfile",
                      metavar="FILECSV")
    parser.add_option("-a", "--all",
                      action="store_true",
                      dest="is_all",
                      default=False,
                      help="Run all (useful) tests [Entropy, Longest Word, IC, Signature]",)
    parser.add_option("-z", "--zlib",
                      action="store_true",
                      dest="is_zlib",
                      default=False,
                      help="Run compression Test",)
    parser.add_option("-e", "--entropy",
                      action="store_true",
                      dest="is_entropy",
                      default=False,
                      help="Run entropy Test",)
    parser.add_option("-E", "--eval",
                      action="store_true",
                      dest="is_eval",
                      default=False,
                      help="Run signiture test for the eval",)
    parser.add_option("-l", "--longestword",
                      action="store_true",
                      dest="is_longest",
                      default=False,
                      help="Run longest word test",)
    parser.add_option("-i", "--ic",
                      action="store_true",
                      dest="is_ic",
                      default=False,
                      help="Run IC test",)
    parser.add_option("-s", "--signature",
                      action="store_true",
                      dest="is_signature",
                      default=False,
                      help="Run signature test",)
    parser.add_option("-A", "--auto",
                      action="store_true",
                      dest="is_auto",
                      default=False,
                      help="Run auto file extension tests",)
    parser.add_option("-u", "--unicode",
                      action="store_true",
                      dest="ignore_unicode",
                      default=False,
                      help="Skip over unicode-y/UTF'y files",)

    (options, args) = parser.parse_args()

    # Error on invalid number of arguments
    if len(args) < 1:
        parser.print_help()
        print("")
        sys.exit()

    # Error on an invalid path
    if not os.path.exists(args[0]):
        parser.error("Invalid path")

    # Choose the filename filter: --auto wins over a user-supplied regex, and
    # without either every filename matches.
    if options.is_auto:
        valid_regex = re.compile(r'(\.php|\.asp|\.aspx|\.scath|\.bash|\.zsh|\.csh|\.tsch|\.pl|\.py|\.txt|\.cgi|\.cfm|\.htaccess)$')
    elif len(args) == 2:
        try:
            valid_regex = re.compile(args[1])
        except re.error:
            # parser.error() prints the message and exits.
            parser.error("Invalid regular expression")
    else:
        valid_regex = re.compile('.*')

    tests = []
    if options.is_all:
        tests.append(LanguageIC())
        tests.append(Entropy())
        tests.append(LongestWord())
        tests.append(SignatureNasty())
    else:
        if options.is_entropy:
            tests.append(Entropy())
        if options.is_longest:
            tests.append(LongestWord())
        if options.is_ic:
            tests.append(LanguageIC())
        if options.is_signature:
            tests.append(SignatureNasty())
        if options.is_eval:
            tests.append(UsesEval())
        if options.is_zlib:
            tests.append(Compression())

    # Instantiate the generator class used for searching, opening, and reading files
    locator = SearchFile()

    # CSV output: one header row (filename plus one column per test) and one
    # row per scanned file.
    csv_array = []
    csv_header = ["filename"] + [test.__class__.__name__ for test in tests]

    # Grab each file and run every selected test against it
    fileCount = 0
    fileIgnoreCount = 0
    for data, filename in locator.search_file_path(args, valid_regex):
        if not data:
            continue

        if options.ignore_unicode:
            # bytearray() yields integers on both Python 2 and Python 3.
            asciiHighCount = sum(1 for byte in bytearray(data) if byte > 127)
            fileAsciiHighRatio = float(asciiHighCount) / float(len(data))
            # Skip files where 10% or more of the bytes are above 127.
            if not fileAsciiHighRatio < .1:
                fileIgnoreCount = fileIgnoreCount + 1
                continue

        # a row array for the CSV
        csv_row = [filename]
        for test in tests:
            csv_row.append(test.calculate(data, filename))
        csv_array.append(csv_row)
        fileCount = fileCount + 1

    if options.is_csv:
        csv_array.insert(0, csv_header)
        # The csv module wants a binary file on Python 2 and a text file
        # opened with newline="" on Python 3.
        if sys.version_info[0] >= 3:
            csv_file = open(options.is_csv, "w", newline="")
        else:
            csv_file = open(options.is_csv, "wb")
        try:
            csv.writer(csv_file).writerows(csv_array)
        finally:
            # Close explicitly so the report is flushed to disk.
            csv_file.close()

    timeFinish = timer()

    # Print some stats
    print("\n[[ Total files scanned: %i ]]" % (fileCount))
    print("[[ Total files ignored: %i ]]" % (fileIgnoreCount))
    print("[[ Scan Time: %f seconds ]]" % (timeFinish - timeStart))

    # Print top rank lists and sum each file's rank across all tests
    rank_list = {}
    for test in tests:
        test.sort()
        test.printer(10)
        for entry in test.results:
            rank_list[entry["filename"]] = rank_list.get(entry["filename"], 0) + entry["rank"]

    # Lowest cumulative rank first.
    rank_sorted = sorted(rank_list.items(), key=lambda x: x[1])

    print("\n[[ Top cumulative ranked files ]]")
    for ranked_name, ranked_total in rank_sorted[:10]:
        print(' {0:>7} {1}'.format(ranked_total, ranked_name))