fix typo
[fixup/fork.git] / neopi.py
blob47e8d3b11960c9dab93dbc99b743450f43ee16a1
1 #!/usr/bin/python
2 # Name: neopi.py
3 # Description: Utility to scan a file path for encrypted and obfuscated files
4 # Authors: Ben Hagen (ben.hagen@neohapsis.com)
5 # Scott Behrens (scott.behrens@neohapsis.com)
7 # Date: 11/4/2010
9 # pep-0008 - Is stupid. TABS FO'EVER!
11 # Try catch regular expressions/bad path/bad filename/bad regex/
13 # Library imports
14 import math
15 import sys
16 import os
17 import re
18 import csv
19 import zlib
20 import time
21 from collections import defaultdict
22 from optparse import OptionParser
25 # Globals
28 # Minimum file size in bytes; only files larger than this are scanned.
29 SMALLEST = 60
class LanguageIC:
    """Compute the Index of Coincidence (IC) per file and for the whole scan.

    For each file the IC is sum(c * (c - 1)) / (n * (n - 1)), where c is the
    number of occurrences of each of the 256 single-character values and n is
    the total number of characters counted. The same formula is applied to the
    counts accumulated over every file to produce a scan-wide value.
    """

    def __init__(self):
        """Initialize the result list and the scan-wide character counters."""
        # Occurrences of each character, summed over every analysed file.
        self.char_count = defaultdict(int)
        # Total number of characters counted over every analysed file.
        self.total_char_count = 0
        # One {"filename": ..., "value": ic} dict per analysed file.
        self.results = []
        # Scan-wide IC; remains "" until calculate_IC() has been called.
        self.ic_total_results = ""

    def calculate_char_count(self, data):
        """Add the character counts of ``data`` to the scan-wide totals.

        Returns 0 when ``data`` is empty, otherwise None.
        """
        if not data:
            return 0
        for x in range(256):
            char = chr(x)
            charcount = data.count(char)
            self.char_count[char] += charcount
            self.total_char_count += charcount
        return

    def calculate_IC(self):
        """Store the scan-wide IC in ``self.ic_total_results``.

        The value is 0 when fewer than two characters have been counted,
        because the formula's denominator would be zero.
        """
        total = sum(val * (val - 1) for val in self.char_count.values())
        denominator = self.total_char_count * (self.total_char_count - 1)
        if denominator:
            ic_total = float(total) / denominator
        else:
            ic_total = 0
        self.ic_total_results = ic_total
        return

    def calculate(self, data, filename):
        """Compute the IC of ``data``, record it and return it.

        Returns 0 without recording anything when ``data`` is empty. When
        fewer than two characters are counted the IC is undefined and is
        recorded as 0.0 instead of raising ZeroDivisionError.
        """
        if not data:
            return 0
        char_count = 0
        total_char_count = 0

        for x in range(256):
            charcount = data.count(chr(x))
            char_count += charcount * (charcount - 1)
            total_char_count += charcount

        denominator = total_char_count * (total_char_count - 1)
        ic = float(char_count) / denominator if denominator else 0.0
        self.results.append({"filename": filename, "value": ic})
        # Fold this file's counts into the scan-wide totals.
        self.calculate_char_count(data)
        return ic

    def sort(self):
        """Order results by ascending IC and attach a "rank" to each entry."""
        self.results.sort(key=lambda item: item["value"])
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the scan-wide IC followed by the first ``count`` results."""
        self.calculate_IC()
        print("\n[[ Average IC for Search ]]")
        print(self.ic_total_results)
        print("\n[[ Top %i lowest IC files ]]" % (count))
        # Never index past the end when fewer than ``count`` files were seen.
        for x in range(min(count, len(self.results))):
            print(' {0:>7.4f} {1}'.format(self.results[x]["value"], self.results[x]["filename"]))
        return
class Entropy:
    """Compute the Shannon entropy (base 2) of a file's contents."""

    def __init__(self):
        """Create the empty result list."""
        # One {"filename": ..., "value": entropy} dict per analysed file.
        self.results = []

    def calculate(self, data, filename):
        """Compute the entropy of ``data``, record it and return it.

        Space characters are removed before counting. Returns 0 without
        recording anything when ``data`` is empty. Data made up solely of
        spaces has nothing left to measure and is recorded with entropy 0
        instead of raising ZeroDivisionError.
        """
        if not data:
            return 0
        entropy = 0
        # Kept as an instance attribute, as before, in case callers read it.
        self.stripped_data = data.replace(' ', '')
        length = len(self.stripped_data)
        if length:
            for x in range(256):
                p_x = float(self.stripped_data.count(chr(x))) / length
                if p_x > 0:
                    entropy += - p_x * math.log(p_x, 2)
        self.results.append({"filename": filename, "value": entropy})
        return entropy

    def sort(self):
        """Order results by descending entropy and attach a "rank" to each."""
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the first ``count`` results (value, then filename)."""
        print("\n[[ Top %i entropic files for a given search ]]" % (count))
        # Never index past the end when fewer than ``count`` files were seen.
        for x in range(min(count, len(self.results))):
            print(' {0:>7.4f} {1}'.format(self.results[x]["value"], self.results[x]["filename"]))
        return
class LongestWord:
    """Find the length of the longest word in a file."""

    # Word separators: any whitespace character or a literal comma (the commas
    # inside the character class are matched literally).
    _SEPARATORS = re.compile(r"[\s,\n,\r]")

    def __init__(self):
        """Create the empty result list."""
        # One {"filename": ..., "value": length} dict per analysed file.
        self.results = []

    def calculate(self, data, filename):
        """Record and return the length of the longest word in ``data``.

        Returns 0 without recording anything when ``data`` is empty, so the
        return type is always an integer.
        """
        if not data:
            return 0
        # re.split always yields at least one element for non-empty input.
        longest = max(len(word) for word in self._SEPARATORS.split(data))
        self.results.append({"filename": filename, "value": longest})
        return longest

    def sort(self):
        """Order results by descending length and attach a "rank" to each."""
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the first ``count`` results (value, then filename)."""
        print("\n[[ Top %i longest word files ]]" % (count))
        # Never index past the end when fewer than ``count`` files were seen.
        for x in range(min(count, len(self.results))):
            print(' {0:>7} {1}'.format(self.results[x]["value"], self.results[x]["filename"]))
        return
class SignatureNasty:
    """Count occurrences of potentially dangerous function names in a file."""

    # Lots taken from the wonderful post at
    # http://stackoverflow.com/questions/3115559/exploitable-php-functions
    # Compiled once for the class; matching is case-insensitive.
    _SIGNATURES = re.compile(
        r'(eval\(|file_put_contents|base64_decode|python_eval|exec\(|passthru|popen|proc_open|pcntl|assert\(|system\(|shell)',
        re.I)

    def __init__(self):
        """Create the empty result list."""
        # One {"filename": ..., "value": match_count} dict per analysed file.
        self.results = []

    def calculate(self, data, filename):
        """Record and return the number of signature matches in ``data``.

        Returns 0 without recording anything when ``data`` is empty, so the
        return type is always an integer.
        """
        if not data:
            return 0
        matches = self._SIGNATURES.findall(data)
        self.results.append({"filename": filename, "value": len(matches)})
        return len(matches)

    def sort(self):
        """Order results by descending match count and attach a "rank"."""
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the first ``count`` results (value, then filename)."""
        print("\n[[ Top %i signature match counts ]]" % (count))
        # Never index past the end when fewer than ``count`` files were seen.
        for x in range(min(count, len(self.results))):
            print(' {0:>7} {1}'.format(self.results[x]["value"], self.results[x]["filename"]))
        return
class SignatureSuperNasty:
    """Count occurrences of SUPER-nasty expressions (These are almost always bad!)."""

    # Compiled once for the class; matching is case-insensitive.
    _SIGNATURES = re.compile(r'(@\$_\[\]=|\$_=@\$_GET|\$_\[\+""\]=)', re.I)

    def __init__(self):
        """Create the empty result list."""
        # One {"filename": ..., "value": match_count} dict per analysed file.
        self.results = []

    def calculate(self, data, filename):
        """Record and return the number of signature matches in ``data``.

        Returns 0 without recording anything when ``data`` is empty, so the
        return type is always an integer.
        """
        if not data:
            return 0
        matches = self._SIGNATURES.findall(data)
        self.results.append({"filename": filename, "value": len(matches)})
        return len(matches)

    def sort(self):
        """Order results by descending match count and attach a "rank"."""
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the first ``count`` results (value, then filename)."""
        print("\n[[ Top %i SUPER-signature match counts (These are usually bad!) ]]" % (count))
        # Never index past the end when fewer than ``count`` files were seen.
        for x in range(min(count, len(self.results))):
            print(' {0:>7} {1}'.format(self.results[x]["value"], self.results[x]["filename"]))
        return
class UsesEval:
    """Count occurrences of eval() being called directly on a variable."""

    # Matches "eval($" followed by one word character, case-insensitively.
    # Compiled once for the class.
    _SIGNATURES = re.compile(r'(eval\(\$(\w|\d))', re.I)

    def __init__(self):
        """Create the empty result list."""
        # One {"filename": ..., "value": match_count} dict per analysed file.
        self.results = []

    def calculate(self, data, filename):
        """Record and return the number of eval-on-variable matches in ``data``.

        Returns 0 without recording anything when ``data`` is empty, so the
        return type is always an integer.
        """
        if not data:
            return 0
        matches = self._SIGNATURES.findall(data)
        self.results.append({"filename": filename, "value": len(matches)})
        return len(matches)

    def sort(self):
        """Order results by descending match count and attach a "rank"."""
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the first ``count`` results (value, then filename)."""
        print("\n[[ Top %i eval match counts ]]" % (count))
        # Never index past the end when fewer than ``count`` files were seen.
        for x in range(min(count, len(self.results))):
            print(' {0:>7} {1}'.format(self.results[x]["value"], self.results[x]["filename"]))
        return
class Compression:
    """Compute the zlib compression ratio (compressed size / original size)."""

    def __init__(self):
        """Create the empty result list."""
        # One {"filename": ..., "value": ratio} dict per analysed file.
        self.results = []

    def calculate(self, data, filename):
        """Record and return the compression ratio of ``data``.

        Returns 0 without recording anything when ``data`` is empty, so the
        return type is always numeric.
        """
        if not data:
            return 0
        compressed = zlib.compress(data)
        ratio = float(len(compressed)) / float(len(data))
        self.results.append({"filename": filename, "value": ratio})
        return ratio

    def sort(self):
        """Order results by descending ratio and attach a "rank" to each."""
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the first ``count`` results (value, then filename)."""
        print("\n[[ Top %i compression match counts ]]" % (count))
        # Never index past the end when fewer than ``count`` files were seen.
        for x in range(min(count, len(self.results))):
            print(' {0:>7.4f} {1}'.format(self.results[x]["value"], self.results[x]["filename"]))
        return
def resultsAddRank(results):
    """Attach a competition-style "rank" to each entry of a sorted result list.

    ``results`` is a list of dicts that each carry a "value" key, already in
    the desired order. Entries with the same value as their predecessor share
    its rank; otherwise the rank is the entry's 1-based position (so ranks go
    1, 2, 2, 4, ...). The dicts are updated in place and returned in a new
    list.

    The previous value is compared from the second entry onwards regardless
    of its truthiness, so a predecessor whose value is 0 no longer causes the
    next distinct value to inherit its rank.
    """
    ranked = []
    rank = 1
    previous_value = None
    for position, entry in enumerate(results, 1):
        if position > 1 and entry["value"] != previous_value:
            rank = position
        entry["rank"] = rank
        ranked.append(entry)
        previous_value = entry["value"]
    return ranked
class SearchFile:
    """Walk a directory tree and yield the contents of matching files."""

    def search_file_path(self, args, valid_regex):
        """Yield ``(data, filename)`` for each matching file under ``args[0]``.

        A file matches when ``valid_regex`` finds a match in its base name and
        its size is greater than the module-level ``SMALLEST``. ``filename`` is
        the directory joined with the base name and ``data`` is the file's raw
        contents. When the file cannot be sized or read, a message is printed
        and ``data`` is False.
        """
        for root, _dirs, files in os.walk(args[0]):
            for name in files:
                if not valid_regex.search(name):
                    continue
                filename = os.path.join(root, name)
                try:
                    # getsize() is inside the try so that an unreadable entry
                    # (e.g. a broken symlink) does not abort the whole scan.
                    if os.path.getsize(filename) <= SMALLEST:
                        continue
                    # The context manager closes the handle after reading.
                    with open(filename, 'rb') as handle:
                        data = handle.read()
                except (IOError, OSError):
                    data = False
                    print("Could not read file :: %s/%s" % (root, name))
                yield data, filename
if __name__ == "__main__":
    # Script entry point: parse the options, scan the tree, run the selected
    # tests on every matching file and print the ranked results.

    timeStart = time.clock()

    # NOTE(review): the banner's internal spacing and its closing quotes look
    # collapsed/missing in this copy of the file -- restore the art from the
    # upstream source if alignment matters. A raw string is used because the
    # art is full of backslashes that are not meant as escape sequences.
    print(r"""
) ( (
( /( )\ ))\ )
)\()) ( (()/(()/(
((_)\ ))\ ( /(_))(_))
_((_)/((_))\(_))(_))
| \| (_)) ((_) _ \_ _|
| .` / -_) _ \ _/| |
|_|\_\___\___/_| |___| Ver. *.USEGIT
""")

    parser = OptionParser(usage="usage: %prog [options] <start directory> <OPTIONAL: filename regex>",
                          version="%prog 1.0")
    parser.add_option("-c", "--csv",
                      action="store",
                      dest="is_csv",
                      default=False,
                      help="generate CSV outfile",
                      metavar="FILECSV")
    parser.add_option("-a", "--all",
                      action="store_true",
                      dest="is_all",
                      default=False,
                      help="Run all (useful) tests [Entropy, Longest Word, IC, Signature]",)
    parser.add_option("-z", "--zlib",
                      action="store_true",
                      dest="is_zlib",
                      default=False,
                      help="Run compression Test",)
    parser.add_option("-e", "--entropy",
                      action="store_true",
                      dest="is_entropy",
                      default=False,
                      help="Run entropy Test",)
    parser.add_option("-E", "--eval",
                      action="store_true",
                      dest="is_eval",
                      default=False,
                      help="Run signature test for the eval",)
    parser.add_option("-l", "--longestword",
                      action="store_true",
                      dest="is_longest",
                      default=False,
                      help="Run longest word test",)
    parser.add_option("-i", "--ic",
                      action="store_true",
                      dest="is_ic",
                      default=False,
                      help="Run IC test",)
    parser.add_option("-s", "--signature",
                      action="store_true",
                      dest="is_signature",
                      default=False,
                      help="Run signature test",)
    parser.add_option("-S", "--supersignature",
                      action="store_true",
                      dest="is_supersignature",
                      default=False,
                      help="Run SUPER-signature test",)
    parser.add_option("-A", "--auto",
                      action="store_true",
                      dest="is_auto",
                      default=False,
                      help="Run auto file extension tests",)
    parser.add_option("-u", "--unicode",
                      action="store_true",
                      dest="ignore_unicode",
                      default=False,
                      help="Skip over unicode-y/UTF'y files",)

    (options, args) = parser.parse_args()

    # Error on invalid number of arguments
    if len(args) < 1:
        parser.print_help()
        print("")
        sys.exit()

    # Error on an invalid path
    if not os.path.exists(args[0]):
        parser.error("Invalid path")

    # A user-supplied filename regex is honoured only when --auto is off.
    if len(args) == 2 and not options.is_auto:
        try:
            valid_regex = re.compile(args[1])
        except (re.error, OverflowError):
            parser.error("Invalid regular expression")
    else:
        valid_regex = re.compile('.*')
    tests = []

    # --auto replaces the filename filter with a fixed list of extensions.
    if options.is_auto:
        valid_regex = re.compile(r'(\.php|\.asp|\.aspx|\.scath|\.bash|\.zsh|\.csh|\.tsch|\.pl|\.py|\.txt|\.cgi|\.cfm|\.htaccess)$')

    if options.is_all:
        tests.append(LanguageIC())
        tests.append(Entropy())
        tests.append(LongestWord())
        tests.append(SignatureNasty())
        tests.append(SignatureSuperNasty())
    else:
        if options.is_entropy:
            tests.append(Entropy())
        if options.is_longest:
            tests.append(LongestWord())
        if options.is_ic:
            tests.append(LanguageIC())
        if options.is_signature:
            tests.append(SignatureNasty())
        if options.is_supersignature:
            tests.append(SignatureSuperNasty())
        if options.is_eval:
            tests.append(UsesEval())
        if options.is_zlib:
            tests.append(Compression())

    # Instantiate the Generator Class used for searching, opening, and reading files
    locator = SearchFile()

    # CSV output: a filename column followed by one column per selected test.
    csv_array = []
    csv_header = ["filename"] + [test.__class__.__name__ for test in tests]

    # Grab the file and calculate each test against file
    fileCount = 0
    fileIgnoreCount = 0
    for data, filename in locator.search_file_path(args, valid_regex):
        # Unreadable files arrive with data == False and are skipped.
        if not data:
            continue

        if options.ignore_unicode:
            asciiHighCount = sum(1 for character in data if ord(character) > 127)
            fileAsciiHighRatio = float(asciiHighCount) / float(len(data))
            # Files with 10% or more high (non-ASCII) bytes are ignored.
            if not fileAsciiHighRatio < .1:
                fileIgnoreCount = fileIgnoreCount + 1
                continue

        # a row array for the CSV
        csv_row = [filename]
        for test in tests:
            csv_row.append(test.calculate(data, filename))
        fileCount = fileCount + 1
        csv_array.append(csv_row)

    if options.is_csv:
        csv_array.insert(0, csv_header)
        # The context manager flushes and closes the output file.
        with open(options.is_csv, "wb") as csv_file:
            csv.writer(csv_file).writerows(csv_array)

    timeFinish = time.clock()

    # Print some stats
    print("\n[[ Total files scanned: %i ]]" % (fileCount))
    print("[[ Total files ignored: %i ]]" % (fileIgnoreCount))
    print("[[ Scan Time: %f seconds ]]" % (timeFinish - timeStart))

    # Print top rank lists and sum each file's rank over all tests
    rank_list = {}
    for test in tests:
        test.sort()
        test.printer(10)
        for entry in test.results:
            rank_list[entry["filename"]] = rank_list.get(entry["filename"], 0) + entry["rank"]

    # Lowest cumulative rank first.
    rank_sorted = sorted(rank_list.items(), key=lambda x: x[1])

    print("\n[[ Top cumulative ranked files ]]")
    for ranked_name, ranked_total in rank_sorted[:10]:
        print(' {0:>7} {1}'.format(ranked_total, ranked_name))