Added super-signature search
[fixup.git] / neopi.py
blobbb458d4344a02905fcc431758bee42967e26a7e6
1 #!/usr/bin/python
2 # Name: neopi.py
3 # Description: Utility to scan a file path for encrypted and obfuscated files
4 # Authors: Ben Hagen (ben.hagen@neohapsis.com)
5 # Scott Behrens (scott.behrens@neohapsis.com)
7 # Date: 11/4/2010
9 # pep-0008 - Is stupid. TABS FO'EVER!
11 # Try catch regular expressions/bad path/bad filename/bad regex/
13 # Library imports
14 import math
15 import sys
16 import os
17 import re
18 import csv
19 import zlib
20 import time
21 from collections import defaultdict
22 from optparse import OptionParser
class LanguageIC:
    """Calculate the Index of Coincidence (IC) of files.

    Tracks one IC value per file in ``self.results`` and accumulates
    character counts across every file so that a single IC for the whole
    search can be computed with ``calculate_IC``.
    """

    def __init__(self):
        """Initialize results arrays as well as character counters."""
        self.char_count = defaultdict(int)   # per-character totals across all files
        self.total_char_count = 0            # number of characters counted so far
        self.results = []                    # [{"filename": ..., "value": ic}, ...]
        self.ic_total_results = ""           # search-wide IC, set by calculate_IC()

    def calculate_char_count(self, data):
        """Add the character counts of ``data`` to the search-wide totals.

        Only the 256 characters ``chr(0)`` .. ``chr(255)`` are counted.
        Returns 0 for empty data and None otherwise.
        """
        if not data:
            return 0
        for code in range(256):
            char = chr(code)
            occurrences = data.count(char)
            self.char_count[char] += occurrences
            self.total_char_count += occurrences
        return

    def calculate_IC(self):
        """Compute the IC over all counted characters into ``self.ic_total_results``."""
        # Characters that never occurred contribute 0 * -1 == 0, so no
        # explicit skip is needed.
        total = sum(val * (val - 1) for val in self.char_count.values())
        try:
            ic_total = float(total) / (self.total_char_count * (self.total_char_count - 1))
        except ZeroDivisionError:
            # Fewer than two characters were counted: the IC is undefined.
            ic_total = 0
        self.ic_total_results = ic_total
        return

    def calculate(self, data, filename):
        """Calculate the IC for one file and append it to ``self.results``.

        Returns the IC as a float, or 0 for empty data (nothing is recorded
        in that case). Data with fewer than two countable characters has no
        defined IC; it is recorded with a value of 0.0 instead of raising
        ZeroDivisionError.
        """
        if not data:
            return 0
        counts = [data.count(chr(code)) for code in range(256)]
        total_char_count = sum(counts)
        coincidences = sum(count * (count - 1) for count in counts)

        denominator = total_char_count * (total_char_count - 1)
        ic = float(coincidences) / denominator if denominator else 0.0
        self.results.append({"filename": filename, "value": ic})
        # Fold this file's characters into the search-wide totals
        self.calculate_char_count(data)
        return ic

    def sort(self):
        """Order results by ascending IC and attach a "rank" to each entry."""
        self.results.sort(key=lambda item: item["value"])
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the search-wide IC and the ``count`` lowest-IC files."""
        # Calculate the Total IC for a Search
        self.calculate_IC()
        print("\n[[ Average IC for Search ]]")
        print(self.ic_total_results)
        print("\n[[ Top %i lowest IC files ]]" % (count))
        for entry in self.results[:max(count, 0)]:
            print(' {0:>7.4f} {1}'.format(entry["value"], entry["filename"]))
        return
class Entropy:
    """Class that calculates a file's Entropy."""

    def __init__(self):
        """Instantiate the results array."""
        self.results = []

    def calculate(self, data, filename):
        """Calculate the Shannon entropy (bits per character) of ``data``.

        Space characters are removed before counting. The result is appended
        to ``self.results`` and returned. Empty data returns 0 without
        recording anything. Data made up only of spaces has nothing left to
        measure; it is recorded with an entropy of 0.0 instead of raising
        ZeroDivisionError.
        """
        if not data:
            return 0
        entropy = 0.0
        # Kept on the instance because the original implementation exposed it.
        self.stripped_data = data.replace(' ', '')
        length = len(self.stripped_data)
        if length:
            for code in range(256):
                p_x = float(self.stripped_data.count(chr(code))) / length
                if p_x > 0:
                    entropy += - p_x * math.log(p_x, 2)
        self.results.append({"filename": filename, "value": entropy})
        return entropy

    def sort(self):
        """Order results by descending entropy and attach a "rank" to each entry."""
        # sort() followed by reverse() (rather than reverse=True) keeps the
        # original ordering of entries that share the same value.
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the ``count`` files with the highest entropy."""
        print("\n[[ Top %i entropic files for a given search ]]" % (count))
        for entry in self.results[:max(count, 0)]:
            print(' {0:>7.4f} {1}'.format(entry["value"], entry["filename"]))
        return
class LongestWord:
    """Class that determines the longest word for a particular file."""

    # Word separators: any whitespace, and (because the character class
    # lists it literally) the comma.
    _separators = re.compile(r"[\s,\n,\r]")

    def __init__(self):
        """Instantiate the results array."""
        self.results = []

    def calculate(self, data, filename):
        """Find the length of the longest word in ``data``.

        The length is appended to ``self.results`` and returned. Empty data
        returns 0 (an int, like every other path) and records nothing.
        """
        if not data:
            return 0
        # split() always yields at least one (possibly empty) piece, so max()
        # never sees an empty sequence.
        longest = max(len(word) for word in self._separators.split(data))
        self.results.append({"filename": filename, "value": longest})
        return longest

    def sort(self):
        """Order results by descending word length and attach a "rank" to each entry."""
        # sort() followed by reverse() (rather than reverse=True) keeps the
        # original ordering of entries that share the same value.
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the ``count`` files containing the longest words."""
        print("\n[[ Top %i longest word files ]]" % (count))
        for entry in self.results[:max(count, 0)]:
            print(' {0:>7} {1}'.format(entry["value"], entry["filename"]))
        return
class SignatureNasty:
    """Searches a given file for nasty expressions"""

    # Lots taken from the wonderful post at http://stackoverflow.com/questions/3115559/exploitable-php-functions
    # Compiled once for the class instead of once per scanned file.
    _signature = re.compile(r'(eval\(|file_put_contents|base64_decode|python_eval|exec\(|passthru|popen|proc_open|pcntl|assert\(|system\(|shell)', re.I)

    def __init__(self):
        """Instantiate the results array."""
        self.results = []

    def calculate(self, data, filename):
        """Count case-insensitive signature matches in ``data``.

        The count is appended to ``self.results`` and returned. Empty data
        returns 0 (an int, like every other path) and records nothing.
        """
        if not data:
            return 0
        hits = len(self._signature.findall(data))
        self.results.append({"filename": filename, "value": hits})
        return hits

    def sort(self):
        """Order results by descending match count and attach a "rank" to each entry."""
        # sort() followed by reverse() (rather than reverse=True) keeps the
        # original ordering of entries that share the same value.
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the top signature count match files for a given search"""
        print("\n[[ Top %i signature match counts ]]" % (count))
        for entry in self.results[:max(count, 0)]:
            print(' {0:>7} {1}'.format(entry["value"], entry["filename"]))
        return
class SignatureSuperNasty:
    """Searches a given file for SUPER-nasty expressions (These are almost always bad!)"""

    # Compiled once for the class instead of once per scanned file.
    _signature = re.compile(r'(@\$_\[\]=|\$_=@\$_GET|\$_\[\+""\]=)', re.I)

    def __init__(self):
        """Instantiate the results array."""
        self.results = []

    def calculate(self, data, filename):
        """Count case-insensitive SUPER-signature matches in ``data``.

        The count is appended to ``self.results`` and returned. Empty data
        returns 0 (an int, like every other path) and records nothing.
        """
        if not data:
            return 0
        hits = len(self._signature.findall(data))
        self.results.append({"filename": filename, "value": hits})
        return hits

    def sort(self):
        """Order results by descending match count and attach a "rank" to each entry."""
        # sort() followed by reverse() (rather than reverse=True) keeps the
        # original ordering of entries that share the same value.
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the top signature count match files for a given search"""
        print("\n[[ Top %i SUPER-signature match counts (These are usually bad!) ]]" % (count))
        for entry in self.results[:max(count, 0)]:
            print(' {0:>7} {1}'.format(entry["value"], entry["filename"]))
        return
class UsesEval:
    """Searches a given file for nasty eval with variable"""

    # Lots taken from the wonderful post at http://stackoverflow.com/questions/3115559/exploitable-php-functions
    # Matches "eval($" followed by one word character; compiled once for the
    # class instead of once per scanned file.
    _signature = re.compile(r'(eval\(\$(\w|\d))', re.I)

    def __init__(self):
        """Instantiate the results array."""
        self.results = []

    def calculate(self, data, filename):
        """Count case-insensitive ``eval($variable`` occurrences in ``data``.

        The count is appended to ``self.results`` and returned. Empty data
        returns 0 (an int, like every other path) and records nothing.
        """
        if not data:
            return 0
        # The pattern has two groups, so findall() yields one tuple per
        # match; only the number of matches matters.
        hits = len(self._signature.findall(data))
        self.results.append({"filename": filename, "value": hits})
        return hits

    def sort(self):
        """Order results by descending match count and attach a "rank" to each entry."""
        # sort() followed by reverse() (rather than reverse=True) keeps the
        # original ordering of entries that share the same value.
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the files that use eval"""
        print("\n[[ Top %i eval match counts ]]" % (count))
        for entry in self.results[:max(count, 0)]:
            print(' {0:>7} {1}'.format(entry["value"], entry["filename"]))
        return
class Compression:
    """Finds the zlib compression ratio of a file"""

    def __init__(self):
        """Instantiate the results array."""
        self.results = []

    def calculate(self, data, filename):
        """Compute compressed size divided by original size for ``data``.

        The ratio is appended to ``self.results`` and returned. Empty data
        returns 0 (a number, like every other path) and records nothing.
        """
        if not data:
            return 0
        compressed_size = len(zlib.compress(data))
        ratio = float(compressed_size) / float(len(data))
        self.results.append({"filename": filename, "value": ratio})
        return ratio

    def sort(self):
        """Order results by descending ratio and attach a "rank" to each entry."""
        # sort() followed by reverse() (rather than reverse=True) keeps the
        # original ordering of entries that share the same value.
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the top files for a given search"""
        print("\n[[ Top %i compression match counts ]]" % (count))
        for entry in self.results[:max(count, 0)]:
            print(' {0:>7.4f} {1}'.format(entry["value"], entry["filename"]))
        return
def resultsAddRank(results):
    """Attach a competition-style "rank" to each entry of a sorted result list.

    ``results`` is a list of dicts with a "value" key, already sorted.
    Entries with equal neighbouring values share a rank, and the next
    distinct value takes its 1-based position as rank (e.g. 1, 2, 2, 4).
    Each dict is updated in place; a new list holding the same dicts in the
    same order is returned.

    The previous value is compared by position rather than by truthiness, so
    a leading value of 0 no longer prevents the rank from advancing.
    """
    ranked = []
    rank = 1
    previous_value = None
    for position, entry in enumerate(results, 1):
        if position > 1 and entry["value"] != previous_value:
            rank = position
        entry["rank"] = rank
        ranked.append(entry)
        previous_value = entry["value"]
    return ranked
class SearchFile:
    """Generator that searches a given filepath with an optional regular
    expression and returns the filepath and filename"""

    def search_file_path(self, args, valid_regex):
        """Walk ``args[0]`` and yield ``(data, filename)`` for matching files.

        A file is considered when ``valid_regex`` matches its base name and
        it is larger than 60 bytes. ``data`` is the raw file content, or
        False when the file could not be inspected or read (a message is
        printed in that case). ``filename`` is the joined path.
        """
        for root, dirs, files in os.walk(args[0]):
            for name in files:
                if not valid_regex.search(name):
                    continue
                filename = os.path.join(root, name)
                try:
                    # getsize() is inside the try so that a broken symlink
                    # or vanished file is reported instead of aborting the scan.
                    if os.path.getsize(filename) <= 60:
                        continue
                    with open(filename, 'rb') as handle:
                        data = handle.read()
                except (IOError, OSError, MemoryError):
                    data = False
                    print("Could not read file :: %s/%s" % (root, name))
                yield data, filename
if __name__ == "__main__":
    # Entry point: parse the options, run the selected tests against every
    # matching file, optionally write a CSV, then print per-test and
    # cumulative rankings.

    # Wall-clock start time; time.clock() measured CPU time on Unix and no
    # longer exists in recent Python versions.
    timeStart = time.time()

    # NOTE(review): the banner below is reproduced as it appears in the
    # reviewed source, where its spacing looks collapsed -- confirm against
    # the upstream file. A raw string keeps the backslashes literal.
    print(r"""
) ( (
( /( )\ ))\ )
)\()) ( (()/(()/(
((_)\ ))\ ( /(_))(_))
_((_)/((_))\(_))(_))
| \| (_)) ((_) _ \_ _|
| .` / -_) _ \ _/| |
|_|\_\___\___/_| |___| Ver. *.USEGIT
""")

    parser = OptionParser(usage="usage: %prog [options] <start directory> <OPTIONAL: filename regex>",
                          version="%prog 1.0")
    parser.add_option("-c", "--csv",
                      action="store",
                      dest="is_csv",
                      default=False,
                      help="generate CSV outfile",
                      metavar="FILECSV")
    parser.add_option("-a", "--all",
                      action="store_true",
                      dest="is_all",
                      default=False,
                      help="Run all (useful) tests [Entropy, Longest Word, IC, Signature]",)
    parser.add_option("-z", "--zlib",
                      action="store_true",
                      dest="is_zlib",
                      default=False,
                      help="Run compression Test",)
    parser.add_option("-e", "--entropy",
                      action="store_true",
                      dest="is_entropy",
                      default=False,
                      help="Run entropy Test",)
    parser.add_option("-E", "--eval",
                      action="store_true",
                      dest="is_eval",
                      default=False,
                      help="Run signiture test for the eval",)
    parser.add_option("-l", "--longestword",
                      action="store_true",
                      dest="is_longest",
                      default=False,
                      help="Run longest word test",)
    parser.add_option("-i", "--ic",
                      action="store_true",
                      dest="is_ic",
                      default=False,
                      help="Run IC test",)
    parser.add_option("-s", "--signature",
                      action="store_true",
                      dest="is_signature",
                      default=False,
                      help="Run signature test",)
    parser.add_option("-S", "--supersignature",
                      action="store_true",
                      dest="is_supersignature",
                      default=False,
                      help="Run SUPER-signature test",)
    parser.add_option("-A", "--auto",
                      action="store_true",
                      dest="is_auto",
                      default=False,
                      help="Run auto file extension tests",)
    parser.add_option("-u", "--unicode",
                      action="store_true",
                      dest="ignore_unicode",
                      default=False,
                      help="Skip over unicode-y/UTF'y files",)

    (options, args) = parser.parse_args()

    # Error on invalid number of arguments
    if len(args) < 1:
        parser.print_help()
        print("")
        sys.exit()

    # Error on an invalid path
    if not os.path.exists(args[0]):
        parser.error("Invalid path")

    # Pick the filename filter: --auto wins, then an explicit regex given as
    # the second positional argument, otherwise match everything.
    if options.is_auto:
        valid_regex = re.compile(r'(\.php|\.asp|\.aspx|\.scath|\.bash|\.zsh|\.csh|\.tsch|\.pl|\.py|\.txt|\.cgi|\.cfm|\.htaccess)$')
    elif len(args) == 2:
        try:
            valid_regex = re.compile(args[1])
        except re.error:
            parser.error("Invalid regular expression")
    else:
        valid_regex = re.compile('.*')

    tests = []
    if options.is_all:
        tests.append(LanguageIC())
        tests.append(Entropy())
        tests.append(LongestWord())
        tests.append(SignatureNasty())
        tests.append(SignatureSuperNasty())
    else:
        if options.is_entropy:
            tests.append(Entropy())
        if options.is_longest:
            tests.append(LongestWord())
        if options.is_ic:
            tests.append(LanguageIC())
        if options.is_signature:
            tests.append(SignatureNasty())
        if options.is_supersignature:
            tests.append(SignatureSuperNasty())
        if options.is_eval:
            tests.append(UsesEval())
        if options.is_zlib:
            tests.append(Compression())

    # Instantiate the Generator Class used for searching, opening, and reading files
    locator = SearchFile()

    # CSV file output array; the header is the filename column followed by
    # one column per selected test, in test order.
    csv_array = []
    csv_header = ["filename"] + [test.__class__.__name__ for test in tests]

    # Grab the file and calculate each test against file
    fileCount = 0
    fileIgnoreCount = 0
    for data, filename in locator.search_file_path(args, valid_regex):
        if not data:
            # Unreadable or empty file: nothing to test.
            continue

        if options.ignore_unicode:
            # Skip files in which 10% or more of the bytes are above 127.
            asciiHighCount = sum(1 for character in data if ord(character) > 127)
            fileAsciiHighRatio = float(asciiHighCount) / float(len(data))
            if not fileAsciiHighRatio < .1:
                fileIgnoreCount = fileIgnoreCount + 1
                continue

        # a row array for the CSV
        csv_row = [filename]
        for test in tests:
            csv_row.append(test.calculate(data, filename))
        fileCount = fileCount + 1
        csv_array.append(csv_row)

    if options.is_csv:
        csv_array.insert(0, csv_header)
        # "wb" is the mode the Python 2 csv module expects; the with-block
        # makes sure the file is flushed and closed.
        with open(options.is_csv, "wb") as csvFile:
            fileOutput = csv.writer(csvFile)
            fileOutput.writerows(csv_array)

    timeFinish = time.time()

    # Print some stats
    print("\n[[ Total files scanned: %i ]]" % (fileCount))
    print("[[ Total files ignored: %i ]]" % (fileIgnoreCount))
    print("[[ Scan Time: %f seconds ]]" % (timeFinish - timeStart))

    # Print top rank lists, summing each file's rank across all tests
    rank_list = {}
    for test in tests:
        test.sort()
        test.printer(10)
        for entry in test.results:
            rank_list[entry["filename"]] = rank_list.get(entry["filename"], 0) + entry["rank"]

    # Lowest cumulative rank first
    rank_sorted = sorted(rank_list.items(), key=lambda x: x[1])

    print("\n[[ Top cumulative ranked files ]]")
    for ranked_name, ranked_total in rank_sorted[:10]:
        print(' {0:>7} {1}'.format(ranked_total, ranked_name))