Update neopi.py
[fixup.git] / neopi.py
blob6efc02742c62e00c4dd6afeafba30d0015ee7353
1 #!/usr/bin/python
2 # Name: neopi.py
3 # Description: Utility to scan a file path for encrypted and obfuscated files
4 # Authors: Ben Hagen (ben.hagen@neohapsis.com)
5 # Scott Behrens (scott.behrens@neohapsis.com)
7 # Date: 11/4/2010
9 # pep-0008 - Is stupid. TABS FO'EVER!
11 # Try catch regular expressions/bad path/bad filename/bad regex/
13 # Library imports
14 import math
15 import sys
16 import os
17 import re
18 import csv
19 import zlib
20 import time
21 from collections import defaultdict
22 from optparse import OptionParser
class LanguageIC:
    """Calculate the Index of Coincidence (IC) of individual files as well
    as the aggregate IC across every file scored so far.

    For per-character counts n_c and a total of N counted characters the
    IC is sum(n_c * (n_c - 1)) / (N * (N - 1)).
    """

    def __init__(self):
        """Initialize the results list as well as the character counters."""
        # Aggregate per-character counts over every file that was counted.
        self.char_count = defaultdict(int)
        self.total_char_count = 0
        # One {"filename": ..., "value": ic} dict per scored file.
        self.results = []
        # Aggregate IC; remains "" until calculate_IC() has been called.
        self.ic_total_results = ""

    @staticmethod
    def _count_chars(data):
        """Return a 256-item list holding data.count(chr(x)) for x in 0..255."""
        return [data.count(chr(x)) for x in range(256)]

    @staticmethod
    def _index_of_coincidence(counts, total):
        """Return the IC for the given counts.

        Returns 0 when fewer than two characters were counted, because the
        denominator total * (total - 1) would be zero.
        """
        pairs = total * (total - 1)
        if pairs == 0:
            return 0
        return float(sum(n * (n - 1) for n in counts)) / pairs

    def _accumulate(self, counts):
        """Add one file's 256 character counts to the aggregate counters."""
        for x, charcount in enumerate(counts):
            self.char_count[chr(x)] += charcount
            self.total_char_count += charcount

    def calculate_char_count(self, data):
        """Add the character counts of data to the aggregate counters.

        Returns 0 when data is empty, otherwise None.
        """
        if not data:
            return 0
        self._accumulate(self._count_chars(data))
        return

    def calculate_IC(self):
        """Compute the aggregate IC and store it in self.ic_total_results."""
        self.ic_total_results = self._index_of_coincidence(
            self.char_count.values(), self.total_char_count)
        return

    def calculate(self, data, filename):
        """Calculate the IC of data, record it in self.results and return it.

        Empty data returns 0 and records nothing. Data in which fewer than
        two characters are counted is recorded with an IC of 0 instead of
        raising ZeroDivisionError, matching the fallback of calculate_IC().
        """
        if not data:
            return 0
        # Count once and reuse the counts for both the per-file IC and the
        # aggregate counters.
        counts = self._count_chars(data)
        ic = self._index_of_coincidence(counts, sum(counts))
        self.results.append({"filename": filename, "value": ic})
        self._accumulate(counts)
        return ic

    def sort(self):
        """Sort the results by ascending IC and attach a rank to each entry."""
        self.results.sort(key=lambda item: item["value"])
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the aggregate IC followed by the first count results.

        Call sort() beforehand so the entries appear lowest IC first.
        """
        # Calculate the total IC for the search
        self.calculate_IC()
        print("\n[[ Average IC for Search ]]")
        print(self.ic_total_results)
        print("\n[[ Top %i lowest IC files ]]" % (count))
        if count > len(self.results):
            count = len(self.results)
        for x in range(count):
            print(' {0:>7.4f} {1}'.format(self.results[x]["value"], self.results[x]["filename"]))
        return
class Entropy:
    """Calculate the Shannon entropy (in bits per character) of a file."""

    def __init__(self):
        """Instantiate the results list."""
        # One {"filename": ..., "value": entropy} dict per scored file.
        self.results = []

    def calculate(self, data, filename):
        """Calculate the entropy of data, record it in self.results and return it.

        Space characters are removed before counting. Empty data returns 0
        and records nothing. Data consisting only of spaces is recorded
        with an entropy of 0 instead of raising ZeroDivisionError.
        """
        if not data:
            return 0
        entropy = 0
        self.stripped_data = data.replace(' ', '')
        length = len(self.stripped_data)
        # Nothing is left to measure when the data was made up of spaces only.
        if length:
            for x in range(256):
                p_x = float(self.stripped_data.count(chr(x))) / length
                if p_x > 0:
                    entropy += - p_x * math.log(p_x, 2)
        self.results.append({"filename": filename, "value": entropy})
        return entropy

    def sort(self):
        """Sort the results by descending entropy and attach a rank to each entry."""
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the first count results; call sort() first for highest entropy first."""
        print("\n[[ Top %i entropic files for a given search ]]" % (count))
        if count > len(self.results):
            count = len(self.results)
        for x in range(count):
            print(' {0:>7.4f} {1}'.format(self.results[x]["value"], self.results[x]["filename"]))
        return
class LongestWord:
    """Determine the length of the longest word in a file."""

    # Words are delimited by whitespace or commas. This matches the same
    # characters as the class [\s,\n,\r], since \s already covers \n and \r.
    _separators = re.compile(r"[\s,]")

    def __init__(self):
        """Instantiate the results list."""
        # One {"filename": ..., "value": length} dict per scored file.
        self.results = []

    def calculate(self, data, filename):
        """Find the length of the longest word in data and record it in self.results.

        Returns the length as an int. Empty data records nothing and
        returns the tuple ("", 0); that return shape is kept unchanged for
        existing callers.
        """
        if not data:
            return "", 0
        # split() always yields at least one item, so max() cannot fail here.
        longest = max(len(word) for word in self._separators.split(data))
        self.results.append({"filename": filename, "value": longest})
        return longest

    def sort(self):
        """Sort the results by descending length and attach a rank to each entry."""
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the first count results; call sort() first for longest first."""
        print("\n[[ Top %i longest word files ]]" % (count))
        if count > len(self.results):
            count = len(self.results)
        for x in range(count):
            print(' {0:>7} {1}'.format(self.results[x]["value"], self.results[x]["filename"]))
        return
class SignatureNasty:
    """Count occurrences of potentially dangerous function names in a file."""

    # Lots taken from the wonderful post at http://stackoverflow.com/questions/3115559/exploitable-php-functions
    # Compiled once for the class instead of once per scanned file.
    _signatures = re.compile(r'(eval\(|file_put_contents|base64_decode|python_eval|exec\(|passthru|popen|proc_open|pcntl|assert\(|system\(|shell)', re.I)

    def __init__(self):
        """Instantiate the results list."""
        # One {"filename": ..., "value": match_count} dict per scored file.
        self.results = []

    def calculate(self, data, filename):
        """Count the case-insensitive signature matches in data and record the count.

        Returns the number of matches. Empty data records nothing and
        returns the tuple ("", 0); that return shape is kept unchanged for
        existing callers.
        """
        if not data:
            return "", 0
        matches = self._signatures.findall(data)
        self.results.append({"filename": filename, "value": len(matches)})
        return len(matches)

    def sort(self):
        """Sort the results by descending match count and attach a rank to each entry."""
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the first count results; call sort() first for most matches first."""
        print("\n[[ Top %i signature match counts ]]" % (count))
        if count > len(self.results):
            count = len(self.results)
        for x in range(count):
            print(' {0:>7} {1}'.format(self.results[x]["value"], self.results[x]["filename"]))
        return
class UsesEval:
    """Count calls to eval() whose argument starts with a $variable."""

    # Lots taken from the wonderful post at http://stackoverflow.com/questions/3115559/exploitable-php-functions
    # Matches "eval($" followed by one word character; compiled once for
    # the class instead of once per scanned file.
    _eval_variable = re.compile(r'(eval\(\$(\w|\d))', re.I)

    def __init__(self):
        """Instantiate the results list."""
        # One {"filename": ..., "value": match_count} dict per scored file.
        self.results = []

    def calculate(self, data, filename):
        """Count the case-insensitive eval($variable matches in data and record the count.

        Returns the number of matches. Empty data records nothing and
        returns the tuple ("", 0); that return shape is kept unchanged for
        existing callers.
        """
        if not data:
            return "", 0
        matches = self._eval_variable.findall(data)
        self.results.append({"filename": filename, "value": len(matches)})
        return len(matches)

    def sort(self):
        """Sort the results by descending match count and attach a rank to each entry."""
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the first count results; call sort() first for most matches first."""
        print("\n[[ Top %i eval match counts ]]" % (count))
        if count > len(self.results):
            count = len(self.results)
        for x in range(count):
            print(' {0:>7} {1}'.format(self.results[x]["value"], self.results[x]["filename"]))
        return
class Compression:
    """Measure how well a file compresses with zlib."""

    def __init__(self):
        """Start with an empty results list."""
        self.results = []

    def calculate(self, data, filename):
        """Record and return compressed size divided by original size.

        Empty data records nothing and returns the tuple ("", 0).
        """
        if not data:
            return "", 0
        packed_size = len(zlib.compress(data))
        original_size = len(data)
        ratio = float(packed_size) / float(original_size)
        entry = {"filename": filename, "value": ratio}
        self.results.append(entry)
        return ratio

    def sort(self):
        """Order the results by descending ratio and attach a rank to each entry."""
        self.results.sort(key=lambda entry: entry["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the first count results of the list."""
        print("\n[[ Top %i compression match counts ]]" % (count))
        available = len(self.results)
        limit = available if count > available else count
        for position in range(limit):
            entry = self.results[position]
            print(' {0:>7.4f} {1}'.format(entry["value"], entry["filename"]))
        return
def resultsAddRank(results):
    """Attach a "rank" key to each result dict of an already sorted list.

    Ranks start at 1. Consecutive entries with an equal "value" share a
    rank and the following distinct value takes its 1-based position, so
    values [5, 5, 3] are ranked [1, 1, 3]. The dicts are updated in place
    and returned in a new list in the same order.

    The comparison is made against the previous entry itself, so a falsy
    previous value such as 0 still advances the rank.
    """
    ranked = []
    rank = 1
    for position, entry in enumerate(results, 1):
        if ranked and ranked[-1]["value"] != entry["value"]:
            rank = position
        entry["rank"] = rank
        ranked.append(entry)
    return ranked
class SearchFile:
    """Walk a directory tree and read the files whose names match a regular
    expression."""

    def search_file_path(self, args, valid_regex):
        """Yield (data, filename) for every matching file below args[0].

        args        -- sequence whose first item is the start directory
        valid_regex -- compiled pattern searched against each bare file name

        Files of 60 bytes or fewer are skipped. data is the raw content
        read in binary mode, or False when the file could not be read, in
        which case a message is printed as well.
        """
        for root, dirs, files in os.walk(args[0]):
            for name in files:
                if not valid_regex.search(name):
                    continue
                filename = os.path.join(root, name)
                try:
                    # getsize() sits inside the try so that an entry which
                    # cannot be stat'ed is reported instead of aborting
                    # the whole scan.
                    if os.path.getsize(filename) <= 60:
                        continue
                    # The context manager closes the handle deterministically.
                    with open(filename, 'rb') as handle:
                        data = handle.read()
                except (IOError, OSError, MemoryError):
                    data = False
                    print("Could not read file :: %s/%s" % (root, name))
                yield data, filename
if __name__ == "__main__":
    # Script entry point: parse the options, scan the directory tree, run
    # the selected tests against every file and print the rankings.

    # time.time() measures elapsed wall-clock time on every platform;
    # time.clock() reports processor time on Unix and no longer exists in
    # Python 3.8 and later.
    timeStart = time.time()

    # Raw string: the banner is full of backslashes, none of which is meant
    # as an escape sequence.
    print(r"""
       )         (   (
    ( /(         )\ ))\ )
    )\())  (    (()/(()/(
   ((_)\  ))\ ( /(_))(_))
    _((_)/((_))\(_))(_))
   | \| (_)) ((_) _ \_ _|
   | .` / -_) _ \  _/| |
   |_|\_\___\___/_| |___| Ver. *.USEGIT
""")

    parser = OptionParser(usage="usage: %prog [options] <start directory> <OPTIONAL: filename regex>",
                          version="%prog 1.0")
    parser.add_option("-c", "--csv",
                      action="store",
                      dest="is_csv",
                      default=False,
                      help="generate CSV outfile",
                      metavar="FILECSV")
    parser.add_option("-a", "--all",
                      action="store_true",
                      dest="is_all",
                      default=False,
                      help="Run all (useful) tests [Entropy, Longest Word, IC, Signature]",)
    parser.add_option("-z", "--zlib",
                      action="store_true",
                      dest="is_zlib",
                      default=False,
                      help="Run compression Test",)
    parser.add_option("-e", "--entropy",
                      action="store_true",
                      dest="is_entropy",
                      default=False,
                      help="Run entropy Test",)
    parser.add_option("-E", "--eval",
                      action="store_true",
                      dest="is_eval",
                      default=False,
                      help="Run signiture test for the eval",)
    parser.add_option("-l", "--longestword",
                      action="store_true",
                      dest="is_longest",
                      default=False,
                      help="Run longest word test",)
    parser.add_option("-i", "--ic",
                      action="store_true",
                      dest="is_ic",
                      default=False,
                      help="Run IC test",)
    parser.add_option("-s", "--signature",
                      action="store_true",
                      dest="is_signature",
                      default=False,
                      help="Run signature test",)
    parser.add_option("-A", "--auto",
                      action="store_true",
                      dest="is_auto",
                      default=False,
                      help="Run auto file extension tests",)
    parser.add_option("-u", "--unicode",
                      action="store_true",
                      dest="ignore_unicode",
                      default=False,
                      help="Skip over unicode-y/UTF'y files",)

    (options, args) = parser.parse_args()

    # Error on an invalid number of arguments
    if len(args) < 1:
        parser.print_help()
        print("")
        sys.exit()

    # Error on an invalid path
    if not os.path.exists(args[0]):
        parser.error("Invalid path")

    # Pick the file name filter: --auto overrides a user supplied regex,
    # and without either every file name is accepted.
    if options.is_auto:
        valid_regex = re.compile(r'(\.php|\.asp|\.aspx|\.scath|\.bash|\.zsh|\.csh|\.tsch|\.pl|\.py|\.txt|\.cgi|\.cfm|\.htaccess)$')
    elif len(args) == 2:
        try:
            valid_regex = re.compile(args[1])
        except re.error:
            parser.error("Invalid regular expression")
    else:
        valid_regex = re.compile('.*')

    tests = []
    if options.is_all:
        tests.append(LanguageIC())
        tests.append(Entropy())
        tests.append(LongestWord())
        tests.append(SignatureNasty())
    else:
        if options.is_entropy:
            tests.append(Entropy())
        if options.is_longest:
            tests.append(LongestWord())
        if options.is_ic:
            tests.append(LanguageIC())
        if options.is_signature:
            tests.append(SignatureNasty())
        if options.is_eval:
            tests.append(UsesEval())
        if options.is_zlib:
            tests.append(Compression())

    # Instantiate the class used for searching, opening, and reading files
    locator = SearchFile()

    # CSV output rows; the header holds one column per selected test and is
    # built up front so it is complete even when no file gets scanned.
    csv_array = []
    csv_header = ["filename"] + [test.__class__.__name__ for test in tests]

    # Grab each file and run every selected test against it
    fileCount = 0
    fileIgnoreCount = 0
    for data, filename in locator.search_file_path(args, valid_regex):
        if not data:
            continue

        if options.ignore_unicode:
            # Ignore files in which 10% or more of the bytes are above 127.
            asciiHighCount = sum(1 for byte in bytearray(data) if byte > 127)
            fileAsciiHighRatio = float(asciiHighCount) / float(len(data))
            if fileAsciiHighRatio >= .1:
                fileIgnoreCount = fileIgnoreCount + 1
                continue

        # One CSV row per scanned file: the file name, then one value per test.
        csv_row = [filename]
        for test in tests:
            csv_row.append(test.calculate(data, filename))
        # Counted once per file, independent of the number of tests.
        fileCount = fileCount + 1
        csv_array.append(csv_row)

    if options.is_csv:
        csv_array.insert(0, csv_header)
        # The context manager flushes and closes the output file.
        with open(options.is_csv, "wb") as csv_file:
            csv.writer(csv_file).writerows(csv_array)

    timeFinish = time.time()

    # Print some stats
    print("\n[[ Total files scanned: %i ]]" % (fileCount))
    print("[[ Total files ignored: %i ]]" % (fileIgnoreCount))
    print("[[ Scan Time: %f seconds ]]" % (timeFinish - timeStart))

    # Print the top rank list of every test and add up each file's ranks
    rank_list = {}
    for test in tests:
        test.sort()
        test.printer(10)
        for result in test.results:
            rank_list[result["filename"]] = rank_list.get(result["filename"], 0) + result["rank"]

    # The lowest cumulative rank comes first
    rank_sorted = sorted(rank_list.items(), key=lambda x: x[1])

    print("\n[[ Top cumulative ranked files ]]")
    for ranked_name, rank_total in rank_sorted[:10]:
        print(' {0:>7} {1}'.format(rank_total, ranked_name))