Help Changes
[fixup.git] / neopi.py
blob3d418d85307472ccc60e5efd7596479104c73ae4
1 #!/usr/bin/python
2 # Name: neopi.py
3 # Description: Utility to scan a file path for encrypted and obfuscated files
4 # Authors: Ben Hagen (ben.hagen@neohapsis.com)
5 # Scott Behrens (scott.behrens@neohapsis.com)
7 # Date: 11/4/2010
9 # pep-0008 - Is stupid. TABS FO'EVER!
11 # Try catch regular expressions/bad path/bad filename/bad regex/
13 # Library imports
14 import math
15 import sys
16 import os
17 import re
18 import csv
19 from collections import defaultdict
20 from optparse import OptionParser
class LanguageIC:
    """Calculate the Index of Coincidence (IC) of individual files as well as
    the aggregate IC over every file processed so far.

    The IC is sum(n * (n - 1)) over the per-character counts n, divided by
    N * (N - 1), where N is the total number of counted characters.  Only
    the characters chr(0) .. chr(255) are counted.
    """

    def __init__(self):
        """Initialize the per-file results list and the aggregate counters."""
        # Aggregate count of each character across all files seen so far.
        self.char_count = defaultdict(int)
        # Aggregate number of counted characters across all files.
        self.total_char_count = 0
        # One {"filename": ..., "value": ic} dict per processed file.
        self.results = []
        # Aggregate IC; filled in by calculate_IC().
        self.ic_total_results = ""

    def calculate_char_count(self, data):
        """Add the character counts of `data` to the aggregate counters.

        Returns 0 when `data` is empty and None otherwise.
        """
        if not data:
            return 0
        for x in range(256):
            char = chr(x)
            charcount = data.count(char)
            self.char_count[char] += charcount
            self.total_char_count += charcount
        return

    def calculate_IC(self):
        """Compute the aggregate IC and store it in self.ic_total_results.

        The result is 0 when fewer than two characters have been counted,
        because the denominator N * (N - 1) is then zero.
        """
        # Characters with a count of 0 contribute 0, so no filtering is needed.
        total = sum(val * (val - 1) for val in self.char_count.values())
        try:
            ic_total = float(total) / (self.total_char_count * (self.total_char_count - 1))
        except ZeroDivisionError:
            ic_total = 0
        self.ic_total_results = ic_total
        return

    def calculate(self, data, filename):
        """Calculate the IC of `data`, record it in self.results and return it.

        Returns 0 without recording anything when `data` is empty.  When
        `data` holds fewer than two countable characters the IC is undefined;
        it is recorded and returned as 0.0 instead of raising
        ZeroDivisionError, mirroring the fallback used by calculate_IC().
        """
        if not data:
            return 0
        char_count = 0
        total_char_count = 0
        for x in range(256):
            charcount = data.count(chr(x))
            char_count += charcount * (charcount - 1)
            total_char_count += charcount

        denominator = total_char_count * (total_char_count - 1)
        if denominator:
            ic = float(char_count) / denominator
        else:
            ic = 0.0
        self.results.append({"filename": filename, "value": ic})
        # Fold this file's characters into the aggregate counters as well.
        self.calculate_char_count(data)
        return ic

    def sort(self):
        """Sort results by ascending IC (lowest first) and attach ranks."""
        self.results.sort(key=lambda item: item["value"])
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the aggregate IC followed by at most `count` lowest-IC files."""
        # Calculate the total IC for the whole search.
        self.calculate_IC()
        print("\n[[ Average IC for Search ]]")
        print(self.ic_total_results)
        print("\n[[ Top %i lowest IC files ]]" % (count))
        # Slicing copes with fewer than `count` results; indexing did not.
        for result in self.results[:count]:
            print(' {0:>7.4f} {1}'.format(result["value"], result["filename"]))
        return
class Entropy:
    """Calculate the Shannon entropy (in bits per character) of files."""

    def __init__(self):
        """Create the empty results list."""
        # One {"filename": ..., "value": entropy} dict per processed file.
        self.results = []

    def calculate(self, data, filename):
        """Calculate the entropy of `data`, record it in self.results and return it.

        Only the characters chr(0) .. chr(255) contribute.  Returns 0 without
        recording anything when `data` is empty.
        """
        if not data:
            return 0
        length = float(len(data))  # loop invariant; float forces true division
        entropy = 0
        for x in range(256):
            p_x = data.count(chr(x)) / length
            if p_x > 0:
                entropy += - p_x * math.log(p_x, 2)
        self.results.append({"filename": filename, "value": entropy})
        return entropy

    def sort(self):
        """Sort results by descending entropy (highest first) and attach ranks."""
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print at most `count` results in their current order."""
        print("\n[[ Top %i entropic files for a given search ]]" % (count))
        # Slicing copes with fewer than `count` results; indexing did not.
        for result in self.results[:count]:
            print(' {0:>7.4f} {1}'.format(result["value"], result["filename"]))
        return
class LongestWord:
    """Determine the length of the longest word in files."""

    # Words are delimited by any whitespace character or a comma.  This is
    # the same set the original "[\s,\n,\r]" class matched (\n and \r are
    # already covered by \s, and the commas were literal members).
    _SEPARATORS = re.compile(r"[\s,]")

    def __init__(self):
        """Create the empty results list."""
        # One {"filename": ..., "value": length} dict per processed file.
        self.results = []

    def calculate(self, data, filename):
        """Record and return the length of the longest word in `data`.

        Returns the tuple ("", 0) without recording anything when `data` is
        empty; otherwise returns the length as an int.
        """
        if not data:
            return "", 0
        # split() always yields at least one (possibly empty) piece, so max()
        # never sees an empty sequence.
        longest = max(len(word) for word in self._SEPARATORS.split(data))
        self.results.append({"filename": filename, "value": longest})
        return longest

    def sort(self):
        """Sort results by descending word length and attach ranks."""
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print at most `count` results in their current order."""
        print("\n[[ Top %i longest word files ]]" % (count))
        # Slicing copes with fewer than `count` results; indexing did not.
        for result in self.results[:count]:
            print(' {0:>7} {1}'.format(result["value"], result["filename"]))
        return
class SignatureNasty:
    """Count occurrences of potentially dangerous function names in files."""

    # Lots taken from the wonderful post at
    # http://stackoverflow.com/questions/3115559/exploitable-php-functions
    # Compiled once for the class instead of once per scanned file.
    _SIGNATURES = re.compile(r'(eval\(|base64_decode|python_eval|exec\(|passthru\(|popen\(|proc_open\(|pcntl_|assert\()')

    def __init__(self):
        """Create the empty results list."""
        # One {"filename": ..., "value": match_count} dict per processed file.
        self.results = []

    def calculate(self, data, filename):
        """Record and return the number of signature matches in `data`.

        Returns the tuple ("", 0) without recording anything when `data` is
        empty; otherwise returns the match count as an int.
        """
        if not data:
            return "", 0
        match_count = len(self._SIGNATURES.findall(data))
        self.results.append({"filename": filename, "value": match_count})
        return match_count

    def sort(self):
        """Sort results by descending match count and attach ranks."""
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print at most `count` results in their current order."""
        print("\n[[ Top %i signature match counts ]]" % (count))
        # Slicing copes with fewer than `count` results; indexing did not.
        for result in self.results[:count]:
            print(' {0:>7} {1}'.format(result["value"], result["filename"]))
        return
def resultsAddRank(results):
    """Attach a competition-style "rank" to each entry of a sorted result list.

    `results` is a list of dicts that each carry a "value" key and that is
    already sorted by that value.  Entries with equal consecutive values
    share a rank; the next distinct value gets its 1-based position as its
    rank (e.g. values 5, 5, 3 receive ranks 1, 1, 3).

    The dicts are updated in place and also returned in a new list, in the
    same order.
    """
    ranked = []
    rank = 1
    previous_value = None
    for position, entry in enumerate(results, 1):
        # Compare by position rather than by the truthiness of the previous
        # value: a previous value of 0 is falsy and used to keep the rank
        # from ever advancing past it.
        if position > 1 and entry["value"] != previous_value:
            rank = position
        entry["rank"] = rank
        ranked.append(entry)
        previous_value = entry["value"]
    return ranked
class SearchFile:
    """Walk a directory tree and yield the contents of matching files."""

    def search_file_path(self, args, valid_regex):
        """Yield (data, filename) for each matching file under args[0].

        A file matches when `valid_regex` (a compiled pattern) finds a match
        in its base name and it is larger than 60 bytes.  `data` is the raw
        file content read in binary mode, or False when the file could not
        be inspected or read; in that case a message is printed and the scan
        continues.
        """
        for root, _dirs, files in os.walk(args[0]):
            for name in files:
                if not valid_regex.search(name):
                    continue
                filename = os.path.join(root, name)
                try:
                    # getsize() is inside the try as well: a broken symlink
                    # or a file removed mid-walk must not abort the scan.
                    if os.path.getsize(filename) <= 60:
                        continue
                    with open(filename, 'rb') as handle:
                        data = handle.read()
                except (IOError, OSError):
                    data = False
                    print("Could not read file :: %s/%s" % (root, name))
                yield data, filename
if __name__ == "__main__":
    # Parse all the options
    parser = OptionParser(usage="usage: %prog [options] <start directory> <OPTIONAL: filename regex>",
                          version="%prog 1.0")
    parser.add_option("-c", "--csv",
                      action="store",
                      dest="is_csv",
                      default=False,
                      help="generate CSV outfile",
                      metavar="FILECSV")
    parser.add_option("-a", "--all",
                      action="store_true",
                      dest="is_all",
                      default=False,
                      help="Run all tests [Entropy, Longest Word, IC, Signature]",)
    parser.add_option("-e", "--entropy",
                      action="store_true",
                      dest="is_entropy",
                      default=False,
                      help="Run entropy Test",)
    parser.add_option("-l", "--longestword",
                      action="store_true",
                      dest="is_longest",
                      default=False,
                      help="Run longest word test",)
    parser.add_option("-i", "--ic",
                      action="store_true",
                      dest="is_ic",
                      default=False,
                      help="Run IC test",)
    parser.add_option("-s", "--signature",
                      action="store_true",
                      dest="is_signature",
                      default=False,
                      help="Run signature test",)
    parser.add_option("-A", "--auto",
                      action="store_true",
                      dest="is_auto",
                      default=False,
                      help="Run auto file extension tests",)

    (options, args) = parser.parse_args()

    # Error on invalid number of arguments
    if len(args) < 1:
        parser.error("Wrong number of arguments")

    # Error on an invalid path
    if not os.path.exists(args[0]):
        parser.error("Invalid path")

    # Pick the filename filter: --auto wins over a user-supplied regex, and a
    # regex is only honoured when exactly two positional arguments are given.
    if options.is_auto:
        # NOTE(review): '.scath' and '.tsch' look like typos (perhaps '.tcsh'
        # was meant) -- left unchanged, confirm with the authors.
        valid_regex = re.compile(r'(\.php|\.asp|\.aspx|\.scath|\.bash|\.zsh|\.csh|\.tsch|\.pl|\.py|\.txt|\.cgi|\.cfm)$')
    elif len(args) == 2:
        try:
            valid_regex = re.compile(args[1])
        except re.error:
            # parser.error() prints the message and exits.
            parser.error("Invalid regular expression")
    else:
        valid_regex = re.compile('.*')

    tests = []
    if options.is_all:
        tests.append(LanguageIC())
        tests.append(Entropy())
        tests.append(LongestWord())
        tests.append(SignatureNasty())
    else:
        if options.is_entropy:
            tests.append(Entropy())
        if options.is_longest:
            tests.append(LongestWord())
        if options.is_ic:
            tests.append(LanguageIC())
        if options.is_signature:
            tests.append(SignatureNasty())

    # Instantiate the Generator Class used for searching, opening, and reading files
    locator = SearchFile()

    # CSV output: one header row (filename plus one column per test, in test
    # order) followed by one row per scanned file.  The header is built up
    # front so it is complete even when no file is scanned.
    csv_header = ["filename"] + [test.__class__.__name__ for test in tests]
    csv_array = []

    # Grab the file and calculate each test against file
    for data, filename in locator.search_file_path(args, valid_regex):
        if data:
            csv_row = [filename]
            for test in tests:
                csv_row.append(test.calculate(data, filename))
            csv_array.append(csv_row)

    if options.is_csv:
        csv_array.insert(0, csv_header)
        # Binary mode is what the Python 2 csv module expects; the with-block
        # makes sure the file is flushed and closed.
        with open(options.is_csv, "wb") as csv_file:
            csv.writer(csv_file).writerows(csv_array)

    # Print top rank lists and accumulate each file's rank across all tests.
    rank_list = {}
    for test in tests:
        test.sort()
        test.printer(10)
        for result in test.results:
            rank_list[result["filename"]] = rank_list.get(result["filename"], 0) + result["rank"]

    # Lowest cumulative rank first.
    rank_sorted = sorted(rank_list.items(), key=lambda item: item[1])

    print("\n[[ Top cumulative ranked files ]]")
    # Slicing copes with fewer than 10 ranked files; indexing did not.
    for name, cumulative_rank in rank_sorted[:10]:
        print(' {0:>7} {1}'.format(cumulative_rank, name))