Edited neopi.py via GitHub
[NeoPI.git] / neopi.py
blob144e1b61e5d224abad262c246fed2e630a2a0f9a
1 #!/usr/bin/python
2 # Name: neopi.py
3 # Description: Utility to scan a file path for encrypted and obfuscated files
4 # Authors: Ben Hagen (ben.hagen@neohapsis.com)
5 # Scott Behrens (scott.behrens@neohapsis.com)
7 # Date: 11/4/2010
10 # Try catch regular expressions/bad path/bad filename/bad regex/
12 # Library imports
13 import math
14 import sys
15 import os
16 import re
17 import zlib
18 import csv
19 from collections import defaultdict
20 from optparse import OptionParser
class LanguageIC:
    """Calculate the Index of Coincidence (IC) of files.

    Two figures are tracked:

    * a per-file IC, stored in ``ic_results`` by ``caculate``;
    * an aggregate IC over every file processed so far, derived from the
      running character counters by ``caculate_IC``.

    Only the 256 single-character values ``chr(0)`` .. ``chr(255)`` are
    counted.  For counts ``n_c`` and a total of ``N`` counted characters the
    IC is ``sum(n_c * (n_c - 1)) / (N * (N - 1))``.
    """

    def __init__(self):
        """Initialize the results list as well as the character counters."""
        self.char_count = defaultdict(int)   # aggregate count per character
        self.total_char_count = 0            # aggregate number of characters
        self.ic_results = []                 # one {"filename", "IC"} per file
        self.ic_total_results = ""           # aggregate IC, set by caculate_IC

    def caculate_char_count(self, data):
        """Add the character counts of ``data`` to the aggregate counters.

        Returns 0 when ``data`` is empty, otherwise None.
        """
        if not data:
            return 0

        for x in range(256):
            char = chr(x)
            charcount = data.count(char)
            self.char_count[char] += charcount
            self.total_char_count += charcount

        return

    def caculate_IC(self):
        """Compute the aggregate IC and store it in ``self.ic_total_results``.

        The result is 0 when fewer than two characters have been counted,
        because the denominator N * (N - 1) is zero in that case.
        """
        total = sum(val * (val - 1) for val in self.char_count.values())
        denominator = self.total_char_count * (self.total_char_count - 1)

        # Explicit guard instead of a bare ``except`` so that unrelated
        # errors are no longer silently turned into an IC of 0.
        if denominator:
            ic_total = float(total) / denominator
        else:
            ic_total = 0
        self.ic_total_results = ic_total
        return

    def caculate(self, data, filename):
        """Calculate the IC of ``data`` and append it to ``self.ic_results``.

        ``data`` is the file content and ``filename`` the label stored with
        the result.  Returns the IC as a float.  Returns 0 without recording
        anything when ``data`` is empty or when fewer than two characters
        could be counted, since the IC is undefined there (the formula would
        divide by zero).
        """
        if not data:
            return 0

        char_count = 0
        total_char_count = 0
        for x in range(256):
            charcount = data.count(chr(x))
            char_count += charcount * (charcount - 1)
            total_char_count += charcount

        denominator = total_char_count * (total_char_count - 1)
        if denominator == 0:
            # 0 or 1 counted characters: no IC can be computed.
            return 0

        ic = float(char_count) / denominator
        self.ic_results.append({"filename": filename, "IC": ic})
        # Fold this file's characters into the aggregate counters as well.
        self.caculate_char_count(data)
        return ic

    def printer(self):
        """Print the aggregate IC and the ten files with the lowest IC.

        Sorts ``self.ic_results`` in place (ascending IC) and returns a list
        of ``(filename, rank)`` tuples where the lowest IC gets rank 9, the
        next one 8, and so on.
        """
        self.ic_results.sort(key=lambda item: item["IC"])
        top_ten = self.ic_results[0:10]
        # Calculate the total IC for the search
        self.caculate_IC()

        print("")
        print("[[ Average IC for Search ]]")
        print(self.ic_total_results)
        print("")
        print("[[ Top 10 IC files ]]")

        ic_list = []
        for rank, entry in zip(range(9, -1, -1), top_ten):
            print(' {0:>7.4f} {1}'.format(entry["IC"], entry["filename"]))
            ic_list.append((entry["filename"], rank))
        return ic_list
class Entropy:
    """Score files by the Shannon entropy (base 2) of their characters."""

    def __init__(self):
        """Start with an empty list of per-file entropy results."""
        self.entropy_results = []

    def caculate(self, data, filename):
        """Compute the entropy of ``data`` and record it under ``filename``.

        Only the characters ``chr(0)`` .. ``chr(255)`` are counted.  Returns
        the entropy, or 0 (without recording anything) when ``data`` is
        empty.
        """
        if not data:
            return 0

        size = len(data)
        entropy = 0
        for code in range(256):
            occurrences = data.count(chr(code))
            if not occurrences:
                # Characters that never occur contribute nothing.
                continue
            p_x = float(occurrences) / size
            entropy -= p_x * math.log(p_x, 2)

        record = {"filename": filename, "entropy": entropy}
        self.entropy_results.append(record)
        return entropy

    def printer(self):
        """Print the ten highest-entropy files.

        Sorts ``self.entropy_results`` in place (ascending entropy) and
        returns a list of ``(filename, rank)`` tuples; the most entropic
        file gets rank 9, the next one 8, and so on.
        """
        self.entropy_results.sort(key=lambda item: item["entropy"])
        highest_first = self.entropy_results[-10:][::-1]

        print("")
        print("[[ Top 10 entropic files ]]")

        entropy_list = []
        rank = 9
        for entry in highest_first:
            print(' {0:>7.4f} {1}'.format(entry["entropy"], entry["filename"]))
            entropy_list.append((entry["filename"], rank))
            rank -= 1
        return entropy_list
class LongestWord:
    """Determine the length of the longest word in each file."""

    def __init__(self):
        """Instantiate the longestword_results list."""
        self.longestword_results = []

    def caculate(self, data, filename):
        """Find the longest word in ``data`` and record its length.

        Words are the pieces obtained by splitting ``data`` on whitespace
        and commas.  The length is appended to ``self.longestword_results``
        under ``filename`` and returned.  Empty ``data`` returns 0 without
        recording anything, so every code path returns a number (the empty
        case used to return the tuple ``("", 0)``).
        """
        if not data:
            return 0

        # Same character set as the former "[\s,\n,\r]": \s already covers
        # \n and \r, and the commas inside the class were literal commas.
        words = re.split(r"[\s,]", data)
        # re.split always yields at least one piece, so max() is safe here.
        longest = max(len(word) for word in words)

        self.longestword_results.append({"filename": filename, "wordlongest": longest})
        return longest

    def printer(self):
        """Print the ten files containing the longest words.

        Sorts ``self.longestword_results`` in place (ascending length) and
        returns a list of ``(filename, rank)`` tuples; the file with the
        longest word gets rank 9, the next one 8, and so on.
        """
        self.longestword_results.sort(key=lambda item: item["wordlongest"])
        top_ten = self.longestword_results[-10:]
        top_ten.reverse()

        print("")
        print("[[ Top 10 longest word files ]]")

        longestword_list = []
        for rank, entry in zip(range(9, -1, -1), top_ten):
            print(' {0:>7} {1}'.format(entry["wordlongest"], entry["filename"]))
            longestword_list.append((entry["filename"], rank))
        return longestword_list
class SearchFile:
    """Locate and read the files below a start directory."""

    def search_file_path(self, args, valid_regex):
        """Yield ``(data, filename)`` for every matching file below args[0].

        ``args[0]`` is the directory to walk and ``valid_regex`` a compiled
        regular expression that is searched in each bare file name.  Files
        of 60 bytes or less are skipped.  ``data`` is the raw file content,
        or False when the file could not be inspected or read; a message is
        printed in that case and the scan continues.
        """
        for root, dirs, files in os.walk(args[0]):
            for name in files:
                if not valid_regex.search(name):
                    continue
                filename = os.path.join(root, name)
                try:
                    # getsize can fail too (e.g. dangling symlink), so it
                    # lives inside the try block with the read.
                    if os.path.getsize(filename) <= 60:
                        continue
                    # The with-block closes the handle even if read() fails.
                    with open(filename, 'rb') as handle:
                        data = handle.read()
                except (IOError, OSError, MemoryError):
                    # Best effort: report the file and keep scanning.
                    # KeyboardInterrupt is deliberately not caught.
                    data = False
                    print("Could not read file :: %s/%s" % (root, name))
                yield data, filename
class PrintRank:
    """Merge the rankings of the individual tests into one overall ranking."""

    def print_rank(self, top_ten):
        """Print the ten files with the highest combined rank.

        ``top_ten`` is a list of rankings, each ranking being a list of
        ``(filename, rank)`` pairs.  Ranks of the same file are added up,
        and each total is printed as a percentage of 30.  Returns None.
        """
        scores = defaultdict(int)
        for ranking in top_ten:
            for name, points in ranking:
                scores[str(name)] += int(points)

        by_score = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

        print("[[ Highest Rank Files Based on test results ]]")
        # NOTE(review): the divisor 30 is hard-coded; with ranks running
        # from 9 down to 0 the best total over three tests is 27, so 100%
        # is never reached -- confirm the intended scale.
        for name, points in by_score[0:10]:
            percent = str(int((float(points) / 30) * 100)) + "%"
            print(' {0:>7} {1}'.format(percent, name))

        return
if __name__ == "__main__":
    # ---- Parse all the options -------------------------------------------
    parser = OptionParser(usage="usage: %prog [options] <start directory> <OPTIONAL: filename regex>",
                          version="%prog 1.0")
    parser.add_option("-C", "--csv",
                      action="store",
                      dest="is_csv",
                      default=False,
                      help="generate CSV outfile",
                      metavar="FILECSV")
    parser.add_option("-a", "--all",
                      action="store_true",
                      dest="is_all",
                      default=False,
                      # The help text used to advertise a "Compression" test
                      # that is not run; list the tests actually appended.
                      help="Run all tests [IC, Entropy, Longest Word]",)
    parser.add_option("-e", "--entropy",
                      action="store_true",
                      dest="is_entropy",
                      default=False,
                      help="Run entropy Test",)
    parser.add_option("-l", "--longestword",
                      action="store_true",
                      dest="is_longest",
                      default=False,
                      help="Run longest word test",)
    parser.add_option("-c", "--ic",
                      action="store_true",
                      dest="is_ic",
                      default=False,
                      help="Run IC test",)
    parser.add_option("-A", "--auto",
                      action="store_true",
                      dest="is_auto",
                      default=False,
                      help="Run auto file extension tests",)

    (options, args) = parser.parse_args()

    # Error on invalid number of arguments
    if len(args) < 1:
        parser.error("wrong number of arguments")

    # Error on an invalid path
    if not os.path.exists(args[0]):
        parser.error("Invalid path")

    # ---- Choose the file-name filter -------------------------------------
    # --auto wins over a user-supplied regex; a regex is only honoured when
    # exactly two positional arguments were given.
    if options.is_auto:
        # ".tsch" is kept for backward compatibility; ".tcsh" is presumably
        # what was meant, so both are accepted.
        valid_regex = re.compile(r'(\.php|\.asp|\.aspx|\.sh|\.bash|\.zsh|\.csh|\.tsch|\.tcsh|\.pl|\.py|\.txt|\.cgi|\.cfm)$')
    elif len(args) == 2:
        try:
            valid_regex = re.compile(args[1])
        except re.error:
            parser.error("Invalid regular expression")
    else:
        valid_regex = re.compile('.*')

    # ---- Choose the tests ------------------------------------------------
    tests = []
    if options.is_all:
        tests.append(LanguageIC())
        tests.append(Entropy())
        tests.append(LongestWord())
    else:
        if options.is_entropy:
            tests.append(Entropy())

        if options.is_longest:
            tests.append(LongestWord())

        if options.is_ic:
            tests.append(LanguageIC())

    # Instantiate the generator class used for searching, opening, and reading files
    locator = SearchFile()

    # CSV output: header row (filename plus one column per test) and data rows.
    # Building the header up front keeps it complete even if no file matches.
    csv_header = ["filename"] + [test.__class__.__name__ for test in tests]
    csv_array = []

    # Grab each file and run every selected test against it
    for data, filename in locator.search_file_path(args, valid_regex):
        if not data:
            # Unreadable or empty file: nothing to measure.
            continue
        csv_row = [filename]
        for test in tests:
            csv_row.append(test.caculate(data, filename))
        csv_array.append(csv_row)

    if options.is_csv:
        csv_array.insert(0, csv_header)
        # "wb" is the mode the Python 2 csv module expects; the with-block
        # makes sure the file is flushed and closed.
        with open(options.is_csv, "wb") as csv_file:
            csv.writer(csv_file).writerows(csv_array)

    # For each test print the top ten results for that test.
    top_ten = []
    for test in tests:
        top_ten.append(test.printer())
        print("")

    printer = PrintRank()
    printer.print_rank(top_ten)