3 # Description: Utility to scan a file path for encrypted and obfuscated files
4 # Authors: Ben Hagen (ben.hagen@neohapsis.com)
5 # Scott Behrens (scott.behrens@neohapsis.com)
8 # Copyright: Neohapsis Open Source blah Blah
11 # Try catch regular expressions/bad path/bad filename/bad regex/
20 from collections
import defaultdict
21 from optparse
import OptionParser
24 """Class that calculates a file's Index of Coincidence as
25 well as the average Index of Coincidence across a set of files.
28 """Initialize results arrays as well as character counters."""
29 self
.char_count
= defaultdict(int)
30 self
.total_char_count
= 0
32 self
.ic_total_results
= ""
def caculate_char_count(self, data):
    """Add the character frequencies of `data` to the running totals.

    For each of the 256 single-character values produced by ``chr``, the
    number of occurrences in `data` is added to ``self.char_count[char]``
    and to ``self.total_char_count``.

    Args:
        data: Contents of one file; must support ``count``.

    Returns:
        None. Results accumulate on the instance.
    """
    if not data:
        # Nothing to count; leave the running totals untouched.
        return
    for code in range(256):
        char = chr(code)
        charcount = data.count(char)
        self.char_count[char] += charcount
        self.total_char_count += charcount
def caculate_IC(self):
    """Calculate the Index of Coincidence over the accumulated counts.

    Uses ``self.char_count`` (per-character totals) and
    ``self.total_char_count`` (N) to compute
    ``sum(n * (n - 1)) / (N * (N - 1))`` and stores the result in
    ``self.ic_total_results``.

    When fewer than two characters have been counted the denominator is
    zero and the IC is undefined; 0 is stored instead of raising
    ZeroDivisionError.

    Returns:
        None. The result is written to ``self.ic_total_results``.
    """
    # Characters that never occurred contribute 0 * -1 == 0, so they need
    # no special casing.
    total = sum(val * (val - 1) for val in self.char_count.values())
    denominator = self.total_char_count * (self.total_char_count - 1)
    if denominator:
        ic_total = float(total) / denominator
    else:
        ic_total = 0
    self.ic_total_results = ic_total
def caculate(self, data, filename):
    """Calculate the Index of Coincidence for one file and record it.

    The IC is ``sum(n * (n - 1)) / (N * (N - 1))`` where ``n`` is the count
    of each of the 256 single-character values in `data` and ``N`` is the
    sum of those counts. The result is appended to ``self.ic_results`` as
    ``{"filename": filename, "IC": ic}`` and `data` is then passed to
    ``self.caculate_char_count`` so the search-wide totals include it.

    Args:
        data: Contents of the file; must support ``count``.
        filename: Name stored alongside the result.

    Returns:
        The IC as a float. Returns 0, recording nothing, when `data` is
        empty or holds fewer than two counted characters (the denominator
        would be zero).
    """
    if not data:
        return 0

    counts = [data.count(chr(code)) for code in range(256)]
    total_char_count = sum(counts)
    denominator = total_char_count * (total_char_count - 1)
    if denominator == 0:
        # A single character has no pairs, so the IC is undefined.
        return 0

    char_count = sum(count * (count - 1) for count in counts)
    ic = float(char_count) / denominator
    self.ic_results.append({"filename": filename, "IC": ic})
    # Call method to caculate_char_count and add to the running totals.
    self.caculate_char_count(data)
    return ic
83 """Print the average IC for searchpath and the top 10 lowest Index of Coincidence files."""
84 self
.ic_results
.sort(key
=lambda item
: item
["IC"])
85 top_ten
= self
.ic_results
[0:10]
86 # Calculate the Total IC for a Search
90 print "[[ Average IC for Search ]]"
91 print self
.ic_total_results
93 print "[[ Top 10 IC files ]]"
96 print ' {0:>7.4f} {1}'.format(file["IC"], file["filename"])
97 results
= file["filename"], x
98 ic_list
.append(results
)
103 """Class that calculates a file's Entropy."""
106 """Instantiate the entropy_results array."""
107 self
.entropy_results
= []
def caculate(self, data, filename):
    """Calculate the entropy for `data` and record it.

    Computes ``-sum(p * log2(p))`` where ``p`` is the relative frequency in
    `data` of each of the 256 single-character values, and appends
    ``{"filename": filename, "entropy": entropy}`` to
    ``self.entropy_results``.

    Args:
        data: Contents of the file; must support ``count`` and ``len``.
        filename: Name stored alongside the result.

    Returns:
        The entropy value, or 0 (recording nothing) when `data` is empty.
    """
    if not data:
        # Avoid dividing by a zero length.
        return 0

    length = len(data)
    entropy = 0
    for code in range(256):
        p_x = float(data.count(chr(code))) / length
        # log(0) is undefined; absent characters contribute nothing.
        if p_x > 0:
            entropy += -p_x * math.log(p_x, 2)
    self.entropy_results.append({"filename": filename, "entropy": entropy})
    return entropy
123 """Print the top 10 entropic files for a given search"""
124 self
.entropy_results
.sort(key
=lambda item
: item
["entropy"])
125 top_ten
= self
.entropy_results
[-10:]
130 print "[[ Top 10 entropic files ]]"
133 print ' {0:>7.4f} {1}'.format(file["entropy"], file["filename"])
134 results
= file["filename"], x
135 entropy_list
.append(results
)
140 """Class that determines the longest word for a particular file."""
142 """Instantiate the longestword_results array."""
143 self
.longestword_results
= []
def caculate(self, data, filename):
    """Find the length of the longest word in `data` and record it.

    `data` is split on whitespace and commas; the length of the longest
    resulting piece is appended to ``self.longestword_results`` as
    ``{"filename": filename, "wordlongest": longest}``.

    Args:
        data: Contents of the file as a string.
        filename: Name stored alongside the result.

    Returns:
        The length of the longest word, or 0 (recording nothing) when
        `data` is empty.
    """
    if not data:
        return 0

    # Raw string: same character class as before, without relying on the
    # invalid "\s" escape being passed through a normal string literal.
    words = re.split(r"[\s,\n,\r]", data)
    longest = max(len(word) for word in words)
    self.longestword_results.append({"filename": filename, "wordlongest": longest})
    return longest
164 """Print the top 10 longest word files for a given search"""
165 self
.longestword_results
.sort(key
=lambda item
: item
["wordlongest"])
166 top_ten
= self
.longestword_results
[-10:]
168 longestword_list
= []
171 print "[[ Top 10 longest word files ]]"
174 print ' {0:>7} {1}'.format(file["wordlongest"], file["filename"])
175 results
= file["filename"], x
176 longestword_list
.append(results
)
178 return longestword_list
181 """Generator that searches a given filepath with an optional regular
182 expression and returns the filepath and filename"""
def search_file_path(self, args, valid_regex):
    """Yield the contents and path of every matching file under a directory.

    Walks ``args[0]`` recursively. A file is yielded when its name matches
    `valid_regex` (via ``search``) and its size is greater than 60 bytes.

    Args:
        args: Sequence whose first element is the directory to walk.
        valid_regex: Compiled pattern applied to each bare file name.

    Yields:
        ``(data, filename)`` where `data` is the file's bytes (read in
        binary mode) and `filename` is the root-joined path.

    Files that cannot be sized or read are reported on stdout and skipped.
    """
    for root, _dirs, files in os.walk(args[0]):
        for name in files:
            if not valid_regex.search(name):
                continue
            filename = os.path.join(root, name)
            try:
                if os.path.getsize(filename) <= 60:
                    continue
                # Context manager closes the handle even if read() fails.
                with open(filename, 'rb') as handle:
                    data = handle.read()
            except (IOError, OSError):
                print("Could not read file :: %s/%s" % (root, name))
                continue
            yield data, filename
def print_rank(self, top_ten, max_score=30):
    """Print the files with the highest combined rank across all tests.

    Args:
        top_ten: Iterable of per-test result lists, each containing
            ``(filename, rank)`` pairs.
        max_score: Score that corresponds to 100%. Defaults to 30, the
            divisor this method has always used.

    The ranks of each file are summed over all tests, the ten highest
    totals are kept, and each is printed as a percentage of `max_score`
    next to the file name.
    """
    files = defaultdict(int)
    for test_results in top_ten:
        for filename, rank in test_results:
            files[str(filename)] += int(rank)

    ranked = sorted(files.items(), key=lambda item: item[1], reverse=True)
    print("[[ Highest Rank Files Based on test results ]]")
    for filename, score in ranked[0:10]:
        percent = str(int((float(score) / max_score) * 100)) + "%"
        print(' {0:>7} {1}'.format(percent, filename))
214 if __name__
== "__main__":
215 """Parse all the options"""
216 parser
= OptionParser(usage
="usage: %prog [options] <start directory> <OPTIONAL: filename regex>",
218 parser
.add_option("-C", "--csv",
222 help="generate CSV outfile",
224 parser
.add_option("-a", "--all",
228 help="Run all tests [Entropy, Longest Word, Compression]",)
229 parser
.add_option("-e", "--entropy",
233 help="Run entropy Test",)
234 parser
.add_option("-l", "--longestword",
238 help="Run longest word test",)
239 parser
.add_option("-c", "--ic",
244 parser
.add_option("-A", "--auto",
248 help="Run auto file extension tests",)
250 (options
, args
) = parser
.parse_args()
252 # Error on invalid number of arguments
254 parser
.error("wrong number of arguments")
256 # Error on an invalid path
257 if os
.path
.exists(args
[0]) == False:
258 parser
.error("invalid path")
261 if (len(args
) == 2 and options
.is_auto
is False):
262 valid_regex
= re
.compile(args
[1])
264 valid_regex
= re
.compile('.*')
268 valid_regex
= re
.compile('(\.php|\.asp|\.aspx|\.sh|\.bash|\.zsh|\.csh|\.tsch|\.pl|\.py|\.txt|\.cgi|\.cfm)$')
271 tests
.append(LanguageIC())
272 tests
.append(Entropy())
273 tests
.append(LongestWord())
275 if options
.is_entropy
:
276 tests
.append(Entropy())
278 if options
.is_longest
:
279 tests
.append(LongestWord())
282 tests
.append(LanguageIC())
284 # Instantiate the Generator Class used for searching, opening, and reading files
285 locator
= SearchFile()
287 # CSV file output array
289 csv_header
= ["filename"]
291 # Grab the file and calculate each test against file
292 for data
,filename
in locator
.search_file_path(args
, valid_regex
):
294 # a row array for the CSV
296 csv_row
.append(filename
)
298 calculated_value
= test
.caculate(data
,filename
)
299 # Make the header row if it hasn't been fully populated, +1 here to account for filename column
300 if len(csv_header
) < len(tests
) + 1:
301 csv_header
.append(test
.__class
__.__name
__)
302 csv_row
.append(calculated_value
)
303 csv_array
.append(csv_row
)
306 csv_array
.insert(0,csv_header
)
307 fileOutput
= csv
.writer(open(options
.is_csv
, "wb"))
308 fileOutput
.writerows(csv_array
)
311 # For each test print the top ten results for that test.
313 top_ten
.append(test
.printer())
316 printer
= PrintRank()
318 printer
.print_rank(top_ten
)