3 # Description: Utility to scan a file path for encrypted and obfuscated files
4 # Authors: Ben Hagen (ben.hagen@neohapsis.com)
5 # Scott Behrens (scott.behrens@neohapsis.com)
10 # Try catch regular expressions/bad path/bad filename/bad regex/
19 from collections
import defaultdict
20 from optparse
import OptionParser
23 """Class that calculates a file's Index of Coincidence as
24 well as the average Index of Coincidence for a subset of files.
27 """Initialize results arrays as well as character counters."""
28 self
.char_count
= defaultdict(int)
29 self
.total_char_count
= 0
31 self
.ic_total_results
= ""
33 def caculate_char_count(self
,data
):
34 """Method to calculate character counts for a particular data file."""
40 charcount
= data
.count(char
)
41 self
.char_count
[char
] += charcount
42 self
.total_char_count
+= charcount
46 def caculate_IC(self
):
47 """Calculate the overall Index of Coincidence from the accumulated character counts and store it in self.ic_total_results"""
49 for val
in self
.char_count
.values():
53 total
+= val
* (val
-1)
56 ic_total
= float(total
)/(self
.total_char_count
* (self
.total_char_count
- 1))
59 self
.ic_total_results
= ic_total
62 def caculate(self
,data
,filename
):
63 """Calculate the Index of Coincidence for a file and append to self.ic_results array"""
71 charcount
= data
.count(char
)
72 char_count
+= charcount
* (charcount
- 1)
73 total_char_count
+= charcount
75 ic
= float(char_count
)/(total_char_count
* (total_char_count
- 1))
76 self
.ic_results
.append({"filename":filename
, "IC":ic
})
77 # Add this file's character counts to the running totals via caculate_char_count
78 self
.caculate_char_count(data
)
82 """Print the average IC for searchpath and the top 10 lowest Index of Coincidence files."""
83 self
.ic_results
.sort(key
=lambda item
: item
["IC"])
84 top_ten
= self
.ic_results
[0:10]
85 # Calculate the Total IC for a Search
89 print "[[ Average IC for Search ]]"
90 print self
.ic_total_results
92 print "[[ Top 10 IC files ]]"
95 print ' {0:>7.4f} {1}'.format(file["IC"], file["filename"])
96 results
= file["filename"], x
97 ic_list
.append(results
)
102 """Class that calculates a file's Entropy."""
105 """Instantiate the entropy_results array."""
106 self
.entropy_results
= []
108 def caculate(self
,data
,filename
):
109 """Calculate the entropy for 'data' and append result to entropy_results array."""
115 p_x
= float(data
.count(chr(x
)))/len(data
)
117 entropy
+= - p_x
* math
.log(p_x
, 2)
118 self
.entropy_results
.append({"filename":filename
, "entropy":entropy
})
122 """Print the top 10 entropic files for a given search"""
123 self
.entropy_results
.sort(key
=lambda item
: item
["entropy"])
124 top_ten
= self
.entropy_results
[-10:]
129 print "[[ Top 10 entropic files ]]"
132 print ' {0:>7.4f} {1}'.format(file["entropy"], file["filename"])
133 results
= file["filename"], x
134 entropy_list
.append(results
)
139 """Class that determines the longest word for a particular file."""
141 """Instantiate the longestword_results array."""
142 self
.longestword_results
= []
144 def caculate(self
,data
,filename
):
145 """Find the longest word in a string and append to longestword_results array"""
152 words
= re
.split("[\s,\n,\r]", data
)
159 self
.longestword_results
.append({"filename":filename
, "wordlongest":longest
})
163 """Print the top 10 longest word files for a given search"""
164 self
.longestword_results
.sort(key
=lambda item
: item
["wordlongest"])
165 top_ten
= self
.longestword_results
[-10:]
167 longestword_list
= []
170 print "[[ Top 10 longest word files ]]"
173 print ' {0:>7} {1}'.format(file["wordlongest"], file["filename"])
174 results
= file["filename"], x
175 longestword_list
.append(results
)
177 return longestword_list
180 """Generator that searches a given filepath with an optional regular
181 expression and yields each matching file's contents and its path"""
182 def search_file_path(self
, args
, valid_regex
):
183 for root
, dirs
, files
in os
.walk(args
[0]):
185 filename
= os
.path
.join(root
, file)
186 if (valid_regex
.search(file) and os
.path
.getsize(filename
) > 60):
188 data
= open(root
+ "/" + file, 'rb').read()
191 print "Could not read file :: %s/%s" % (root
, file)
195 def print_rank(self
, top_ten
):
197 files
= defaultdict(int)
199 for file, rank
in list:
200 files
[str(file)] += int(rank
)
202 sorted_top_ten
= sorted(files
.items(), key
=lambda k
: k
[1], reverse
=True)
203 top_ten
= sorted_top_ten
[0:10]
204 print "[[ Highest Rank Files Based on test results ]]"
205 # print ' {0:>7} {1}'.format("Rank", "Filename")
208 #print file[0], "%" +
209 print ' {0:>7} {1}'.format(str(int((float(file[1])/30) * 100)) + "%", file[0])
213 if __name__
== "__main__":
214 """Parse all the options"""
215 parser
= OptionParser(usage
="usage: %prog [options] <start directory> <OPTIONAL: filename regex>",
217 parser
.add_option("-C", "--csv",
221 help="generate CSV outfile",
223 parser
.add_option("-a", "--all",
227 help="Run all tests [Entropy, Longest Word, Compression]",)
228 parser
.add_option("-e", "--entropy",
232 help="Run entropy Test",)
233 parser
.add_option("-l", "--longestword",
237 help="Run longest word test",)
238 parser
.add_option("-c", "--ic",
243 parser
.add_option("-A", "--auto",
247 help="Run auto file extension tests",)
249 (options
, args
) = parser
.parse_args()
251 # Error on invalid number of arguments
253 parser
.error("wrong number of arguments")
255 # Error on an invalid path
256 if os
.path
.exists(args
[0]) == False:
257 parser
.error("Invalid path")
260 if (len(args
) == 2 and options
.is_auto
is False):
262 valid_regex
= re
.compile(args
[1])
264 parser
.error("Invalid regular expression")
266 valid_regex
= re
.compile('.*')
270 valid_regex
= re
.compile('(\.php|\.asp|\.aspx|\.sh|\.bash|\.zsh|\.csh|\.tsch|\.pl|\.py|\.txt|\.cgi|\.cfm)$')
273 tests
.append(LanguageIC())
274 tests
.append(Entropy())
275 tests
.append(LongestWord())
277 if options
.is_entropy
:
278 tests
.append(Entropy())
280 if options
.is_longest
:
281 tests
.append(LongestWord())
284 tests
.append(LanguageIC())
286 # Instantiate the Generator Class used for searching, opening, and reading files
287 locator
= SearchFile()
289 # CSV file output array
291 csv_header
= ["filename"]
293 # Grab the file and calculate each test against file
294 for data
,filename
in locator
.search_file_path(args
, valid_regex
):
296 # a row array for the CSV
298 csv_row
.append(filename
)
300 calculated_value
= test
.caculate(data
,filename
)
301 # Make the header row if it hasn't been fully populated, +1 here to account for filename column
302 if len(csv_header
) < len(tests
) + 1:
303 csv_header
.append(test
.__class
__.__name
__)
304 csv_row
.append(calculated_value
)
305 csv_array
.append(csv_row
)
308 csv_array
.insert(0,csv_header
)
309 fileOutput
= csv
.writer(open(options
.is_csv
, "wb"))
310 fileOutput
.writerows(csv_array
)
313 # For each test print the top ten results for that test.
315 top_ten
.append(test
.printer())
318 printer
= PrintRank()
320 printer
.print_rank(top_ten
)