3 # Description: Utility to scan a file path for encrypted and obfuscated files
4 # Authors: Ben Hagen (ben.hagen@neohapsis.com)
5 # Scott Behrens (scott.behrens@neohapsis.com)
9 # pep-0008 - Is stupid. TABS FO'EVER!
# TODO: add try/except handling for bad paths, bad filenames, and bad regular expressions.
21 from collections
import defaultdict
22 from optparse
import OptionParser
"""Class that calculates a file's Index of Coincidence as
well as a subset of files' average Index of Coincidence.
29 """Initialize results arrays as well as character counters."""
30 self
.char_count
= defaultdict(int)
31 self
.total_char_count
= 0
33 self
.ic_total_results
= ""
35 def calculate_char_count(self
,data
):
36 """Method to calculate character counts for a particular data file."""
41 charcount
= data
.count(char
)
42 self
.char_count
[char
] += charcount
43 self
.total_char_count
+= charcount
46 def calculate_IC(self
):
47 """Calculate the Index of Coincidence for the self variables"""
49 for val
in self
.char_count
.values():
53 total
+= val
* (val
-1)
56 ic_total
= float(total
)/(self
.total_char_count
* (self
.total_char_count
- 1))
59 self
.ic_total_results
= ic_total
62 def calculate(self
,data
,filename
):
63 """Calculate the Index of Coincidence for a file and append to self.ic_results array"""
71 charcount
= data
.count(char
)
72 char_count
+= charcount
* (charcount
- 1)
73 total_char_count
+= charcount
75 ic
= float(char_count
)/(total_char_count
* (total_char_count
- 1))
76 self
.results
.append({"filename":filename
, "value":ic
})
77 # Call method to calculate_char_count and append to total_char_count
78 self
.calculate_char_count(data
)
82 self
.results
.sort(key
=lambda item
: item
["value"])
83 self
.results
= resultsAddRank(self
.results
)
85 def printer(self
, count
):
86 """Print the top signature count match files for a given search"""
87 # Calculate the Total IC for a Search
89 print "\n[[ Average IC for Search ]]"
90 print self
.ic_total_results
91 print "\n[[ Top %i lowest IC files ]]" % (count
)
92 if (count
> len(self
.results
)): count
= len(self
.results
)
93 for x
in range(count
):
94 print ' {0:>7.4f} {1}'.format(self
.results
[x
]["value"], self
.results
[x
]["filename"])
98 """Class that calculates a file's Entropy."""
101 """Instantiate the entropy_results array."""
104 def calculate(self
,data
,filename
):
105 """Calculate the entropy for 'data' and append result to entropy_results array."""
110 data
.replace(' ', '')
112 p_x
= float(data
.count(chr(x
)))/len(data
)
114 entropy
+= - p_x
* math
.log(p_x
, 2)
115 self
.results
.append({"filename":filename
, "value":entropy
})
119 self
.results
.sort(key
=lambda item
: item
["value"])
120 self
.results
.reverse()
121 self
.results
= resultsAddRank(self
.results
)
def printer(self, count):
    """Print the leading entries of self.results as an entropy report.

    A header line naming the requested count is written first, followed by
    one line per entry showing its "value" (formatted to four decimals)
    and its "filename". Entries are taken from the front of self.results
    in whatever order the list currently has; this method does not sort.

    Args:
        count: Maximum number of entries to print. A count larger than the
            number of stored results is clamped to that number; zero or a
            negative count prints only the header.
    """
    # The header reports the requested count, not the clamped one.
    # Single-argument print(...) emits the same text under the Python 2
    # print statement and the Python 3 print function.
    print("\n[[ Top %i entropic files for a given search ]]" % (count))
    shown = max(0, min(count, len(self.results)))
    for entry in self.results[:shown]:
        print(' {0:>7.4f} {1}'.format(entry["value"], entry["filename"]))
132 """Class that determines the longest word for a particular file."""
134 """Instantiate the longestword_results array."""
137 def calculate(self
,data
,filename
):
138 """Find the longest word in a string and append to longestword_results array"""
143 words
= re
.split("[\s,\n,\r]", data
)
150 self
.results
.append({"filename":filename
, "value":longest
})
154 self
.results
.sort(key
=lambda item
: item
["value"])
155 self
.results
.reverse()
156 self
.results
= resultsAddRank(self
.results
)
def printer(self, count):
    """Print the leading entries of self.results as a longest-word report.

    A header line naming the requested count is written first, followed by
    one line per entry showing its "value" (right-aligned in a field of
    width 7) and its "filename". Entries are taken from the front of
    self.results in whatever order the list currently has; this method
    does not sort.

    Args:
        count: Maximum number of entries to print. A count larger than the
            number of stored results is clamped to that number; zero or a
            negative count prints only the header.
    """
    # The header reports the requested count, not the clamped one.
    # Single-argument print(...) emits the same text under the Python 2
    # print statement and the Python 3 print function.
    print("\n[[ Top %i longest word files ]]" % (count))
    shown = max(0, min(count, len(self.results)))
    for entry in self.results[:shown]:
        print(' {0:>7} {1}'.format(entry["value"], entry["filename"]))
166 class SignatureNasty
:
167 """Generator that searches a given file for nasty expressions"""
170 """Instantiate the longestword_results array."""
173 def calculate(self
, data
, filename
):
176 # Lots taken from the wonderful post at http://stackoverflow.com/questions/3115559/exploitable-php-functions
177 valid_regex
= re
.compile('(eval\(|base64_decode|python_eval|exec\(|passthru|popen|proc_open|pcntl|assert\(|system\(|shell)', re
.I
)
178 matches
= re
.findall(valid_regex
, data
)
179 self
.results
.append({"filename":filename
, "value":len(matches
)})
183 self
.results
.sort(key
=lambda item
: item
["value"])
184 self
.results
.reverse()
185 self
.results
= resultsAddRank(self
.results
)
def printer(self, count):
    """Print the leading entries of self.results as a signature-match report.

    A header line naming the requested count is written first, followed by
    one line per entry showing its "value" (right-aligned in a field of
    width 7) and its "filename". Entries are taken from the front of
    self.results in whatever order the list currently has; this method
    does not sort.

    Args:
        count: Maximum number of entries to print. A count larger than the
            number of stored results is clamped to that number; zero or a
            negative count prints only the header.
    """
    # The header reports the requested count, not the clamped one.
    # Single-argument print(...) emits the same text under the Python 2
    # print statement and the Python 3 print function.
    print("\n[[ Top %i signature match counts ]]" % (count))
    shown = max(0, min(count, len(self.results)))
    for entry in self.results[:shown]:
        print(' {0:>7} {1}'.format(entry["value"], entry["filename"]))
196 """Generator finds compression ratio"""
199 """Instantiate the results array."""
202 def calculate(self
, data
, filename
):
205 compressed
= zlib
.compress(data
)
206 ratio
= float(len(compressed
)) / float(len(data
))
207 self
.results
.append({"filename":filename
, "value":ratio
})
211 self
.results
.sort(key
=lambda item
: item
["value"])
212 self
.results
.reverse()
213 self
.results
= resultsAddRank(self
.results
)
def printer(self, count):
    """Print the leading entries of self.results as a compression report.

    A header line naming the requested count is written first, followed by
    one line per entry showing its "value" (formatted to four decimals)
    and its "filename". Entries are taken from the front of self.results
    in whatever order the list currently has; this method does not sort.

    Args:
        count: Maximum number of entries to print. A count larger than the
            number of stored results is clamped to that number; zero or a
            negative count prints only the header.
    """
    # The header reports the requested count, not the clamped one. Its
    # wording ("match counts") is kept as-is because it is runtime output,
    # even though the values are printed as 4-decimal floats.
    # Single-argument print(...) emits the same text under the Python 2
    # print statement and the Python 3 print function.
    print("\n[[ Top %i compression match counts ]]" % (count))
    shown = max(0, min(count, len(self.results)))
    for entry in self.results[:shown]:
        print(' {0:>7.4f} {1}'.format(entry["value"], entry["filename"]))
223 def resultsAddRank(results
):
226 previousValue
= False
229 if (previousValue
and previousValue
!= file["value"]):
233 previousValue
= file["value"]
238 """Generator that searches a given filepath with an optional regular
239 expression and returns the filepath and filename"""
240 def search_file_path(self
, args
, valid_regex
):
241 for root
, dirs
, files
in os
.walk(args
[0]):
243 filename
= os
.path
.join(root
, file)
244 if (valid_regex
.search(file) and os
.path
.getsize(filename
) > 60):
246 data
= open(root
+ "/" + file, 'rb').read()
249 print "Could not read file :: %s/%s" % (root
, file)
252 if __name__
== "__main__":
253 """Parse all the options"""
255 timeStart
= time
.clock()
261 ((_)\ ))\ ( /(_))(_))
263 | \| (_)) ((_) _ \_ _|
265 |_|\_\___\___/_| |___| Ver. *.USEGIT
268 parser
= OptionParser(usage
="usage: %prog [options] <start directory> <OPTIONAL: filename regex>",
270 parser
.add_option("-c", "--csv",
274 help="generate CSV outfile",
276 parser
.add_option("-a", "--all",
280 help="Run all (useful) tests [Entropy, Longest Word, IC, Signature]",)
281 parser
.add_option("-z", "--zlib",
285 help="Run compression Test",)
286 parser
.add_option("-e", "--entropy",
290 help="Run entropy Test",)
291 parser
.add_option("-l", "--longestword",
295 help="Run longest word test",)
296 parser
.add_option("-i", "--ic",
301 parser
.add_option("-s", "--signature",
305 help="Run signature test",)
306 parser
.add_option("-A", "--auto",
310 help="Run auto file extension tests",)
311 parser
.add_option("-u", "--unicode",
313 dest
="ignore_unicode",
315 help="Skip over unicode-y/UTF'y files",)
317 (options
, args
) = parser
.parse_args()
# Error on invalid number of arguments
325 # Error on an invalid path
326 if os
.path
.exists(args
[0]) == False:
327 parser
.error("Invalid path")
330 if (len(args
) == 2 and options
.is_auto
is False):
332 valid_regex
= re
.compile(args
[1])
334 parser
.error("Invalid regular expression")
336 valid_regex
= re
.compile('.*')
340 valid_regex
= re
.compile('(\.php|\.asp|\.aspx|\.scath|\.bash|\.zsh|\.csh|\.tsch|\.pl|\.py|\.txt|\.cgi|\.cfm|\.htaccess)$')
343 tests
.append(LanguageIC())
344 tests
.append(Entropy())
345 tests
.append(LongestWord())
346 tests
.append(SignatureNasty())
348 if options
.is_entropy
:
349 tests
.append(Entropy())
350 if options
.is_longest
:
351 tests
.append(LongestWord())
353 tests
.append(LanguageIC())
354 if options
.is_signature
:
355 tests
.append(SignatureNasty())
357 tests
.append(Compression())
359 # Instantiate the Generator Class used for searching, opening, and reading files
360 locator
= SearchFile()
362 # CSV file output array
364 csv_header
= ["filename"]
366 # Grab the file and calculate each test against file
369 for data
, filename
in locator
.search_file_path(args
, valid_regex
):
371 # a row array for the CSV
373 csv_row
.append(filename
)
375 if options
.ignore_unicode
:
377 for character
in data
:
378 if ord(character
) > 127:
379 asciiHighCount
= asciiHighCount
+ 1
381 fileAsciiHighRatio
= float(asciiHighCount
) / float(len(data
))
383 if (options
.ignore_unicode
== False or fileAsciiHighRatio
< .1):
385 calculated_value
= test
.calculate(data
, filename
)
386 # Make the header row if it hasn't been fully populated, +1 here to account for filename column
387 if len(csv_header
) < len(tests
) + 1:
388 csv_header
.append(test
.__class
__.__name
__)
389 csv_row
.append(calculated_value
)
390 fileCount
= fileCount
+ 1
391 csv_array
.append(csv_row
)
393 fileIgnoreCount
= fileIgnoreCount
+ 1
396 csv_array
.insert(0,csv_header
)
397 fileOutput
= csv
.writer(open(options
.is_csv
, "wb"))
398 fileOutput
.writerows(csv_array
)
400 timeFinish
= time
.clock()
403 print "\n[[ Total files scanned: %i ]]" % (fileCount
)
404 print "[[ Total files ignored: %i ]]" % (fileIgnoreCount
)
405 print "[[ Scan Time: %f seconds ]]" % (timeFinish
- timeStart
)
407 # Print top rank lists
412 for file in test
.results
:
413 rank_list
[file["filename"]] = rank_list
.setdefault(file["filename"], 0) + file["rank"]
415 rank_sorted
= sorted(rank_list
.items(), key
=lambda x
: x
[1])
417 print "\n[[ Top cumulative ranked files ]]"
419 if (count
> len(rank_sorted
)): count
= len(rank_sorted
)
420 for x
in range(count
):
421 print ' {0:>7} {1}'.format(rank_sorted
[x
][1], rank_sorted
[x
][0])