Added .htaccess to regex
[fixup/fork.git] / neopi.py
blob b05f1bff21b121a1befc4a90b8ee975cbf3e2ea1
1 #!/usr/bin/python
2 # Name: neopi.py
3 # Description: Utility to scan a file path for encrypted and obfuscated files
4 # Authors: Ben Hagen (ben.hagen@neohapsis.com)
5 # Scott Behrens (scott.behrens@neohapsis.com)
7 # Date: 11/4/2010
9 # pep-0008 - Is stupid. TABS FO'EVER!
11 # Try catch regular expressions/bad path/bad filename/bad regex/
13 # Library imports
14 import math
15 import sys
16 import os
17 import re
18 import csv
19 import zlib
20 import time
21 from collections import defaultdict
22 from optparse import OptionParser
class LanguageIC:
    """Calculate the Index of Coincidence (IC) of individual files as well
    as the aggregate IC over every file processed so far.
    """

    def __init__(self):
        """Initialize the results list and the aggregate character counters."""
        # Occurrences of each character, summed over all files seen so far.
        self.char_count = defaultdict(int)
        self.total_char_count = 0
        # One {"filename": ..., "value": ic} dict per processed file.
        self.results = []
        # Aggregate IC; stays "" until calculate_IC() has been called.
        self.ic_total_results = ""

    def _count_chars(self, data):
        """Return a 256-item list: occurrences in `data` of chr(0)..chr(255)."""
        return [data.count(chr(x)) for x in range(256)]

    def _accumulate(self, counts):
        """Fold one file's 256 per-character counts into the aggregate counters."""
        for x, charcount in enumerate(counts):
            self.char_count[chr(x)] += charcount
            self.total_char_count += charcount

    def calculate_char_count(self, data):
        """Add the character counts of `data` to the aggregate counters.

        Returns 0 when `data` is empty, otherwise None.
        """
        if not data:
            return 0
        self._accumulate(self._count_chars(data))
        return

    def calculate_IC(self):
        """Compute the aggregate IC from the accumulated counters.

        The result is stored in self.ic_total_results. It is 0 when fewer
        than two characters have been accumulated, because the denominator
        N * (N - 1) is zero in that case.
        """
        total = sum(val * (val - 1) for val in self.char_count.values())
        pairs = self.total_char_count * (self.total_char_count - 1)
        self.ic_total_results = float(total) / pairs if pairs else 0
        return

    def calculate(self, data, filename):
        """Calculate the IC of `data`, record it under `filename`, return it.

        Empty `data` returns 0 and records nothing. Data with fewer than two
        counted characters has no defined IC (zero denominator); it is
        recorded as 0.0 instead of raising ZeroDivisionError.
        """
        if not data:
            return 0
        counts = self._count_chars(data)
        total_chars = sum(counts)
        pairs = total_chars * (total_chars - 1)
        coincidences = sum(c * (c - 1) for c in counts)
        ic = float(coincidences) / pairs if pairs else 0.0
        self.results.append({"filename": filename, "value": ic})
        # Reuse the counts computed above for the search-wide totals
        # instead of scanning the data a second time.
        self._accumulate(counts)
        return ic

    def sort(self):
        """Order results by ascending IC and attach a "rank" to each entry."""
        self.results.sort(key=lambda item: item["value"])
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the aggregate IC and the `count` lowest-IC files."""
        # Calculate the total IC for the search
        self.calculate_IC()
        print("\n[[ Average IC for Search ]]")
        print(self.ic_total_results)
        print("\n[[ Top %i lowest IC files ]]" % (count))
        # max() keeps a negative count from turning into a tail slice.
        for entry in self.results[:max(count, 0)]:
            print(' {0:>7.4f} {1}'.format(entry["value"], entry["filename"]))
        return
class Entropy:
    """Compute the Shannon entropy (in bits per character) of file contents."""

    def __init__(self):
        """Start with an empty list of per-file results."""
        self.results = []

    def calculate(self, data, filename):
        """Return the entropy of `data` and record it under `filename`.

        Empty `data` yields 0 and nothing is recorded.
        """
        if not data:
            return 0
        size = len(data)
        # Relative frequency of each of chr(0)..chr(255), in that order.
        frequencies = (float(data.count(chr(code))) / size for code in range(256))
        entropy = sum(-p * math.log(p, 2) for p in frequencies if p > 0)
        self.results.append({"filename": filename, "value": entropy})
        return entropy

    def sort(self):
        """Order results from highest to lowest entropy and attach ranks."""
        # Ascending sort followed by a reversal (in place), so entries with
        # equal values end up in reverse insertion order.
        self.results[:] = sorted(self.results, key=lambda entry: entry["value"])[::-1]
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the `count` highest-entropy files."""
        print("\n[[ Top %i entropic files for a given search ]]" % (count))
        shown = min(count, len(self.results))
        for entry in self.results[:max(shown, 0)]:
            print(' {0:>7.4f} {1}'.format(entry["value"], entry["filename"]))
        return
class LongestWord:
    """Determine the length of the longest word in a file."""

    # Word separators: any whitespace character or a literal comma (the
    # commas inside the character class are matched literally). Compiled
    # once instead of on every call.
    _SEPARATORS = re.compile(r"[\s,\n,\r]")

    def __init__(self):
        """Start with an empty list of per-file results."""
        self.results = []

    def calculate(self, data, filename):
        """Return the length of the longest word in `data`.

        The length is recorded under `filename` in self.results. Empty
        `data` returns 0 (an int, like every other return path and like the
        other test classes) and records nothing.
        """
        if not data:
            return 0
        # split() always yields at least one (possibly empty) piece for
        # non-empty data, so max() never sees an empty sequence.
        longest = max(len(word) for word in self._SEPARATORS.split(data))
        self.results.append({"filename": filename, "value": longest})
        return longest

    def sort(self):
        """Order results from longest to shortest word and attach ranks."""
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the `count` files containing the longest words."""
        print("\n[[ Top %i longest word files ]]" % (count))
        # max() keeps a negative count from turning into a tail slice.
        for entry in self.results[:max(count, 0)]:
            print(' {0:>7} {1}'.format(entry["value"], entry["filename"]))
        return
class SignatureNasty:
    """Count occurrences of suspicious ("nasty") expressions in a file."""

    # Lots taken from the wonderful post at http://stackoverflow.com/questions/3115559/exploitable-php-functions
    # Compiled once at class level rather than for every scanned file.
    _SIGNATURES = re.compile(r'(eval\(|base64_decode|python_eval|exec\(|passthru|popen|proc_open|pcntl|assert\(|system\(|shell)', re.I)

    def __init__(self):
        """Start with an empty list of per-file results."""
        self.results = []

    def calculate(self, data, filename):
        """Return the number of signature matches found in `data`.

        Matching is case-insensitive. The count is recorded under
        `filename` in self.results. Empty `data` returns 0 (an int, like
        every other return path) and records nothing.
        """
        if not data:
            return 0
        match_count = len(self._SIGNATURES.findall(data))
        self.results.append({"filename": filename, "value": match_count})
        return match_count

    def sort(self):
        """Order results from most to fewest matches and attach ranks."""
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the `count` files with the most signature matches."""
        print("\n[[ Top %i signature match counts ]]" % (count))
        # max() keeps a negative count from turning into a tail slice.
        for entry in self.results[:max(count, 0)]:
            print(' {0:>7} {1}'.format(entry["value"], entry["filename"]))
        return
class Compression:
    """Compute the zlib compression ratio of file contents."""

    def __init__(self):
        """Start with an empty list of per-file results."""
        self.results = []

    def calculate(self, data, filename):
        """Return len(zlib.compress(data)) / len(data).

        The ratio is recorded under `filename` in self.results. Empty
        `data` returns 0 (an int, like the other test classes) and records
        nothing.
        """
        if not data:
            return 0
        compressed = zlib.compress(data)
        ratio = float(len(compressed)) / float(len(data))
        self.results.append({"filename": filename, "value": ratio})
        return ratio

    def sort(self):
        """Order results from highest to lowest ratio and attach ranks."""
        self.results.sort(key=lambda item: item["value"])
        self.results.reverse()
        self.results = resultsAddRank(self.results)

    def printer(self, count):
        """Print the `count` files with the highest compression ratio."""
        print("\n[[ Top %i compression match counts ]]" % (count))
        # max() keeps a negative count from turning into a tail slice.
        for entry in self.results[:max(count, 0)]:
            print(' {0:>7.4f} {1}'.format(entry["value"], entry["filename"]))
        return
def resultsAddRank(results):
    """Attach a competition-style "rank" to each entry of a sorted list.

    `results` is a list of dicts that each have a "value" key, already
    sorted in the desired order. Consecutive entries with equal values share
    a rank, and the next distinct value takes its 1-based position as its
    rank (e.g. 1, 2, 2, 4). The dicts are updated in place; a new list
    holding the same dicts is returned.
    """
    # A unique sentinel marks "no previous value yet". Testing truthiness
    # instead would treat a legitimate previous value of 0 as "no previous
    # value", leaving the rank stuck when the values move on from 0.
    previousValue = object()
    rank = 1
    newList = []
    for offset, entry in enumerate(results, 1):
        if entry["value"] != previousValue:
            # For the first entry offset is 1, so the rank starts at 1.
            rank = offset
        entry["rank"] = rank
        newList.append(entry)
        previousValue = entry["value"]
    return newList
class SearchFile:
    """Walk a directory tree and yield the contents of matching files."""

    def search_file_path(self, args, valid_regex):
        """Yield (data, filename) for each matching file below args[0].

        A file matches when `valid_regex` finds a match in its base name and
        it is larger than 60 bytes. `data` is the raw content read in binary
        mode, or False when the file could not be sized or read; in that
        case a message is printed and the walk continues. `filename` is the
        walk root joined with the base name.
        """
        for root, dirs, files in os.walk(args[0]):
            for name in files:
                if not valid_regex.search(name):
                    continue
                filename = os.path.join(root, name)
                try:
                    # getsize() is inside the try so a vanished file or a
                    # broken symlink is reported, not fatal to the scan.
                    if os.path.getsize(filename) <= 60:
                        continue
                    # The with-block closes the handle promptly.
                    with open(filename, 'rb') as handle:
                        data = handle.read()
                except (IOError, OSError):
                    data = False
                    print("Could not read file :: %s/%s" % (root, name))
                yield data, filename
if __name__ == "__main__":
    # Script entry point: parse the options, run the selected tests against
    # every matching file, optionally write a CSV, then print the per-test
    # and cumulative rankings.

    # Wall-clock timestamp. time.clock() reports CPU time on Unix and was
    # removed in Python 3.8, so time.time() is used for the scan duration.
    timeStart = time.time()

    print(r"""
) ( (
( /( )\ ))\ )
)\()) ( (()/(()/(
((_)\ ))\ ( /(_))(_))
_((_)/((_))\(_))(_))
| \| (_)) ((_) _ \_ _|
| .` / -_) _ \ _/| |
|_|\_\___\___/_| |___| Ver. *.USEGIT
""")

    parser = OptionParser(usage="usage: %prog [options] <start directory> <OPTIONAL: filename regex>",
                          version="%prog 1.0")
    parser.add_option("-c", "--csv",
                      action="store",
                      dest="is_csv",
                      default=False,
                      help="generate CSV outfile",
                      metavar="FILECSV")
    parser.add_option("-a", "--all",
                      action="store_true",
                      dest="is_all",
                      default=False,
                      help="Run all (useful) tests [Entropy, Longest Word, IC, Signature]",)
    parser.add_option("-z", "--zlib",
                      action="store_true",
                      dest="is_zlib",
                      default=False,
                      help="Run compression Test",)
    parser.add_option("-e", "--entropy",
                      action="store_true",
                      dest="is_entropy",
                      default=False,
                      help="Run entropy Test",)
    parser.add_option("-l", "--longestword",
                      action="store_true",
                      dest="is_longest",
                      default=False,
                      help="Run longest word test",)
    parser.add_option("-i", "--ic",
                      action="store_true",
                      dest="is_ic",
                      default=False,
                      help="Run IC test",)
    parser.add_option("-s", "--signature",
                      action="store_true",
                      dest="is_signature",
                      default=False,
                      help="Run signature test",)
    parser.add_option("-A", "--auto",
                      action="store_true",
                      dest="is_auto",
                      default=False,
                      help="Run auto file extension tests",)
    parser.add_option("-u", "--unicode",
                      action="store_true",
                      dest="ignore_unicode",
                      default=False,
                      help="Skip over unicode-y/UTF'y files",)

    (options, args) = parser.parse_args()

    # Error on invalid number of arguments
    if len(args) < 1:
        parser.print_help()
        print("")
        sys.exit()

    # Error on an invalid path
    if not os.path.exists(args[0]):
        parser.error("Invalid path")

    # File-name filter: --auto overrides a user-supplied expression; with
    # neither given, every file name matches.
    if options.is_auto:
        valid_regex = re.compile(r'(\.php|\.asp|\.aspx|\.scath|\.bash|\.zsh|\.csh|\.tsch|\.pl|\.py|\.txt|\.cgi|\.cfm|\.htaccess)$')
    elif len(args) == 2:
        try:
            valid_regex = re.compile(args[1])
        except re.error:
            parser.error("Invalid regular expression")
    else:
        valid_regex = re.compile('.*')

    tests = []
    if options.is_all:
        tests.append(LanguageIC())
        tests.append(Entropy())
        tests.append(LongestWord())
        tests.append(SignatureNasty())
    else:
        if options.is_entropy:
            tests.append(Entropy())
        if options.is_longest:
            tests.append(LongestWord())
        if options.is_ic:
            tests.append(LanguageIC())
        if options.is_signature:
            tests.append(SignatureNasty())
        if options.is_zlib:
            tests.append(Compression())

    # Instantiate the Generator Class used for searching, opening, and reading files
    locator = SearchFile()

    # CSV output: one header row (file name plus one column per selected
    # test, in test order) followed by one row per scanned file.
    csv_array = []
    csv_header = ["filename"] + [test.__class__.__name__ for test in tests]

    # Grab the file and calculate each test against file
    fileCount = 0
    fileIgnoreCount = 0
    for data, filename in locator.search_file_path(args, valid_regex):
        if not data:
            # Unreadable (False) or empty content: nothing to analyse.
            continue

        if options.ignore_unicode:
            # Ignore files in which 10% or more of the bytes are above
            # 127. bytearray() yields integer byte values.
            asciiHighCount = sum(1 for byte in bytearray(data) if byte > 127)
            fileAsciiHighRatio = float(asciiHighCount) / float(len(data))
            if fileAsciiHighRatio >= .1:
                fileIgnoreCount = fileIgnoreCount + 1
                continue

        # a row array for the CSV
        csv_row = [filename]
        for test in tests:
            csv_row.append(test.calculate(data, filename))
        csv_array.append(csv_row)
        # Counted once per file, so the total below is a number of files.
        fileCount = fileCount + 1

    if options.is_csv:
        csv_array.insert(0, csv_header)
        # "wb" is the mode the Python 2 csv module expects; the with-block
        # makes sure the output is flushed and closed.
        with open(options.is_csv, "wb") as csv_file:
            csv.writer(csv_file).writerows(csv_array)

    timeFinish = time.time()

    # Print some stats
    print("\n[[ Total files scanned: %i ]]" % (fileCount))
    print("[[ Total files ignored: %i ]]" % (fileIgnoreCount))
    print("[[ Scan Time: %f seconds ]]" % (timeFinish - timeStart))

    # Print top rank lists and sum each file's rank across all tests; the
    # lowest cumulative rank is listed first.
    rank_list = {}
    for test in tests:
        test.sort()
        test.printer(10)
        for entry in test.results:
            rank_list[entry["filename"]] = rank_list.get(entry["filename"], 0) + entry["rank"]

    rank_sorted = sorted(rank_list.items(), key=lambda x: x[1])

    print("\n[[ Top cumulative ranked files ]]")
    for ranked_name, ranked_total in rank_sorted[:10]:
        print(' {0:>7} {1}'.format(ranked_total, ranked_name))