neopi.py

   1 #!/usr/bin/python
   2 # Name: neopi.py
   3 # Description: Utility to scan a file path for encrypted and obfuscated files
   4 # Authors: Ben Hagen (ben.hagen@neohapsis.com)
   5 #                Scott Behrens (scott.behrens@neohapsis.com)
   6 #
   7 # Date: 11/4/2010
   8 #
   9 # pep-0008 - Is stupid. TABS FO'EVER!
  10
  11 # Try catch regular expressions/bad path/bad filename/bad regex/
  12
  13 # Library imports
  14 import math
  15 import sys
  16 import os
  17 import re
  18 import csv
  19 from collections import defaultdict
  20 from optparse import OptionParser
  21
  22 class LanguageIC:
  23         """Class that calculates a file's Index of Coincidence as
  24         as well as a a subset of files average Index of Coincidence.
  25         """
  26         def __init__(self):
  27                 """Initialize results arrays as well as character counters."""
  28                 self.char_count =  defaultdict(int)
  29                 self.total_char_count = 0
  30                 self.results = []
  31                 self.ic_total_results = ""
  32
  33         def calculate_char_count(self,data):
  34                 """Method to calculate character counts for a particular data file."""
  35                 if not data:
  36                         return 0
  37
  38                 for x in range(256):
  39                         char = chr(x)
  40                         charcount = data.count(char)
  41                         self.char_count[char] += charcount
  42                         self.total_char_count += charcount
  43
  44                 return
  45
  46         def calculate_IC(self):
  47                 """Calculate the Index of Coincidence for the self variables"""
  48                 total = 0
  49                 for val in self.char_count.values():
  50
  51                         if val == 0:
  52                                 continue
  53                         total += val * (val-1)
  54
  55                 try:
  56                         ic_total =        float(total)/(self.total_char_count * (self.total_char_count - 1))
  57                 except:
  58                         ic_total = 0
  59                 self.ic_total_results = ic_total
  60                 return
  61
  62         def calculate(self,data,filename):
  63                 """Calculate the Index of Coincidence for a file and append to self.ic_results array"""
  64                 if not data:
  65                         return 0
  66                 char_count = 0
  67                 total_char_count = 0
  68
  69                 for x in range(256):
  70                         char = chr(x)
  71                         charcount = data.count(char)
  72                         char_count += charcount * (charcount - 1)
  73                         total_char_count += charcount
  74
  75                 ic = float(char_count)/(total_char_count * (total_char_count - 1))
  76                 self.results.append({"filename":filename, "value":ic})
  77                 # Call method to calculate_char_count and append to total_char_count
  78                 self.calculate_char_count(data)
  79                 return ic
  80
  81         def sort(self):
  82                 self.results.sort(key=lambda item: item["value"])
  83                 self.results = resultsAddRank(self.results)
  84
  85         def printer(self, count):
  86                 """Print the top signature count match files for a given search"""
  87                 # Calculate the Total IC for a Search
  88                 self.calculate_IC()
  89                 print "\n[[ Average IC for Search ]]"
  90                 print self.ic_total_results
  91                 print "\n[[ Top %i lowest IC files ]]" % (count)
  92                 for x in range(count):
  93                         print ' {0:>7.4f}               {1}'.format(self.results[x]["value"], self.results[x]["filename"])
  94                 return
  95
  96 class Entropy:
  97         """Class that calculates a file's Entropy."""
  98
  99         def __init__(self):
 100                 """Instantiate the entropy_results array."""
 101                 self.results = []
 102
 103         def calculate(self,data,filename):
 104                 """Calculate the entropy for 'data' and append result to entropy_results array."""
 105
 106                 if not data:
 107                         return 0
 108                 entropy = 0
 109                 for x in range(256):
 110                         p_x = float(data.count(chr(x)))/len(data)
 111                         if p_x > 0:
 112                                 entropy += - p_x * math.log(p_x, 2)
 113                 self.results.append({"filename":filename, "value":entropy})
 114                 return entropy
 115
 116         def sort(self):
 117                 self.results.sort(key=lambda item: item["value"])
 118                 self.results.reverse()
 119                 self.results = resultsAddRank(self.results)
 120
 121         def printer(self, count):
 122                 """Print the top signature count match files for a given search"""
 123                 print "\n[[ Top %i entropic files for a given search ]]" % (count)
 124                 for x in range(count):
 125                         print ' {0:>7.4f}               {1}'.format(self.results[x]["value"], self.results[x]["filename"])
 126                 return
 127
 128 class LongestWord:
 129         """Class that determines the longest word for a particular file."""
 130         def __init__(self):
 131                 """Instantiate the longestword_results array."""
 132                 self.results = []
 133
 134         def calculate(self,data,filename):
 135                 """Find the longest word in a string and append to longestword_results array"""
 136                 if not data:
 137                         return "", 0
 138                 longest = 0
 139                 longest_word = ""
 140                 words = re.split("[\s,\n,\r]", data)
 141                 if words:
 142                         for word in words:
 143                                 length = len(word)
 144                                 if length > longest:
 145                                         longest = length
 146                                         longest_word = word
 147                 self.results.append({"filename":filename, "value":longest})
 148                 return longest
 149
 150         def sort(self):
 151                 self.results.sort(key=lambda item: item["value"])
 152                 self.results.reverse()
 153                 self.results = resultsAddRank(self.results)
 154
 155         def printer(self, count):
 156                 """Print the top signature count match files for a given search"""
 157                 print "\n[[ Top %i longest word files ]]" % (count)
 158                 for x in range(count):
 159                         print ' {0:>7}          {1}'.format(self.results[x]["value"], self.results[x]["filename"])
 160                 return
 161
 162 class SignatureNasty:
 163         """Generator that searches a given file for nasty expressions"""
 164
 165         def __init__(self):
 166                 """Instantiate the longestword_results array."""
 167                 self.results = []
 168
 169         def calculate(self, data, filename):
 170                 if not data:
 171                         return "", 0
 172                 # Lots taken from the wonderful post at http://stackoverflow.com/questions/3115559/exploitable-php-functions
 173                 valid_regex = re.compile('(eval\(|base64_decode|python_eval|exec\(|passthru\(|popen\(|proc_open\(|pcntl_|assert\()')
 174                 matches = re.findall(valid_regex, data)
 175                 self.results.append({"filename":filename, "value":len(matches)})
 176                 return len(matches)
 177
 178         def sort(self):
 179                 self.results.sort(key=lambda item: item["value"])
 180                 self.results.reverse()
 181                 self.results = resultsAddRank(self.results)
 182
 183         def printer(self, count):
 184                 """Print the top signature count match files for a given search"""
 185                 print "\n[[ Top %i signature match counts ]]" % (count)
 186                 for x in range(count):
 187                         print ' {0:>7}          {1}'.format(self.results[x]["value"], self.results[x]["filename"])
 188                 return
 189
 190 def resultsAddRank(results):
 191         rank = 1
 192         offset = 1
 193         previousValue = False
 194         newList = []
 195         for file in results:
 196                 if (previousValue and previousValue != file["value"]):
 197                         rank = offset
 198                 file["rank"] = rank
 199                 newList.append(file)
 200                 previousValue = file["value"]
 201                 offset = offset + 1
 202         return newList
 203
 204 class SearchFile:
 205         """Generator that searches a given filepath with an optional regular
 206         expression and returns the filepath and filename"""
 207         def search_file_path(self, args, valid_regex):
 208                 for root, dirs, files in os.walk(args[0]):
 209                         for file in files:
 210                                 filename = os.path.join(root, file)
 211                                 if (valid_regex.search(file) and os.path.getsize(filename) > 60):
 212                                         try:
 213                                                 data = open(root + "/" + file, 'rb').read()
 214                                         except:
 215                                                 data = False
 216                                                 print "Could not read file :: %s/%s" % (root, file)
 217                                         yield data, filename
 218
 219 if __name__ == "__main__":
 220         """Parse all the options"""
 221         parser = OptionParser(usage="usage: %prog [options] <start directory> <OPTIONAL: filename regex>",
 222                                                   version="%prog 1.0")
 223         parser.add_option("-c", "--csv",
 224                                           action="store",
 225                                           dest="is_csv",
 226                                           default=False,
 227                                           help="generate CSV outfile",
 228                                           metavar="FILECSV")
 229         parser.add_option("-a", "--all",
 230                                           action="store_true",
 231                                           dest="is_all",
 232                                           default=False,
 233                                           help="Run all tests [Entropy, Longest Word, IC, Signature]",)
 234         parser.add_option("-e", "--entropy",
 235                                           action="store_true",
 236                                           dest="is_entropy",
 237                                           default=False,
 238                                           help="Run entropy Test",)
 239         parser.add_option("-l", "--longestword",
 240                                           action="store_true",
 241                                           dest="is_longest",
 242                                           default=False,
 243                                           help="Run longest word test",)
 244         parser.add_option("-i", "--ic",
 245                                           action="store_true",
 246                                           dest="is_ic",
 247                                           default=False,
 248                                           help="Run IC test",)
 249         parser.add_option("-s", "--signature",
 250                                           action="store_true",
 251                                           dest="is_signature",
 252                                           default=False,
 253                                           help="Run signature test",)
 254         parser.add_option("-A", "--auto",
 255                                           action="store_true",
 256                                           dest="is_auto",
 257                                           default=False,
 258                                           help="Run auto file extension tests",)
 259
 260         (options, args) = parser.parse_args()
 261
 262         # Error on invalid number of arguements
 263         if len(args) < 1:
 264                 parser.error("Wrong number of arguments")
 265
 266         # Error on an invalid path
 267         if os.path.exists(args[0]) == False:
 268                 parser.error("Invalid path")
 269
 270         valid_regex = ""
 271         if (len(args) == 2 and options.is_auto is False):
 272                 try:
 273                         valid_regex = re.compile(args[1])
 274                 except:
 275                         parser.error("Invalid regular expression")
 276         else:
 277                 valid_regex = re.compile('.*')
 278         tests = []
 279
 280         if options.is_auto:
 281                 valid_regex = re.compile('(\.php|\.asp|\.aspx|\.scath|\.bash|\.zsh|\.csh|\.tsch|\.pl|\.py|\.txt|\.cgi|\.cfm)$')
 282
 283         if options.is_all:
 284                 tests.append(LanguageIC())
 285                 tests.append(Entropy())
 286                 tests.append(LongestWord())
 287                 tests.append(SignatureNasty())
 288         else:
 289                 if options.is_entropy:
 290                         tests.append(Entropy())
 291                 if options.is_longest:
 292                         tests.append(LongestWord())
 293                 if options.is_ic:
 294                         tests.append(LanguageIC())
 295                 if options.is_signature:
 296                         tests.append(SignatureNasty())
 297
 298         # Instantiate the Generator Class used for searching, opening, and reading files
 299         locator = SearchFile()
 300
 301         # CSV file output array
 302         csv_array = []
 303         csv_header = ["filename"]
 304
 305         # Grab the file and calculate each test against file
 306         for data, filename in locator.search_file_path(args, valid_regex):
 307                 if data:
 308                         # a row array for the CSV
 309                         csv_row = []
 310                         csv_row.append(filename)
 311                         for test in tests:
 312                                 calculated_value = test.calculate(data, filename)
 313                                 # Make the header row if it hasn't been fully populated, +1 here to account for filename column
 314                                 if len(csv_header) < len(tests) + 1:
 315                                         csv_header.append(test.__class__.__name__)
 316                                 csv_row.append(calculated_value)
 317                         csv_array.append(csv_row)
 318
 319         if options.is_csv:
 320                 csv_array.insert(0,csv_header)
 321                 fileOutput = csv.writer(open(options.is_csv, "wb"))
 322                 fileOutput.writerows(csv_array)
 323
 324         # Print top rank lists
 325         rank_list = {}
 326         for test in tests:
 327                 test.sort()
 328                 test.printer(10)
 329                 for file in test.results:
 330                         rank_list[file["filename"]] = rank_list.setdefault(file["filename"], 0) + file["rank"]
 331
 332         rank_sorted = sorted(rank_list.items(), key=lambda x: x[1])
 333
 334         print "\n[[ Top cumulative ranked files ]]"
 335         for x in range(10):
 336                 print ' {0:>7}          {1}'.format(rank_sorted[x][1], rank_sorted[x][0])
 337