neopi.py

   1 #!/usr/bin/python
   2 # Name: neopi.py
   3 # Description: Utility to scan a file path for encrypted and obfuscated files
   4 # Authors: Ben Hagen (ben.hagen@neohapsis.com)
   5 #                  Scott Behrens (scott.behrens@neohapsis.com)
   6 #
   7 # Date: 11/4/2010
   8 # Copyright: Neohapsis Open Source blah Blah
   9 #
  10
# TODO: add try/except handling for bad paths, bad filenames and invalid regular expressions
  12
  13 # Library imports
  14 import math
  15 import sys
  16 import os
  17 import re
  18 import zlib
  19 import csv
  20 from collections import defaultdict
  21 from optparse import OptionParser
  22
  23 class LanguageIC:
  24         """Class that calculates a file's Index of Coincidence as
  25         as well as a a subset of files average Index of Coincidence.
  26         """
  27         def __init__(self):
  28                 """Initialize results arrays as well as character counters."""
  29                 self.char_count =  defaultdict(int)
  30                 self.total_char_count = 0
  31                 self.ic_results = []
  32                 self.ic_total_results = ""
  33
  34         def caculate_char_count(self,data):
  35                 """Method to calculate character counts for a particular data file."""
  36                 if not data:
  37                         return 0
  38
  39                 for x in range(256):
  40                         char = chr(x)
  41                         charcount = data.count(char)
  42                         self.char_count[char] += charcount
  43                         self.total_char_count += charcount
  44
  45                 return
  46
  47         def caculate_IC(self):
  48                 """Calculate the Index of Coincidence for the self variables"""
  49                 total = 0
  50                 for val in self.char_count.values():
  51
  52                         if val == 0:
  53                                 continue
  54                         total += val * (val-1)
  55
  56                 try:
  57                         ic_total =      float(total)/(self.total_char_count * (self.total_char_count - 1))
  58                 except:
  59                         ic_total = 0
  60                 self.ic_total_results = ic_total
  61                 return
  62
  63         def caculate(self,data,filename):
  64                 """Calculate the Index of Coincidence for a file and append to self.ic_results array"""
  65                 if not data:
  66                         return 0
  67                 char_count = 0
  68                 total_char_count = 0
  69
  70                 for x in range(256):
  71                         char = chr(x)
  72                         charcount = data.count(char)
  73                         char_count += charcount * (charcount - 1)
  74                         total_char_count += charcount
  75
  76                 ic = float(char_count)/(total_char_count * (total_char_count - 1))
  77                 self.ic_results.append({"filename":filename, "IC":ic})
  78                 # Call method to caculate_char_count and append to total_char_count
  79                 self.caculate_char_count(data)
  80                 return ic
  81
  82         def printer(self):
  83                 """Print the average IC for searchpath and the top 10 lowest Index of Coincidence files."""
  84                 self.ic_results.sort(key=lambda item: item["IC"])
  85                 top_ten = self.ic_results[0:10]
  86                 # Calculate the Total IC for a Search
  87                 self.caculate_IC()
  88                 ic_list = []
  89                 print ""
  90                 print "[[ Average IC for Search ]]"
  91                 print self.ic_total_results
  92                 print ""
  93                 print "[[ Top 10 IC files ]]"
  94                 x = 9
  95                 for file in top_ten:
  96                         print ' {0:>7.4f}        {1}'.format(file["IC"], file["filename"])
  97                         results = file["filename"], x
  98                         ic_list.append(results)
  99                         x = x - 1
 100                 return ic_list
 101
 102 class Entropy:
 103         """Class that calculates a file's Entropy."""
 104
 105         def __init__(self):
 106                 """Instantiate the entropy_results array."""
 107                 self.entropy_results = []
 108
 109         def caculate(self,data,filename):
 110                 """Calculate the entropy for 'data' and append result to entropy_results array."""
 111
 112                 if not data:
 113                         return 0
 114                 entropy = 0
 115                 for x in range(256):
 116                         p_x = float(data.count(chr(x)))/len(data)
 117                         if p_x > 0:
 118                                 entropy += - p_x * math.log(p_x, 2)
 119                 self.entropy_results.append({"filename":filename, "entropy":entropy})
 120                 return entropy
 121
 122         def printer(self):
 123                 """Print the top 10 entropic files for a given search"""
 124                 self.entropy_results.sort(key=lambda item: item["entropy"])
 125                 top_ten = self.entropy_results[-10:]
 126                 top_ten.reverse()
 127                 entropy_list = []
 128
 129                 print ""
 130                 print "[[ Top 10 entropic files ]]"
 131                 x = 9
 132                 for file in top_ten:
 133                         print ' {0:>7.4f}        {1}'.format(file["entropy"], file["filename"])
 134                         results = file["filename"], x
 135                         entropy_list.append(results)
 136                         x = x - 1
 137                 return entropy_list
 138
 139 class LongestWord:
 140         """Class that determines the longest word for a particular file."""
 141         def __init__(self):
 142                 """Instantiate the longestword_results array."""
 143                 self.longestword_results = []
 144
 145         def caculate(self,data,filename):
 146                 """Find the longest word in a string and append to longestword_results array"""
 147
 148                 if not data:
 149                         return "", 0
 150
 151                 longest = 0
 152                 longest_word = ""
 153                 words = re.split("[\s,\n,\r]", data)
 154                 if words:
 155                         for word in words:
 156                                 length = len(word)
 157                                 if length > longest:
 158                                         longest = length
 159                                         longest_word = word
 160                 self.longestword_results.append({"filename":filename, "wordlongest":longest})
 161                 return longest
 162
 163         def printer(self):
 164                 """Print the top 10 longest word files for a given search"""
 165                 self.longestword_results.sort(key=lambda item: item["wordlongest"])
 166                 top_ten = self.longestword_results[-10:]
 167                 top_ten.reverse()
 168                 longestword_list = []
 169
 170                 print ""
 171                 print "[[ Top 10 longest word files ]]"
 172                 x = 9
 173                 for file in top_ten:
 174                         print ' {0:>7}    {1}'.format(file["wordlongest"], file["filename"])
 175                         results = file["filename"], x
 176                         longestword_list.append(results)
 177                         x = x - 1
 178                 return longestword_list
 179
 180 class SearchFile:
 181         """Generator that searches a given filepath with an optional regular
 182         expression and returns the filepath and filename"""
 183         def search_file_path(self, args, valid_regex):
 184                 for root, dirs, files in os.walk(args[0]):
 185                         for file in files:
 186                                 filename = os.path.join(root, file)
 187                                 if (valid_regex.search(file) and os.path.getsize(filename) > 60):
 188                                         try:
 189                                                 data = open(root + "/" + file, 'rb').read()
 190                                         except:
 191                                                 data = False
 192                                                 print "Could not read file :: %s/%s" % (root, file)
 193                                         yield data, filename
 194 class PrintRank:
 195         """bob"""
 196         def print_rank(self, top_ten):
 197
 198                 files = defaultdict(int)
 199                 for list in top_ten:
 200                         for file, rank in list:
 201                                 files[str(file)] += int(rank)
 202
 203                 sorted_top_ten =  sorted(files.items(), key=lambda k: k[1], reverse=True)
 204                 top_ten = sorted_top_ten[0:10]
 205                 print "[[ Highest Rank Files Based on test results ]]"
 206                 # print ' {0:>7}        {1}'.format("Rank", "Filename")
 207
 208                 for file in top_ten:
 209                         #print file[0], "%" +
 210                         print ' {0:>7}    {1}'.format(str(int((float(file[1])/30) * 100)) + "%", file[0])
 211
 212                 return
 213
 214 if __name__ == "__main__":
 215         """Parse all the options"""
 216         parser = OptionParser(usage="usage: %prog [options] <start directory> <OPTIONAL: filename regex>",
 217                                                   version="%prog 1.0")
 218         parser.add_option("-C", "--csv",
 219                                           action="store",
 220                                           dest="is_csv",
 221                                           default=False,
 222                                           help="generate CSV outfile",
 223                                           metavar="FILECSV")
 224         parser.add_option("-a", "--all",
 225                                           action="store_true",
 226                                           dest="is_all",
 227                                           default=False,
 228                                           help="Run all tests [Entropy, Longest Word, Compression]",)
 229         parser.add_option("-e", "--entropy",
 230                                           action="store_true",
 231                                           dest="is_entropy",
 232                                           default=False,
 233                                           help="Run entropy Test",)
 234         parser.add_option("-l", "--longestword",
 235                                           action="store_true",
 236                                           dest="is_longest",
 237                                           default=False,
 238                                           help="Run longest word test",)
 239         parser.add_option("-c", "--ic",
 240                                           action="store_true",
 241                                           dest="is_ic",
 242                                           default=False,
 243                                           help="Run IC test",)
 244         parser.add_option("-A", "--auto",
 245                                           action="store_true",
 246                                           dest="is_auto",
 247                                           default=False,
 248                                           help="Run auto file extension tests",)
 249
 250         (options, args) = parser.parse_args()
 251
 252         # Error on invalid number of arguements
 253         if len(args) < 1:
 254                 parser.error("wrong number of arguments")
 255
 256         # Error on an invalid path
 257         if os.path.exists(args[0]) == False:
 258                 parser.error("invalid path")
 259
 260         valid_regex = ""
 261         if (len(args) == 2 and options.is_auto is False):
 262                 valid_regex = re.compile(args[1])
 263         else:
 264                 valid_regex = re.compile('.*')
 265         tests = []
 266
 267         if options.is_auto:
 268                 valid_regex = re.compile('(\.php|\.asp|\.aspx|\.sh|\.bash|\.zsh|\.csh|\.tsch|\.pl|\.py|\.txt|\.cgi|\.cfm)$')
 269
 270         if options.is_all:
 271                 tests.append(LanguageIC())
 272                 tests.append(Entropy())
 273                 tests.append(LongestWord())
 274         else:
 275                 if options.is_entropy:
 276                         tests.append(Entropy())
 277
 278                 if options.is_longest:
 279                         tests.append(LongestWord())
 280
 281                 if options.is_ic:
 282                         tests.append(LanguageIC())
 283
 284         # Instantiate the Generator Class used for searching, opening, and reading files
 285         locator = SearchFile()
 286
 287         # CSV file output array
 288         csv_array = []
 289         csv_header = ["filename"]
 290
 291         # Grab the file and calculate each test against file
 292         for data,filename in locator.search_file_path(args, valid_regex):
 293                 if data:
 294                         # a row array for the CSV
 295                         csv_row = []
 296                         csv_row.append(filename)
 297                         for test in tests:
 298                                 calculated_value = test.caculate(data,filename)
 299                                 # Make the header row if it hasn't been fully populated, +1 here to account for filename column
 300                                 if len(csv_header) < len(tests) + 1:
 301                                         csv_header.append(test.__class__.__name__)
 302                                 csv_row.append(calculated_value)
 303                         csv_array.append(csv_row)
 304
 305         if options.is_csv:
 306                 csv_array.insert(0,csv_header)
 307                 fileOutput = csv.writer(open(options.is_csv, "wb"))
 308                 fileOutput.writerows(csv_array)
 309
 310         top_ten = []
 311         # For each test print the top ten results for that test.
 312         for test in tests:
 313                 top_ten.append(test.printer())
 314         print ""
 315
 316         printer = PrintRank()
 317
 318         printer.print_rank(top_ten)
 319