neopi.py

   1 #!/usr/bin/python
   2 # Name: neopi.py
   3 # Description: Utility to scan a file path for encrypted and obfuscated files
   4 # Authors: Ben Hagen (ben.hagen@neohapsis.com)
   5 #               Scott Behrens (scott.behrens@neohapsis.com)
   6 #
   7 # Date: 11/4/2010
   8 #
   9
  10 # TODO: add error handling (try/except) for bad paths, bad filenames and invalid regular expressions
  11
  12 # Library imports
  13 import math
  14 import sys
  15 import os
  16 import re
  17 import zlib
  18 import csv
  19 from collections import defaultdict
  20 from optparse import OptionParser
  21
  22 class LanguageIC:
  23         """Class that calculates a file's Index of Coincidence as
  24         as well as a a subset of files average Index of Coincidence.
  25         """
  26         def __init__(self):
  27                 """Initialize results arrays as well as character counters."""
  28                 self.char_count =  defaultdict(int)
  29                 self.total_char_count = 0
  30                 self.ic_results = []
  31                 self.ic_total_results = ""
  32
  33         def caculate_char_count(self,data):
  34                 """Method to calculate character counts for a particular data file."""
  35                 if not data:
  36                         return 0
  37
  38                 for x in range(256):
  39                         char = chr(x)
  40                         charcount = data.count(char)
  41                         self.char_count[char] += charcount
  42                         self.total_char_count += charcount
  43
  44                 return
  45
  46         def caculate_IC(self):
  47                 """Calculate the Index of Coincidence for the self variables"""
  48                 total = 0
  49                 for val in self.char_count.values():
  50
  51                         if val == 0:
  52                                 continue
  53                         total += val * (val-1)
  54
  55                 try:
  56                         ic_total =      float(total)/(self.total_char_count * (self.total_char_count - 1))
  57                 except:
  58                         ic_total = 0
  59                 self.ic_total_results = ic_total
  60                 return
  61
  62         def caculate(self,data,filename):
  63                 """Calculate the Index of Coincidence for a file and append to self.ic_results array"""
  64                 if not data:
  65                         return 0
  66                 char_count = 0
  67                 total_char_count = 0
  68
  69                 for x in range(256):
  70                         char = chr(x)
  71                         charcount = data.count(char)
  72                         char_count += charcount * (charcount - 1)
  73                         total_char_count += charcount
  74
  75                 ic = float(char_count)/(total_char_count * (total_char_count - 1))
  76                 self.ic_results.append({"filename":filename, "IC":ic})
  77                 # Call method to caculate_char_count and append to total_char_count
  78                 self.caculate_char_count(data)
  79                 return ic
  80
  81         def printer(self):
  82                 """Print the average IC for searchpath and the top 10 lowest Index of Coincidence files."""
  83                 self.ic_results.sort(key=lambda item: item["IC"])
  84                 top_ten = self.ic_results[0:10]
  85                 # Calculate the Total IC for a Search
  86                 self.caculate_IC()
  87                 ic_list = []
  88                 print ""
  89                 print "[[ Average IC for Search ]]"
  90                 print self.ic_total_results
  91                 print ""
  92                 print "[[ Top 10 IC files ]]"
  93                 x = 9
  94                 for file in top_ten:
  95                         print ' {0:>7.4f}        {1}'.format(file["IC"], file["filename"])
  96                         results = file["filename"], x
  97                         ic_list.append(results)
  98                         x = x - 1
  99                 return ic_list
 100
 101 class Entropy:
 102         """Class that calculates a file's Entropy."""
 103
 104         def __init__(self):
 105                 """Instantiate the entropy_results array."""
 106                 self.entropy_results = []
 107
 108         def caculate(self,data,filename):
 109                 """Calculate the entropy for 'data' and append result to entropy_results array."""
 110
 111                 if not data:
 112                         return 0
 113                 entropy = 0
 114                 for x in range(256):
 115                         p_x = float(data.count(chr(x)))/len(data)
 116                         if p_x > 0:
 117                                 entropy += - p_x * math.log(p_x, 2)
 118                 self.entropy_results.append({"filename":filename, "entropy":entropy})
 119                 return entropy
 120
 121         def printer(self):
 122                 """Print the top 10 entropic files for a given search"""
 123                 self.entropy_results.sort(key=lambda item: item["entropy"])
 124                 top_ten = self.entropy_results[-10:]
 125                 top_ten.reverse()
 126                 entropy_list = []
 127
 128                 print ""
 129                 print "[[ Top 10 entropic files ]]"
 130                 x = 9
 131                 for file in top_ten:
 132                         print ' {0:>7.4f}        {1}'.format(file["entropy"], file["filename"])
 133                         results = file["filename"], x
 134                         entropy_list.append(results)
 135                         x = x - 1
 136                 return entropy_list
 137
 138 class LongestWord:
 139         """Class that determines the longest word for a particular file."""
 140         def __init__(self):
 141                 """Instantiate the longestword_results array."""
 142                 self.longestword_results = []
 143
 144         def caculate(self,data,filename):
 145                 """Find the longest word in a string and append to longestword_results array"""
 146
 147                 if not data:
 148                         return "", 0
 149
 150                 longest = 0
 151                 longest_word = ""
 152                 words = re.split("[\s,\n,\r]", data)
 153                 if words:
 154                         for word in words:
 155                                 length = len(word)
 156                                 if length > longest:
 157                                         longest = length
 158                                         longest_word = word
 159                 self.longestword_results.append({"filename":filename, "wordlongest":longest})
 160                 return longest
 161
 162         def printer(self):
 163                 """Print the top 10 longest word files for a given search"""
 164                 self.longestword_results.sort(key=lambda item: item["wordlongest"])
 165                 top_ten = self.longestword_results[-10:]
 166                 top_ten.reverse()
 167                 longestword_list = []
 168
 169                 print ""
 170                 print "[[ Top 10 longest word files ]]"
 171                 x = 9
 172                 for file in top_ten:
 173                         print ' {0:>7}    {1}'.format(file["wordlongest"], file["filename"])
 174                         results = file["filename"], x
 175                         longestword_list.append(results)
 176                         x = x - 1
 177                 return longestword_list
 178
 179 class SearchFile:
 180         """Generator that searches a given filepath with an optional regular
 181         expression and returns the filepath and filename"""
 182         def search_file_path(self, args, valid_regex):
 183                 for root, dirs, files in os.walk(args[0]):
 184                         for file in files:
 185                                 filename = os.path.join(root, file)
 186                                 if (valid_regex.search(file) and os.path.getsize(filename) > 60):
 187                                         try:
 188                                                 data = open(root + "/" + file, 'rb').read()
 189                                         except:
 190                                                 data = False
 191                                                 print "Could not read file :: %s/%s" % (root, file)
 192                                         yield data, filename
 193 class PrintRank:
 194         """bob"""
 195         def print_rank(self, top_ten):
 196
 197                 files = defaultdict(int)
 198                 for list in top_ten:
 199                         for file, rank in list:
 200                                 files[str(file)] += int(rank)
 201
 202                 sorted_top_ten =  sorted(files.items(), key=lambda k: k[1], reverse=True)
 203                 top_ten = sorted_top_ten[0:10]
 204                 print "[[ Highest Rank Files Based on test results ]]"
 205                 # print ' {0:>7}        {1}'.format("Rank", "Filename")
 206
 207                 for file in top_ten:
 208                         #print file[0], "%" +
 209                         print ' {0:>7}    {1}'.format(str(int((float(file[1])/30) * 100)) + "%", file[0])
 210
 211                 return
 212
 213 if __name__ == "__main__":
 214         """Parse all the options"""
 215         parser = OptionParser(usage="usage: %prog [options] <start directory> <OPTIONAL: filename regex>",
 216                                                   version="%prog 1.0")
 217         parser.add_option("-C", "--csv",
 218                                           action="store",
 219                                           dest="is_csv",
 220                                           default=False,
 221                                           help="generate CSV outfile",
 222                                           metavar="FILECSV")
 223         parser.add_option("-a", "--all",
 224                                           action="store_true",
 225                                           dest="is_all",
 226                                           default=False,
 227                                           help="Run all tests [Entropy, Longest Word, Compression]",)
 228         parser.add_option("-e", "--entropy",
 229                                           action="store_true",
 230                                           dest="is_entropy",
 231                                           default=False,
 232                                           help="Run entropy Test",)
 233         parser.add_option("-l", "--longestword",
 234                                           action="store_true",
 235                                           dest="is_longest",
 236                                           default=False,
 237                                           help="Run longest word test",)
 238         parser.add_option("-c", "--ic",
 239                                           action="store_true",
 240                                           dest="is_ic",
 241                                           default=False,
 242                                           help="Run IC test",)
 243         parser.add_option("-A", "--auto",
 244                                           action="store_true",
 245                                           dest="is_auto",
 246                                           default=False,
 247                                           help="Run auto file extension tests",)
 248
 249         (options, args) = parser.parse_args()
 250
 251         # Error on invalid number of arguements
 252         if len(args) < 1:
 253                 parser.error("wrong number of arguments")
 254
 255         # Error on an invalid path
 256         if os.path.exists(args[0]) == False:
 257                 parser.error("Invalid path")
 258
 259         valid_regex = ""
 260         if (len(args) == 2 and options.is_auto is False):
 261                 try:
 262                         valid_regex = re.compile(args[1])
 263                 except:
 264                         parser.error("Invalid regular expression")
 265         else:
 266                 valid_regex = re.compile('.*')
 267         tests = []
 268
 269         if options.is_auto:
 270                 valid_regex = re.compile('(\.php|\.asp|\.aspx|\.sh|\.bash|\.zsh|\.csh|\.tsch|\.pl|\.py|\.txt|\.cgi|\.cfm)$')
 271
 272         if options.is_all:
 273                 tests.append(LanguageIC())
 274                 tests.append(Entropy())
 275                 tests.append(LongestWord())
 276         else:
 277                 if options.is_entropy:
 278                         tests.append(Entropy())
 279
 280                 if options.is_longest:
 281                         tests.append(LongestWord())
 282
 283                 if options.is_ic:
 284                         tests.append(LanguageIC())
 285
 286         # Instantiate the Generator Class used for searching, opening, and reading files
 287         locator = SearchFile()
 288
 289         # CSV file output array
 290         csv_array = []
 291         csv_header = ["filename"]
 292
 293         # Grab the file and calculate each test against file
 294         for data,filename in locator.search_file_path(args, valid_regex):
 295                 if data:
 296                         # a row array for the CSV
 297                         csv_row = []
 298                         csv_row.append(filename)
 299                         for test in tests:
 300                                 calculated_value = test.caculate(data,filename)
 301                                 # Make the header row if it hasn't been fully populated, +1 here to account for filename column
 302                                 if len(csv_header) < len(tests) + 1:
 303                                         csv_header.append(test.__class__.__name__)
 304                                 csv_row.append(calculated_value)
 305                         csv_array.append(csv_row)
 306
 307         if options.is_csv:
 308                 csv_array.insert(0,csv_header)
 309                 fileOutput = csv.writer(open(options.is_csv, "wb"))
 310                 fileOutput.writerows(csv_array)
 311
 312         top_ten = []
 313         # For each test print the top ten results for that test.
 314         for test in tests:
 315                 top_ten.append(test.printer())
 316         print ""
 317
 318         printer = PrintRank()
 319
 320         printer.print_rank(top_ten)
 321