neopi.py

   1 #!/usr/bin/python
   2 # Name: neopi.py
   3 # Description: Utility to scan a file path for encrypted and obfuscated files
   4 # Authors: Ben Hagen (ben.hagen@neohapsis.com)
   5 #                Scott Behrens (scott.behrens@neohapsis.com)
   6 #
   7 # Date: 11/4/2010
   8 #
   9 # pep-0008 - Is stupid. TABS FO'EVER!
  10
  11 # Try catch regular expressions/bad path/bad filename/bad regex/
  12
  13 # Library imports
  14 import math
  15 import sys
  16 import os
  17 import re
  18 import csv
  19 import zlib
  20 import time
  21 from collections import defaultdict
  22 from optparse import OptionParser
  23
  24 class LanguageIC:
  25         """Class that calculates a file's Index of Coincidence as
  26         as well as a a subset of files average Index of Coincidence.
  27         """
  28         def __init__(self):
  29                 """Initialize results arrays as well as character counters."""
  30                 self.char_count =  defaultdict(int)
  31                 self.total_char_count = 0
  32                 self.results = []
  33                 self.ic_total_results = ""
  34
  35         def calculate_char_count(self,data):
  36                 """Method to calculate character counts for a particular data file."""
  37                 if not data:
  38                         return 0
  39                 for x in range(256):
  40                         char = chr(x)
  41                         charcount = data.count(char)
  42                         self.char_count[char] += charcount
  43                         self.total_char_count += charcount
  44                 return
  45
  46         def calculate_IC(self):
  47                 """Calculate the Index of Coincidence for the self variables"""
  48                 total = 0
  49                 for val in self.char_count.values():
  50
  51                         if val == 0:
  52                                 continue
  53                         total += val * (val-1)
  54
  55                 try:
  56                         ic_total =        float(total)/(self.total_char_count * (self.total_char_count - 1))
  57                 except:
  58                         ic_total = 0
  59                 self.ic_total_results = ic_total
  60                 return
  61
  62         def calculate(self,data,filename):
  63                 """Calculate the Index of Coincidence for a file and append to self.ic_results array"""
  64                 if not data:
  65                         return 0
  66                 char_count = 0
  67                 total_char_count = 0
  68
  69                 for x in range(256):
  70                         char = chr(x)
  71                         charcount = data.count(char)
  72                         char_count += charcount * (charcount - 1)
  73                         total_char_count += charcount
  74
  75                 ic = float(char_count)/(total_char_count * (total_char_count - 1))
  76                 self.results.append({"filename":filename, "value":ic})
  77                 # Call method to calculate_char_count and append to total_char_count
  78                 self.calculate_char_count(data)
  79                 return ic
  80
  81         def sort(self):
  82                 self.results.sort(key=lambda item: item["value"])
  83                 self.results = resultsAddRank(self.results)
  84
  85         def printer(self, count):
  86                 """Print the top signature count match files for a given search"""
  87                 # Calculate the Total IC for a Search
  88                 self.calculate_IC()
  89                 print "\n[[ Average IC for Search ]]"
  90                 print self.ic_total_results
  91                 print "\n[[ Top %i lowest IC files ]]" % (count)
  92                 if (count > len(self.results)): count = len(self.results)
  93                 for x in range(count):
  94                         print ' {0:>7.4f}               {1}'.format(self.results[x]["value"], self.results[x]["filename"])
  95                 return
  96
  97 class Entropy:
  98         """Class that calculates a file's Entropy."""
  99
 100         def __init__(self):
 101                 """Instantiate the entropy_results array."""
 102                 self.results = []
 103
 104         def calculate(self,data,filename):
 105                 """Calculate the entropy for 'data' and append result to entropy_results array."""
 106
 107                 if not data:
 108                         return 0
 109                 entropy = 0
 110                 for x in range(256):
 111                         p_x = float(data.count(chr(x)))/len(data)
 112                         if p_x > 0:
 113                                 entropy += - p_x * math.log(p_x, 2)
 114                 self.results.append({"filename":filename, "value":entropy})
 115                 return entropy
 116
 117         def sort(self):
 118                 self.results.sort(key=lambda item: item["value"])
 119                 self.results.reverse()
 120                 self.results = resultsAddRank(self.results)
 121
 122         def printer(self, count):
 123                 """Print the top signature count match files for a given search"""
 124                 print "\n[[ Top %i entropic files for a given search ]]" % (count)
 125                 if (count > len(self.results)): count = len(self.results)
 126                 for x in range(count):
 127                         print ' {0:>7.4f}               {1}'.format(self.results[x]["value"], self.results[x]["filename"])
 128                 return
 129
 130 class LongestWord:
 131         """Class that determines the longest word for a particular file."""
 132         def __init__(self):
 133                 """Instantiate the longestword_results array."""
 134                 self.results = []
 135
 136         def calculate(self,data,filename):
 137                 """Find the longest word in a string and append to longestword_results array"""
 138                 if not data:
 139                         return "", 0
 140                 longest = 0
 141                 longest_word = ""
 142                 words = re.split("[\s,\n,\r]", data)
 143                 if words:
 144                         for word in words:
 145                                 length = len(word)
 146                                 if length > longest:
 147                                         longest = length
 148                                         longest_word = word
 149                 self.results.append({"filename":filename, "value":longest})
 150                 return longest
 151
 152         def sort(self):
 153                 self.results.sort(key=lambda item: item["value"])
 154                 self.results.reverse()
 155                 self.results = resultsAddRank(self.results)
 156
 157         def printer(self, count):
 158                 """Print the top signature count match files for a given search"""
 159                 print "\n[[ Top %i longest word files ]]" % (count)
 160                 if (count > len(self.results)): count = len(self.results)
 161                 for x in range(count):
 162                         print ' {0:>7}          {1}'.format(self.results[x]["value"], self.results[x]["filename"])
 163                 return
 164
 165 class SignatureNasty:
 166         """Generator that searches a given file for nasty expressions"""
 167
 168         def __init__(self):
 169                 """Instantiate the longestword_results array."""
 170                 self.results = []
 171
 172         def calculate(self, data, filename):
 173                 if not data:
 174                         return "", 0
 175                 # Lots taken from the wonderful post at http://stackoverflow.com/questions/3115559/exploitable-php-functions
 176                 valid_regex = re.compile('(eval\(|base64_decode|python_eval|exec\(|passthru|popen|proc_open|pcntl|assert\(|system\(|shell)', re.I)
 177                 matches = re.findall(valid_regex, data)
 178                 self.results.append({"filename":filename, "value":len(matches)})
 179                 return len(matches)
 180
 181         def sort(self):
 182                 self.results.sort(key=lambda item: item["value"])
 183                 self.results.reverse()
 184                 self.results = resultsAddRank(self.results)
 185
 186         def printer(self, count):
 187                 """Print the top signature count match files for a given search"""
 188                 print "\n[[ Top %i signature match counts ]]" % (count)
 189                 if (count > len(self.results)): count = len(self.results)
 190                 for x in range(count):
 191                         print ' {0:>7}          {1}'.format(self.results[x]["value"], self.results[x]["filename"])
 192                 return
 193
 194 class Compression:
 195         """Generator finds compression ratio"""
 196
 197         def __init__(self):
 198                 """Instantiate the results array."""
 199                 self.results = []
 200
 201         def calculate(self, data, filename):
 202                 if not data:
 203                         return "", 0
 204                 compressed = zlib.compress(data)
 205                 ratio = float(len(compressed)) / float(len(data))
 206                 self.results.append({"filename":filename, "value":ratio})
 207                 return ratio
 208
 209         def sort(self):
 210                 self.results.sort(key=lambda item: item["value"])
 211                 self.results.reverse()
 212                 self.results = resultsAddRank(self.results)
 213
 214         def printer(self, count):
 215                 """Print the top files for a given search"""
 216                 print "\n[[ Top %i compression match counts ]]" % (count)
 217                 if (count > len(self.results)): count = len(self.results)
 218                 for x in range(count):
 219                         print ' {0:>7.4f}               {1}'.format(self.results[x]["value"], self.results[x]["filename"])
 220                 return
 221
 222 def resultsAddRank(results):
 223         rank = 1
 224         offset = 1
 225         previousValue = False
 226         newList = []
 227         for file in results:
 228                 if (previousValue and previousValue != file["value"]):
 229                         rank = offset
 230                 file["rank"] = rank
 231                 newList.append(file)
 232                 previousValue = file["value"]
 233                 offset = offset + 1
 234         return newList
 235
 236 class SearchFile:
 237         """Generator that searches a given filepath with an optional regular
 238         expression and returns the filepath and filename"""
 239         def search_file_path(self, args, valid_regex):
 240                 for root, dirs, files in os.walk(args[0]):
 241                         for file in files:
 242                                 filename = os.path.join(root, file)
 243                                 if (valid_regex.search(file) and os.path.getsize(filename) > 60):
 244                                         try:
 245                                                 data = open(root + "/" + file, 'rb').read()
 246                                         except:
 247                                                 data = False
 248                                                 print "Could not read file :: %s/%s" % (root, file)
 249                                         yield data, filename
 250
 251 if __name__ == "__main__":
 252         """Parse all the options"""
 253
 254         timeStart = time.clock()
 255
 256         print """
 257             )         (   (
 258          ( /(         )\ ))\ )
 259          )\())  (    (()/(()/(
 260         ((_)\  ))\ (  /(_))(_))
 261          _((_)/((_))\(_))(_))
 262         | \| (_)) ((_) _ \_ _|
 263         | .` / -_) _ \  _/| |
 264         |_|\_\___\___/_| |___| Ver. *.USEGIT
 265         """
 266
 267         parser = OptionParser(usage="usage: %prog [options] <start directory> <OPTIONAL: filename regex>",
 268                                                   version="%prog 1.0")
 269         parser.add_option("-c", "--csv",
 270                                           action="store",
 271                                           dest="is_csv",
 272                                           default=False,
 273                                           help="generate CSV outfile",
 274                                           metavar="FILECSV")
 275         parser.add_option("-a", "--all",
 276                                           action="store_true",
 277                                           dest="is_all",
 278                                           default=False,
 279                                           help="Run all (useful) tests [Entropy, Longest Word, IC, Signature]",)
 280         parser.add_option("-z", "--zlib",
 281                                           action="store_true",
 282                                           dest="is_zlib",
 283                                           default=False,
 284                                           help="Run compression Test",)
 285         parser.add_option("-e", "--entropy",
 286                                           action="store_true",
 287                                           dest="is_entropy",
 288                                           default=False,
 289                                           help="Run entropy Test",)
 290         parser.add_option("-l", "--longestword",
 291                                           action="store_true",
 292                                           dest="is_longest",
 293                                           default=False,
 294                                           help="Run longest word test",)
 295         parser.add_option("-i", "--ic",
 296                                           action="store_true",
 297                                           dest="is_ic",
 298                                           default=False,
 299                                           help="Run IC test",)
 300         parser.add_option("-s", "--signature",
 301                                           action="store_true",
 302                                           dest="is_signature",
 303                                           default=False,
 304                                           help="Run signature test",)
 305         parser.add_option("-A", "--auto",
 306                                           action="store_true",
 307                                           dest="is_auto",
 308                                           default=False,
 309                                           help="Run auto file extension tests",)
 310         parser.add_option("-u", "--unicode",
 311                                           action="store_true",
 312                                           dest="ignore_unicode",
 313                                           default=False,
 314                                           help="Skip over unicode-y/UTF'y files",)
 315
 316         (options, args) = parser.parse_args()
 317
 318         # Error on invalid number of arguements
 319         if len(args) < 1:
 320                 parser.print_help()
 321                 print ""
 322                 sys.exit()
 323
 324         # Error on an invalid path
 325         if os.path.exists(args[0]) == False:
 326                 parser.error("Invalid path")
 327
 328         valid_regex = ""
 329         if (len(args) == 2 and options.is_auto is False):
 330                 try:
 331                         valid_regex = re.compile(args[1])
 332                 except:
 333                         parser.error("Invalid regular expression")
 334         else:
 335                 valid_regex = re.compile('.*')
 336         tests = []
 337
 338         if options.is_auto:
 339                 valid_regex = re.compile('(\.php|\.asp|\.aspx|\.scath|\.bash|\.zsh|\.csh|\.tsch|\.pl|\.py|\.txt|\.cgi|\.cfm|\.htaccess)$')
 340
 341         if options.is_all:
 342                 tests.append(LanguageIC())
 343                 tests.append(Entropy())
 344                 tests.append(LongestWord())
 345                 tests.append(SignatureNasty())
 346         else:
 347                 if options.is_entropy:
 348                         tests.append(Entropy())
 349                 if options.is_longest:
 350                         tests.append(LongestWord())
 351                 if options.is_ic:
 352                         tests.append(LanguageIC())
 353                 if options.is_signature:
 354                         tests.append(SignatureNasty())
 355                 if options.is_zlib:
 356                         tests.append(Compression())
 357
 358         # Instantiate the Generator Class used for searching, opening, and reading files
 359         locator = SearchFile()
 360
 361         # CSV file output array
 362         csv_array = []
 363         csv_header = ["filename"]
 364
 365         # Grab the file and calculate each test against file
 366         fileCount = 0
 367         fileIgnoreCount = 0
 368         for data, filename in locator.search_file_path(args, valid_regex):
 369                 if data:
 370                         # a row array for the CSV
 371                         csv_row = []
 372                         csv_row.append(filename)
 373
 374                         if options.ignore_unicode:
 375                                 asciiHighCount = 0
 376                                 for character in data:
 377                                         if ord(character) > 127:
 378                                                 asciiHighCount = asciiHighCount + 1
 379
 380                                 fileAsciiHighRatio = float(asciiHighCount) / float(len(data))
 381
 382                         if (options.ignore_unicode == False or fileAsciiHighRatio < .1):
 383                                 for test in tests:
 384                                         calculated_value = test.calculate(data, filename)
 385                                         # Make the header row if it hasn't been fully populated, +1 here to account for filename column
 386                                         if len(csv_header) < len(tests) + 1:
 387                                                 csv_header.append(test.__class__.__name__)
 388                                                 csv_row.append(calculated_value)
 389                                         fileCount = fileCount + 1
 390                                         csv_array.append(csv_row)
 391                         else:
 392                                 fileIgnoreCount = fileIgnoreCount + 1
 393
 394         if options.is_csv:
 395                 csv_array.insert(0,csv_header)
 396                 fileOutput = csv.writer(open(options.is_csv, "wb"))
 397                 fileOutput.writerows(csv_array)
 398
 399         timeFinish = time.clock()
 400
 401         # Print some stats
 402         print "\n[[ Total files scanned: %i ]]" % (fileCount)
 403         print "[[ Total files ignored: %i ]]" % (fileIgnoreCount)
 404         print "[[ Scan Time: %f seconds ]]" % (timeFinish - timeStart)
 405
 406         # Print top rank lists
 407         rank_list = {}
 408         for test in tests:
 409                 test.sort()
 410                 test.printer(10)
 411                 for file in test.results:
 412                         rank_list[file["filename"]] = rank_list.setdefault(file["filename"], 0) + file["rank"]
 413
 414         rank_sorted = sorted(rank_list.items(), key=lambda x: x[1])
 415
 416         print "\n[[ Top cumulative ranked files ]]"
 417         count = 10
 418         if (count > len(rank_sorted)): count = len(rank_sorted)
 419         for x in range(count):
 420                 print ' {0:>7}          {1}'.format(rank_sorted[x][1], rank_sorted[x][0])
 421