neopi.py

   1 #!/usr/bin/python
   2 # Name: neopi.py
   3 # Description: Utility to scan a file path for encrypted and obfuscated files
   4 # Authors: Ben Hagen (ben.hagen@neohapsis.com)
   5 #                Scott Behrens (scott.behrens@neohapsis.com)
   6 #
   7 # Date: 11/4/2010
   8 #
   9 # pep-0008 - Is stupid. TABS FO'EVER!
  10
  11 # Try catch regular expressions/bad path/bad filename/bad regex/
  12
  13 # Library imports
  14 import math
  15 import sys
  16 import os
  17 import re
  18 import csv
  19 import zlib
  20 import time
  21 from collections import defaultdict
  22 from optparse import OptionParser
  23
  24 class LanguageIC:
  25        """Class that calculates a file's Index of Coincidence as
  26        as well as a a subset of files average Index of Coincidence.
  27        """
  28        def __init__(self):
  29                """Initialize results arrays as well as character counters."""
  30                self.char_count =  defaultdict(int)
  31                self.total_char_count = 0
  32                self.results = []
  33                self.ic_total_results = ""
  34
  35        def calculate_char_count(self,data):
  36                """Method to calculate character counts for a particular data file."""
  37                if not data:
  38                        return 0
  39                for x in range(256):
  40                        char = chr(x)
  41                        charcount = data.count(char)
  42                        self.char_count[char] += charcount
  43                        self.total_char_count += charcount
  44                return
  45
  46        def calculate_IC(self):
  47                """Calculate the Index of Coincidence for the self variables"""
  48                total = 0
  49                for val in self.char_count.values():
  50
  51                        if val == 0:
  52                                continue
  53                        total += val * (val-1)
  54
  55                try:
  56                        ic_total =        float(total)/(self.total_char_count * (self.total_char_count - 1))
  57                except:
  58                        ic_total = 0
  59                self.ic_total_results = ic_total
  60                return
  61
  62        def calculate(self,data,filename):
  63                """Calculate the Index of Coincidence for a file and append to self.ic_results array"""
  64                if not data:
  65                        return 0
  66                char_count = 0
  67                total_char_count = 0
  68
  69                for x in range(256):
  70                        char = chr(x)
  71                        charcount = data.count(char)
  72                        char_count += charcount * (charcount - 1)
  73                        total_char_count += charcount
  74
  75                ic = float(char_count)/(total_char_count * (total_char_count - 1))
  76                self.results.append({"filename":filename, "value":ic})
  77                # Call method to calculate_char_count and append to total_char_count
  78                self.calculate_char_count(data)
  79                return ic
  80
  81        def sort(self):
  82                self.results.sort(key=lambda item: item["value"])
  83                self.results = resultsAddRank(self.results)
  84
  85        def printer(self, count):
  86                """Print the top signature count match files for a given search"""
  87                # Calculate the Total IC for a Search
  88                self.calculate_IC()
  89                print "\n[[ Average IC for Search ]]"
  90                print self.ic_total_results
  91                print "\n[[ Top %i lowest IC files ]]" % (count)
  92                if (count > len(self.results)): count = len(self.results)
  93                for x in range(count):
  94                        print ' {0:>7.4f}               {1}'.format(self.results[x]["value"], self.results[x]["filename"])
  95                return
  96
  97 class Entropy:
  98        """Class that calculates a file's Entropy."""
  99
 100        def __init__(self):
 101                """Instantiate the entropy_results array."""
 102                self.results = []
 103
 104        def calculate(self,data,filename):
 105                """Calculate the entropy for 'data' and append result to entropy_results array."""
 106
 107                if not data:
 108                        return 0
 109                entropy = 0
 110                self.stripped_data =data.replace(' ', '')
 111                for x in range(256):
 112                        p_x = float(self.stripped_data.count(chr(x)))/len(self.stripped_data)
 113                        if p_x > 0:
 114                                entropy += - p_x * math.log(p_x, 2)
 115                self.results.append({"filename":filename, "value":entropy})
 116                return entropy
 117
 118        def sort(self):
 119                self.results.sort(key=lambda item: item["value"])
 120                self.results.reverse()
 121                self.results = resultsAddRank(self.results)
 122
 123        def printer(self, count):
 124                """Print the top signature count match files for a given search"""
 125                print "\n[[ Top %i entropic files for a given search ]]" % (count)
 126                if (count > len(self.results)): count = len(self.results)
 127                for x in range(count):
 128                        print ' {0:>7.4f}               {1}'.format(self.results[x]["value"], self.results[x]["filename"])
 129                return
 130
 131 class LongestWord:
 132        """Class that determines the longest word for a particular file."""
 133        def __init__(self):
 134                """Instantiate the longestword_results array."""
 135                self.results = []
 136
 137        def calculate(self,data,filename):
 138                """Find the longest word in a string and append to longestword_results array"""
 139                if not data:
 140                        return "", 0
 141                longest = 0
 142                longest_word = ""
 143                words = re.split("[\s,\n,\r]", data)
 144                if words:
 145                        for word in words:
 146                                length = len(word)
 147                                if length > longest:
 148                                        longest = length
 149                                        longest_word = word
 150                self.results.append({"filename":filename, "value":longest})
 151                return longest
 152
 153        def sort(self):
 154                self.results.sort(key=lambda item: item["value"])
 155                self.results.reverse()
 156                self.results = resultsAddRank(self.results)
 157
 158        def printer(self, count):
 159                """Print the top signature count match files for a given search"""
 160                print "\n[[ Top %i longest word files ]]" % (count)
 161                if (count > len(self.results)): count = len(self.results)
 162                for x in range(count):
 163                        print ' {0:>7}          {1}'.format(self.results[x]["value"], self.results[x]["filename"])
 164                return
 165
 166 class SignatureNasty:
 167        """Generator that searches a given file for nasty expressions"""
 168
 169        def __init__(self):
 170                """Instantiate the results array."""
 171                self.results = []
 172
 173        def calculate(self, data, filename):
 174                if not data:
 175                        return "", 0
 176                # Lots taken from the wonderful post at http://stackoverflow.com/questions/3115559/exploitable-php-functions
 177                valid_regex = re.compile('(eval\(|file_put_contents|base64_decode|python_eval|exec\(|passthru|popen|proc_open|pcntl|assert\(|system\(|shell)', re.I)
 178                matches = re.findall(valid_regex, data)
 179                self.results.append({"filename":filename, "value":len(matches)})
 180                return len(matches)
 181
 182        def sort(self):
 183                self.results.sort(key=lambda item: item["value"])
 184                self.results.reverse()
 185                self.results = resultsAddRank(self.results)
 186
 187        def printer(self, count):
 188                """Print the top signature count match files for a given search"""
 189                print "\n[[ Top %i signature match counts ]]" % (count)
 190                if (count > len(self.results)): count = len(self.results)
 191                for x in range(count):
 192                        print ' {0:>7}          {1}'.format(self.results[x]["value"], self.results[x]["filename"])
 193                return
 194
 195
 196 class UsesEval:
 197        """Generator that searches a given file for nasty eval with variable"""
 198
 199        def __init__(self):
 200                """Instantiate the eval_results array."""
 201                self.results = []
 202
 203        def calculate(self, data, filename):
 204                if not data:
 205                        return "", 0
 206                # Lots taken from the wonderful post at http://stackoverflow.com/questions/3115559/exploitable-php-functions
 207                valid_regex = re.compile('(eval\(\$(\w|\d))', re.I)
 208                matches = re.findall(valid_regex, data)
 209                self.results.append({"filename":filename, "value":len(matches)})
 210                return len(matches)
 211
 212        def sort(self):
 213                self.results.sort(key=lambda item: item["value"])
 214                self.results.reverse()
 215                self.results = resultsAddRank(self.results)
 216
 217        def printer(self, count):
 218                """Print the files that use eval"""
 219                print "\n[[ Top %i eval match counts ]]" % (count)
 220                if (count > len(self.results)): count = len(self.results)
 221                for x in range(count):
 222                        print ' {0:>7}          {1}'.format(self.results[x]["value"], self.results[x]["filename"])
 223                return
 224
 225
 226 class Compression:
 227        """Generator finds compression ratio"""
 228
 229        def __init__(self):
 230                """Instantiate the results array."""
 231                self.results = []
 232
 233        def calculate(self, data, filename):
 234                if not data:
 235                        return "", 0
 236                compressed = zlib.compress(data)
 237                ratio = float(len(compressed)) / float(len(data))
 238                self.results.append({"filename":filename, "value":ratio})
 239                return ratio
 240
 241        def sort(self):
 242                self.results.sort(key=lambda item: item["value"])
 243                self.results.reverse()
 244                self.results = resultsAddRank(self.results)
 245
 246        def printer(self, count):
 247                """Print the top files for a given search"""
 248                print "\n[[ Top %i compression match counts ]]" % (count)
 249                if (count > len(self.results)): count = len(self.results)
 250                for x in range(count):
 251                        print ' {0:>7.4f}               {1}'.format(self.results[x]["value"], self.results[x]["filename"])
 252                return
 253
 254 def resultsAddRank(results):
 255        rank = 1
 256        offset = 1
 257        previousValue = False
 258        newList = []
 259        for file in results:
 260                if (previousValue and previousValue != file["value"]):
 261                        rank = offset
 262                file["rank"] = rank
 263                newList.append(file)
 264                previousValue = file["value"]
 265                offset = offset + 1
 266        return newList
 267
 268 class SearchFile:
 269        """Generator that searches a given filepath with an optional regular
 270        expression and returns the filepath and filename"""
 271        def search_file_path(self, args, valid_regex):
 272                for root, dirs, files in os.walk(args[0]):
 273                        for file in files:
 274                                filename = os.path.join(root, file)
 275                                if (valid_regex.search(file) and os.path.getsize(filename) > 60):
 276                                        try:
 277                                                data = open(root + "/" + file, 'rb').read()
 278                                        except:
 279                                                data = False
 280                                                print "Could not read file :: %s/%s" % (root, file)
 281                                        yield data, filename
 282
 283 if __name__ == "__main__":
 284        """Parse all the options"""
 285
 286        timeStart = time.clock()
 287
 288        print """
 289            )         (   (
 290         ( /(         )\ ))\ )
 291         )\())  (    (()/(()/(
 292        ((_)\  ))\ (  /(_))(_))
 293         _((_)/((_))\(_))(_))
 294        | \| (_)) ((_) _ \_ _|
 295        | .` / -_) _ \  _/| |
 296        |_|\_\___\___/_| |___| Ver. *.USEGIT
 297        """
 298
 299        parser = OptionParser(usage="usage: %prog [options] <start directory> <OPTIONAL: filename regex>",
 300                                                  version="%prog 1.0")
 301        parser.add_option("-c", "--csv",
 302                                          action="store",
 303                                          dest="is_csv",
 304                                          default=False,
 305                                          help="generate CSV outfile",
 306                                          metavar="FILECSV")
 307        parser.add_option("-a", "--all",
 308                                          action="store_true",
 309                                          dest="is_all",
 310                                          default=False,
 311                                          help="Run all (useful) tests [Entropy, Longest Word, IC, Signature]",)
 312        parser.add_option("-z", "--zlib",
 313                                          action="store_true",
 314                                          dest="is_zlib",
 315                                          default=False,
 316                                          help="Run compression test",)
 317        parser.add_option("-e", "--entropy",
 318                                          action="store_true",
 319                                          dest="is_entropy",
 320                                          default=False,
 321                                          help="Run entropy test",)
 322        parser.add_option("-E", "--eval",
 323                                          action="store_true",
 324                                          dest="is_eval",
 325                                          default=False,
 326                                          help="Run signiture test for eval function and variable",)
 327        parser.add_option("-l", "--longestword",
 328                                          action="store_true",
 329                                          dest="is_longest",
 330                                          default=False,
 331                                          help="Run longest word test",)
 332        parser.add_option("-i", "--ic",
 333                                          action="store_true",
 334                                          dest="is_ic",
 335                                          default=False,
 336                                          help="Run IC test",)
 337        parser.add_option("-s", "--signature",
 338                                          action="store_true",
 339                                          dest="is_signature",
 340                                          default=False,
 341                                          help="Run signature test",)
 342        parser.add_option("-A", "--auto",
 343                                          action="store_true",
 344                                          dest="is_auto",
 345                                          default=False,
 346                                          help="Run auto file extension tests",)
 347        parser.add_option("-u", "--unicode",
 348                                          action="store_true",
 349                                          dest="ignore_unicode",
 350                                          default=False,
 351                                          help="Skip over unicode-y/UTF'y files",)
 352
 353        (options, args) = parser.parse_args()
 354
 355        # Error on invalid number of arguements
 356        if len(args) < 1:
 357                parser.print_help()
 358                print ""
 359                sys.exit()
 360
 361        # Error on an invalid path
 362        if os.path.exists(args[0]) == False:
 363                parser.error("Invalid path")
 364
 365        valid_regex = ""
 366        if (len(args) == 2 and options.is_auto is False):
 367                try:
 368                        valid_regex = re.compile(args[1])
 369                except:
 370                        parser.error("Invalid regular expression")
 371        else:
 372                valid_regex = re.compile('.*')
 373        tests = []
 374
 375        if options.is_auto:
 376                valid_regex = re.compile('(\.php|\.asp|\.aspx|\.scath|\.bash|\.zsh|\.csh|\.tsch|\.pl|\.py|\.txt|\.cgi|\.cfm|\.htaccess)$')
 377
 378        if options.is_all:
 379                tests.append(LanguageIC())
 380                tests.append(Entropy())
 381                tests.append(LongestWord())
 382                tests.append(SignatureNasty())
 383        else:
 384                if options.is_entropy:
 385                        tests.append(Entropy())
 386                if options.is_longest:
 387                        tests.append(LongestWord())
 388                if options.is_ic:
 389                        tests.append(LanguageIC())
 390                if options.is_signature:
 391                        tests.append(SignatureNasty())
 392                if options.is_eval:
 393                        tests.append(UsesEval())
 394                if options.is_zlib:
 395                        tests.append(Compression())
 396
 397        # Instantiate the Generator Class used for searching, opening, and reading files
 398        locator = SearchFile()
 399
 400        # CSV file output array
 401        csv_array = []
 402        csv_header = ["filename"]
 403
 404        # Grab the file and calculate each test against file
 405        fileCount = 0
 406        fileIgnoreCount = 0
 407        for data, filename in locator.search_file_path(args, valid_regex):
 408                if data:
 409                        # a row array for the CSV
 410                        csv_row = []
 411                        csv_row.append(filename)
 412
 413                        if options.ignore_unicode:
 414                                asciiHighCount = 0
 415                                for character in data:
 416                                        if ord(character) > 127:
 417                                                asciiHighCount = asciiHighCount + 1
 418
 419                                fileAsciiHighRatio = float(asciiHighCount) / float(len(data))
 420
 421                        if (options.ignore_unicode == False or fileAsciiHighRatio < .1):
 422                                for test in tests:
 423                                        calculated_value = test.calculate(data, filename)
 424                                        # Make the header row if it hasn't been fully populated, +1 here to account for filename column
 425                                        if len(csv_header) < len(tests) + 1:
 426                                                csv_header.append(test.__class__.__name__)
 427                                                csv_row.append(calculated_value)
 428                                        fileCount = fileCount + 1
 429                                        csv_array.append(csv_row)
 430                        else:
 431                                fileIgnoreCount = fileIgnoreCount + 1
 432
 433        if options.is_csv:
 434                csv_array.insert(0,csv_header)
 435                fileOutput = csv.writer(open(options.is_csv, "wb"))
 436                fileOutput.writerows(csv_array)
 437
 438        timeFinish = time.clock()
 439
 440        # Print some stats
 441        print "\n[[ Total files scanned: %i ]]" % (fileCount)
 442        print "[[ Total files ignored: %i ]]" % (fileIgnoreCount)
 443        print "[[ Scan Time: %f seconds ]]" % (timeFinish - timeStart)
 444
 445        # Print top rank lists
 446        rank_list = {}
 447        for test in tests:
 448                test.sort()
 449                test.printer(10)
 450                for file in test.results:
 451                        rank_list[file["filename"]] = rank_list.setdefault(file["filename"], 0) + file["rank"]
 452
 453        rank_sorted = sorted(rank_list.items(), key=lambda x: x[1])
 454
 455        print "\n[[ Top cumulative ranked files ]]"
 456        count = 10
 457        if (count > len(rank_sorted)): count = len(rank_sorted)
 458        for x in range(count):
 459                print ' {0:>7}          {1}'.format(rank_sorted[x][1], rank_sorted[x][0])