neopi.py

   1 #!/usr/bin/python
   2 # Name: neopi.py
   3 # Description: Utility to scan a file path for encrypted and obfuscated files
   4 # Authors: Ben Hagen (ben.hagen@neohapsis.com)
   5 #         Scott Behrens (scott.behrens@neohapsis.com)
   6 #
   7 # Date: 11/4/2010
   8 #
   9 # pep-0008 - Is stupid. TABS FO'EVER!  too bad, spaces are back!
  10
  11
  12
  13
  14 # Try catch regular expressions/bad path/bad filename/bad regex/
  15
  16 # Library imports
  17 import math
  18 import sys
  19 import os
  20 import re
  21 import csv
  22 import zlib
  23 import time
  24 from collections import defaultdict
  25 from optparse import OptionParser
  26
  27 class LanguageIC:
  28    """Class that calculates a file's Index of Coincidence as
  29    as well as a a subset of files average Index of Coincidence.
  30    """
  31    def __init__(self):
  32        """Initialize results arrays as well as character counters."""
  33        self.char_count =  defaultdict(int)
  34        self.total_char_count = 0
  35        self.results = []
  36        self.ic_total_results = ""
  37
  38    def calculate_char_count(self,data):
  39        """Method to calculate character counts for a particular data file."""
  40        if not data:
  41            return 0
  42        for x in range(256):
  43            char = chr(x)
  44            charcount = data.count(char)
  45            self.char_count[char] += charcount
  46            self.total_char_count += charcount
  47        return
  48
  49    def calculate_IC(self):
  50        """Calculate the Index of Coincidence for the self variables"""
  51        total = 0
  52        for val in self.char_count.values():
  53
  54            if val == 0:
  55                continue
  56            total += val * (val-1)
  57
  58        try:
  59            ic_total =      float(total)/(self.total_char_count * (self.total_char_count - 1))
  60        except:
  61            ic_total = 0
  62        self.ic_total_results = ic_total
  63        return
  64
  65    def calculate(self,data,filename):
  66        """Calculate the Index of Coincidence for a file and append to self.ic_results array"""
  67        if not data:
  68            return 0
  69        char_count = 0
  70        total_char_count = 0
  71
  72        for x in range(256):
  73            char = chr(x)
  74            charcount = data.count(char)
  75            char_count += charcount * (charcount - 1)
  76            total_char_count += charcount
  77
  78        ic = float(char_count)/(total_char_count * (total_char_count - 1))
  79        self.results.append({"filename":filename, "value":ic})
  80        # Call method to calculate_char_count and append to total_char_count
  81        self.calculate_char_count(data)
  82        return ic
  83
  84    def sort(self):
  85        self.results.sort(key=lambda item: item["value"])
  86        self.results = resultsAddRank(self.results)
  87
  88    def printer(self, count):
  89        """Print the top signature count match files for a given search"""
  90        # Calculate the Total IC for a Search
  91        self.calculate_IC()
  92        print "\n[[ Average IC for Search ]]"
  93        print self.ic_total_results
  94        print "\n[[ Top %i lowest IC files ]]" % (count)
  95        if (count > len(self.results)): count = len(self.results)
  96        for x in range(count):
  97            print ' {0:>7.4f}        {1}'.format(self.results[x]["value"], self.results[x]["filename"])
  98        return
  99
 100 class Entropy:
 101    """Class that calculates a file's Entropy."""
 102
 103    def __init__(self):
 104        """Instantiate the entropy_results array."""
 105        self.results = []
 106
 107    def calculate(self,data,filename):
 108        """Calculate the entropy for 'data' and append result to entropy_results array."""
 109
 110        if not data:
 111            return 0
 112        entropy = 0
 113        self.stripped_data =data.replace(' ', '')
 114        for x in range(256):
 115            p_x = float(self.stripped_data.count(chr(x)))/len(self.stripped_data)
 116            if p_x > 0:
 117                entropy += - p_x * math.log(p_x, 2)
 118        self.results.append({"filename":filename, "value":entropy})
 119        return entropy
 120
 121    def sort(self):
 122        self.results.sort(key=lambda item: item["value"])
 123        self.results.reverse()
 124        self.results = resultsAddRank(self.results)
 125
 126    def printer(self, count):
 127        """Print the top signature count match files for a given search"""
 128        print "\n[[ Top %i entropic files for a given search ]]" % (count)
 129        if (count > len(self.results)): count = len(self.results)
 130        for x in range(count):
 131            print ' {0:>7.4f}        {1}'.format(self.results[x]["value"], self.results[x]["filename"])
 132        return
 133
 134 class LongestWord:
 135    """Class that determines the longest word for a particular file."""
 136    def __init__(self):
 137        """Instantiate the longestword_results array."""
 138        self.results = []
 139
 140    def calculate(self,data,filename):
 141        """Find the longest word in a string and append to longestword_results array"""
 142        if not data:
 143            return "", 0
 144        longest = 0
 145        longest_word = ""
 146        words = re.split("[\s,\n,\r]", data)
 147        if words:
 148            for word in words:
 149                length = len(word)
 150                if length > longest:
 151                    longest = length
 152                    longest_word = word
 153        self.results.append({"filename":filename, "value":longest})
 154        return longest
 155
 156    def sort(self):
 157        self.results.sort(key=lambda item: item["value"])
 158        self.results.reverse()
 159        self.results = resultsAddRank(self.results)
 160
 161    def printer(self, count):
 162        """Print the top signature count match files for a given search"""
 163        print "\n[[ Top %i longest word files ]]" % (count)
 164        if (count > len(self.results)): count = len(self.results)
 165        for x in range(count):
 166            print ' {0:>7}        {1}'.format(self.results[x]["value"], self.results[x]["filename"])
 167        return
 168
 169 class SignatureNasty:
 170    """Generator that searches a given file for nasty expressions"""
 171
 172    def __init__(self):
 173        """Instantiate the results array."""
 174        self.results = []
 175
 176    def calculate(self, data, filename):
 177        if not data:
 178            return "", 0
 179        # Lots taken from the wonderful post at http://stackoverflow.com/questions/3115559/exploitable-php-functions
 180        valid_regex = re.compile('(eval\(|file_put_contents|base64_decode|python_eval|exec\(|passthru|popen|proc_open|pcntl|assert\(|system\(|shell)', re.I)
 181        matches = re.findall(valid_regex, data)
 182        self.results.append({"filename":filename, "value":len(matches)})
 183        return len(matches)
 184
 185    def sort(self):
 186        self.results.sort(key=lambda item: item["value"])
 187        self.results.reverse()
 188        self.results = resultsAddRank(self.results)
 189
 190    def printer(self, count):
 191        """Print the top signature count match files for a given search"""
 192        print "\n[[ Top %i signature match counts ]]" % (count)
 193        if (count > len(self.results)): count = len(self.results)
 194        for x in range(count):
 195            print ' {0:>7}        {1}'.format(self.results[x]["value"], self.results[x]["filename"])
 196        return
 197
 198
 199 class UsesEval:
 200        """Generator that searches a given file for nasty eval with variable"""
 201
 202        def __init__(self):
 203                """Instantiate the eval_results array."""
 204                self.results = []
 205
 206        def calculate(self, data, filename):
 207                if not data:
 208                        return "", 0
 209                # Lots taken from the wonderful post at http://stackoverflow.com/questions/3115559/exploitable-php-functions
 210                valid_regex = re.compile('(eval\(\$(\w|\d))', re.I)
 211                matches = re.findall(valid_regex, data)
 212                self.results.append({"filename":filename, "value":len(matches)})
 213                return len(matches)
 214
 215        def sort(self):
 216                self.results.sort(key=lambda item: item["value"])
 217                self.results.reverse()
 218                self.results = resultsAddRank(self.results)
 219
 220        def printer(self, count):
 221                """Print the files that use eval"""
 222                print "\n[[ Top %i eval match counts ]]" % (count)
 223                if (count > len(self.results)): count = len(self.results)
 224                for x in range(count):
 225                        print ' {0:>7}          {1}'.format(self.results[x]["value"], self.results[x]["filename"])
 226                return
 227
 228
 229 class Compression:
 230    """Generator finds compression ratio"""
 231
 232    def __init__(self):
 233        """Instantiate the results array."""
 234        self.results = []
 235
 236    def calculate(self, data, filename):
 237        if not data:
 238            return "", 0
 239        compressed = zlib.compress(data)
 240        ratio = float(len(compressed)) / float(len(data))
 241        self.results.append({"filename":filename, "value":ratio})
 242        return ratio
 243
 244    def sort(self):
 245        self.results.sort(key=lambda item: item["value"])
 246        self.results.reverse()
 247        self.results = resultsAddRank(self.results)
 248
 249    def printer(self, count):
 250        """Print the top files for a given search"""
 251        print "\n[[ Top %i compression match counts ]]" % (count)
 252        if (count > len(self.results)): count = len(self.results)
 253        for x in range(count):
 254            print ' {0:>7.4f}        {1}'.format(self.results[x]["value"], self.results[x]["filename"])
 255        return
 256
 257 def resultsAddRank(results):
 258    rank = 1
 259    offset = 1
 260    previousValue = False
 261    newList = []
 262    for file in results:
 263        if (previousValue and previousValue != file["value"]):
 264            rank = offset
 265        file["rank"] = rank
 266        newList.append(file)
 267        previousValue = file["value"]
 268        offset = offset + 1
 269    return newList
 270
 271 class SearchFile:
 272    """Generator that searches a given filepath with an optional regular
 273    expression and returns the filepath and filename"""
 274    def search_file_path(self, args, valid_regex):
 275        for root, dirs, files in os.walk(args[0]):
 276            for file in files:
 277                filename = os.path.join(root, file)
 278                if (valid_regex.search(file) and os.path.getsize(filename) > 60):
 279                    try:
 280                        data = open(root + "/" + file, 'rb').read()
 281                    except:
 282                        data = False
 283                        print "Could not read file :: %s/%s" % (root, file)
 284                    yield data, filename
 285
 286 if __name__ == "__main__":
 287    """Parse all the options"""
 288
 289    timeStart = time.clock()
 290
 291    print """
 292        )         (   (
 293     ( /(         )\ ))\ )
 294     )\())  (    (()/(()/(
 295    ((_)\  ))\ (  /(_))(_))
 296     _((_)/((_))\(_))(_))
 297    | \| (_)) ((_) _ \_ _|
 298    | .` / -_) _ \  _/| |
 299    |_|\_\___\___/_| |___| Ver. *.USEGIT
 300    """
 301
 302    parser = OptionParser(usage="usage: %prog [options] <start directory> <OPTIONAL: filename regex>",
 303                          version="%prog 1.0")
 304    parser.add_option("-c", "--csv",
 305                      action="store",
 306                      dest="is_csv",
 307                      default=False,
 308                      help="generate CSV outfile",
 309                      metavar="FILECSV")
 310    parser.add_option("-a", "--all",
 311                      action="store_true",
 312                      dest="is_all",
 313                      default=False,
 314                      help="Run all (useful) tests [Entropy, Longest Word, IC, Signature]",)
 315    parser.add_option("-z", "--zlib",
 316                      action="store_true",
 317                      dest="is_zlib",
 318                      default=False,
 319                      help="Run compression Test",)
 320    parser.add_option("-e", "--entropy",
 321                      action="store_true",
 322                      dest="is_entropy",
 323                      default=False,
 324                      help="Run entropy Test",)
 325    parser.add_option("-E", "--eval",
 326                      action="store_true",
 327                      dest="is_eval",
 328                      default=False,
 329                      help="Run signiture test for the eval",)
 330    parser.add_option("-l", "--longestword",
 331                      action="store_true",
 332                      dest="is_longest",
 333                      default=False,
 334                      help="Run longest word test",)
 335    parser.add_option("-i", "--ic",
 336                      action="store_true",
 337                      dest="is_ic",
 338                      default=False,
 339                      help="Run IC test",)
 340    parser.add_option("-s", "--signature",
 341                      action="store_true",
 342                      dest="is_signature",
 343                      default=False,
 344                      help="Run signature test",)
 345    parser.add_option("-A", "--auto",
 346                      action="store_true",
 347                      dest="is_auto",
 348                      default=False,
 349                      help="Run auto file extension tests",)
 350    parser.add_option("-u", "--unicode",
 351                      action="store_true",
 352                      dest="ignore_unicode",
 353                      default=False,
 354                      help="Skip over unicode-y/UTF'y files",)
 355
 356    (options, args) = parser.parse_args()
 357
 358    # Error on invalid number of arguements
 359    if len(args) < 1:
 360        parser.print_help()
 361        print ""
 362        sys.exit()
 363
 364    # Error on an invalid path
 365    if os.path.exists(args[0]) == False:
 366        parser.error("Invalid path")
 367
 368    valid_regex = ""
 369    if (len(args) == 2 and options.is_auto is False):
 370        try:
 371            valid_regex = re.compile(args[1])
 372        except:
 373            parser.error("Invalid regular expression")
 374    else:
 375        valid_regex = re.compile('.*')
 376    tests = []
 377
 378    if options.is_auto:
 379        valid_regex = re.compile('(\.php|\.asp|\.aspx|\.scath|\.bash|\.zsh|\.csh|\.tsch|\.pl|\.py|\.txt|\.cgi|\.cfm|\.htaccess)$')
 380
 381    if options.is_all:
 382        tests.append(LanguageIC())
 383        tests.append(Entropy())
 384        tests.append(LongestWord())
 385        tests.append(SignatureNasty())
 386    else:
 387        if options.is_entropy:
 388            tests.append(Entropy())
 389        if options.is_longest:
 390            tests.append(LongestWord())
 391        if options.is_ic:
 392            tests.append(LanguageIC())
 393        if options.is_signature:
 394            tests.append(SignatureNasty())
 395        if options.is_eval:
 396            tests.append(UsesEval())
 397        if options.is_zlib:
 398            tests.append(Compression())
 399
 400    # Instantiate the Generator Class used for searching, opening, and reading files
 401    locator = SearchFile()
 402
 403    # CSV file output array
 404    csv_array = []
 405    csv_header = ["filename"]
 406
 407    # Grab the file and calculate each test against file
 408    fileCount = 0
 409    fileIgnoreCount = 0
 410    for data, filename in locator.search_file_path(args, valid_regex):
 411        if data:
 412            # a row array for the CSV
 413            csv_row = []
 414            csv_row.append(filename)
 415
 416            if options.ignore_unicode:
 417                asciiHighCount = 0
 418                for character in data:
 419                    if ord(character) > 127:
 420                        asciiHighCount = asciiHighCount + 1
 421
 422                fileAsciiHighRatio = float(asciiHighCount) / float(len(data))
 423
 424            if (options.ignore_unicode == False or fileAsciiHighRatio < .1):
 425                for test in tests:
 426                    calculated_value = test.calculate(data, filename)
 427                    # Make the header row if it hasn't been fully populated, +1 here to account for filename column
 428                    if len(csv_header) < len(tests) + 1:
 429                        csv_header.append(test.__class__.__name__)
 430                        csv_row.append(calculated_value)
 431                    fileCount = fileCount + 1
 432                    csv_array.append(csv_row)
 433            else:
 434                fileIgnoreCount = fileIgnoreCount + 1
 435
 436    if options.is_csv:
 437        csv_array.insert(0,csv_header)
 438        fileOutput = csv.writer(open(options.is_csv, "wb"))
 439        fileOutput.writerows(csv_array)
 440
 441    timeFinish = time.clock()
 442
 443    # Print some stats
 444    print "\n[[ Total files scanned: %i ]]" % (fileCount)
 445    print "[[ Total files ignored: %i ]]" % (fileIgnoreCount)
 446    print "[[ Scan Time: %f seconds ]]" % (timeFinish - timeStart)
 447
 448    # Print top rank lists
 449    rank_list = {}
 450    for test in tests:
 451        test.sort()
 452        test.printer(10)
 453        for file in test.results:
 454            rank_list[file["filename"]] = rank_list.setdefault(file["filename"], 0) + file["rank"]
 455
 456    rank_sorted = sorted(rank_list.items(), key=lambda x: x[1])
 457
 458    print "\n[[ Top cumulative ranked files ]]"
 459    count = 10
 460    if (count > len(rank_sorted)): count = len(rank_sorted)
 461    for x in range(count):
 462        print ' {0:>7}        {1}'.format(rank_sorted[x][1], rank_sorted[x][0])