neopi.py

   1 #!/usr/bin/python
   2 # Name: neopi.py
   3 # Description: Utility to scan a file path for encrypted and obfuscated files
   4 # Authors: Ben Hagen (ben.hagen@neohapsis.com)
   5 #         Scott Behrens (scott.behrens@neohapsis.com)
   6 #
   7 # Date: 11/4/2010
   8 #
   9 # pep-0008 - Is stupid. TABS FO'EVER!
  10
# TODO: add try/except handling for bad paths, bad filenames and bad regular expressions.
  12
  13 # Library imports
  14 import math
  15 import sys
  16 import os
  17 import re
  18 import csv
  19 import zlib
  20 import time
  21 from collections import defaultdict
  22 from optparse import OptionParser
  23
  24 class LanguageIC:
  25    """Class that calculates a file's Index of Coincidence as
  26    as well as a a subset of files average Index of Coincidence.
  27    """
  28    def __init__(self):
  29        """Initialize results arrays as well as character counters."""
  30        self.char_count =  defaultdict(int)
  31        self.total_char_count = 0
  32        self.results = []
  33        self.ic_total_results = ""
  34
  35    def calculate_char_count(self,data):
  36        """Method to calculate character counts for a particular data file."""
  37        if not data:
  38            return 0
  39        for x in range(256):
  40            char = chr(x)
  41            charcount = data.count(char)
  42            self.char_count[char] += charcount
  43            self.total_char_count += charcount
  44        return
  45
  46    def calculate_IC(self):
  47        """Calculate the Index of Coincidence for the self variables"""
  48        total = 0
  49        for val in self.char_count.values():
  50
  51            if val == 0:
  52                continue
  53            total += val * (val-1)
  54
  55        try:
  56            ic_total =      float(total)/(self.total_char_count * (self.total_char_count - 1))
  57        except:
  58            ic_total = 0
  59        self.ic_total_results = ic_total
  60        return
  61
  62    def calculate(self,data,filename):
  63        """Calculate the Index of Coincidence for a file and append to self.ic_results array"""
  64        if not data:
  65            return 0
  66        char_count = 0
  67        total_char_count = 0
  68
  69        for x in range(256):
  70            char = chr(x)
  71            charcount = data.count(char)
  72            char_count += charcount * (charcount - 1)
  73            total_char_count += charcount
  74
  75        ic = float(char_count)/(total_char_count * (total_char_count - 1))
  76        self.results.append({"filename":filename, "value":ic})
  77        # Call method to calculate_char_count and append to total_char_count
  78        self.calculate_char_count(data)
  79        return ic
  80
  81    def sort(self):
  82        self.results.sort(key=lambda item: item["value"])
  83        self.results = resultsAddRank(self.results)
  84
  85    def printer(self, count):
  86        """Print the top signature count match files for a given search"""
  87        # Calculate the Total IC for a Search
  88        self.calculate_IC()
  89        print "\n[[ Average IC for Search ]]"
  90        print self.ic_total_results
  91        print "\n[[ Top %i lowest IC files ]]" % (count)
  92        if (count > len(self.results)): count = len(self.results)
  93        for x in range(count):
  94            print ' {0:>7.4f}        {1}'.format(self.results[x]["value"], self.results[x]["filename"])
  95        return
  96
  97 class Entropy:
  98    """Class that calculates a file's Entropy."""
  99
 100    def __init__(self):
 101        """Instantiate the entropy_results array."""
 102        self.results = []
 103
 104    def calculate(self,data,filename):
 105        """Calculate the entropy for 'data' and append result to entropy_results array."""
 106
 107        if not data:
 108            return 0
 109        entropy = 0
 110        self.stripped_data =data.replace(' ', '')
 111        for x in range(256):
 112            p_x = float(self.stripped_data.count(chr(x)))/len(self.stripped_data)
 113            if p_x > 0:
 114                entropy += - p_x * math.log(p_x, 2)
 115        self.results.append({"filename":filename, "value":entropy})
 116        return entropy
 117
 118    def sort(self):
 119        self.results.sort(key=lambda item: item["value"])
 120        self.results.reverse()
 121        self.results = resultsAddRank(self.results)
 122
 123    def printer(self, count):
 124        """Print the top signature count match files for a given search"""
 125        print "\n[[ Top %i entropic files for a given search ]]" % (count)
 126        if (count > len(self.results)): count = len(self.results)
 127        for x in range(count):
 128            print ' {0:>7.4f}        {1}'.format(self.results[x]["value"], self.results[x]["filename"])
 129        return
 130
 131 class LongestWord:
 132    """Class that determines the longest word for a particular file."""
 133    def __init__(self):
 134        """Instantiate the longestword_results array."""
 135        self.results = []
 136
 137    def calculate(self,data,filename):
 138        """Find the longest word in a string and append to longestword_results array"""
 139        if not data:
 140            return "", 0
 141        longest = 0
 142        longest_word = ""
 143        words = re.split("[\s,\n,\r]", data)
 144        if words:
 145            for word in words:
 146                length = len(word)
 147                if length > longest:
 148                    longest = length
 149                    longest_word = word
 150        self.results.append({"filename":filename, "value":longest})
 151        return longest
 152
 153    def sort(self):
 154        self.results.sort(key=lambda item: item["value"])
 155        self.results.reverse()
 156        self.results = resultsAddRank(self.results)
 157
 158    def printer(self, count):
 159        """Print the top signature count match files for a given search"""
 160        print "\n[[ Top %i longest word files ]]" % (count)
 161        if (count > len(self.results)): count = len(self.results)
 162        for x in range(count):
 163            print ' {0:>7}        {1}'.format(self.results[x]["value"], self.results[x]["filename"])
 164        return
 165
 166 class SignatureNasty:
 167    """Generator that searches a given file for nasty expressions"""
 168
 169    def __init__(self):
 170        """Instantiate the results array."""
 171        self.results = []
 172
 173    def calculate(self, data, filename):
 174        if not data:
 175            return "", 0
 176        # Lots taken from the wonderful post at http://stackoverflow.com/questions/3115559/exploitable-php-functions
 177        valid_regex = re.compile('(eval\(|file_put_contents|base64_decode|python_eval|exec\(|passthru|popen|proc_open|pcntl|assert\(|system\(|shell)', re.I)
 178        matches = re.findall(valid_regex, data)
 179        self.results.append({"filename":filename, "value":len(matches)})
 180        return len(matches)
 181
 182    def sort(self):
 183        self.results.sort(key=lambda item: item["value"])
 184        self.results.reverse()
 185        self.results = resultsAddRank(self.results)
 186
 187    def printer(self, count):
 188        """Print the top signature count match files for a given search"""
 189        print "\n[[ Top %i signature match counts ]]" % (count)
 190        if (count > len(self.results)): count = len(self.results)
 191        for x in range(count):
 192            print ' {0:>7}        {1}'.format(self.results[x]["value"], self.results[x]["filename"])
 193        return
 194
 195
 196 class UsesEval:
 197    """Generator that searches a given file for nasty eval with variable"""
 198
 199    def __init__(self):
 200       """Instantiate the eval_results array."""
 201       self.results = []
 202
 203    def calculate(self, data, filename):
 204       if not data:
 205          return "", 0
 206       # Lots taken from the wonderful post at http://stackoverflow.com/questions/3115559/exploitable-php-functions
 207       valid_regex = re.compile('(eval\(\$(\w|\d))', re.I)
 208       matches = re.findall(valid_regex, data)
 209       self.results.append({"filename":filename, "value":len(matches)})
 210       return len(matches)
 211
 212    def sort(self):
 213       self.results.sort(key=lambda item: item["value"])
 214       self.results.reverse()
 215       self.results = resultsAddRank(self.results)
 216
 217    def printer(self, count):
 218       """Print the files that use eval"""
 219       print "\n[[ Top %i eval match counts ]]" % (count)
 220       if (count > len(self.results)): count = len(self.results)
 221       for x in range(count):
 222          print ' {0:>7}          {1}'.format(self.results[x]["value"], self.results[x]["filename"])
 223       return
 224
 225
 226 class Compression:
 227    """Generator finds compression ratio"""
 228
 229    def __init__(self):
 230        """Instantiate the results array."""
 231        self.results = []
 232
 233    def calculate(self, data, filename):
 234        if not data:
 235            return "", 0
 236        compressed = zlib.compress(data)
 237        ratio = float(len(compressed)) / float(len(data))
 238        self.results.append({"filename":filename, "value":ratio})
 239        return ratio
 240
 241    def sort(self):
 242        self.results.sort(key=lambda item: item["value"])
 243        self.results.reverse()
 244        self.results = resultsAddRank(self.results)
 245
 246    def printer(self, count):
 247        """Print the top files for a given search"""
 248        print "\n[[ Top %i compression match counts ]]" % (count)
 249        if (count > len(self.results)): count = len(self.results)
 250        for x in range(count):
 251            print ' {0:>7.4f}        {1}'.format(self.results[x]["value"], self.results[x]["filename"])
 252        return
 253
 254 def resultsAddRank(results):
 255    rank = 1
 256    offset = 1
 257    previousValue = False
 258    newList = []
 259    for file in results:
 260        if (previousValue and previousValue != file["value"]):
 261            rank = offset
 262        file["rank"] = rank
 263        newList.append(file)
 264        previousValue = file["value"]
 265        offset = offset + 1
 266    return newList
 267
 268 class SearchFile:
 269    """Generator that searches a given filepath with an optional regular
 270    expression and returns the filepath and filename"""
 271    def search_file_path(self, args, valid_regex):
 272        for root, dirs, files in os.walk(args[0]):
 273            for file in files:
 274                filename = os.path.join(root, file)
 275                if (valid_regex.search(file) and os.path.getsize(filename) > 60):
 276                    try:
 277                        data = open(root + "/" + file, 'rb').read()
 278                    except:
 279                        data = False
 280                        print "Could not read file :: %s/%s" % (root, file)
 281                    yield data, filename
 282
 283 if __name__ == "__main__":
 284    """Parse all the options"""
 285
 286    timeStart = time.clock()
 287
 288    print """
 289        )         (   (
 290     ( /(         )\ ))\ )
 291     )\())  (    (()/(()/(
 292    ((_)\  ))\ (  /(_))(_))
 293     _((_)/((_))\(_))(_))
 294    | \| (_)) ((_) _ \_ _|
 295    | .` / -_) _ \  _/| |
 296    |_|\_\___\___/_| |___| Ver. *.USEGIT
 297    """
 298
 299    parser = OptionParser(usage="usage: %prog [options] <start directory> <OPTIONAL: filename regex>",
 300                          version="%prog 1.0")
 301    parser.add_option("-c", "--csv",
 302                      action="store",
 303                      dest="is_csv",
 304                      default=False,
 305                      help="generate CSV outfile",
 306                      metavar="FILECSV")
 307    parser.add_option("-a", "--all",
 308                      action="store_true",
 309                      dest="is_all",
 310                      default=False,
 311                      help="Run all (useful) tests [Entropy, Longest Word, IC, Signature]",)
 312    parser.add_option("-z", "--zlib",
 313                      action="store_true",
 314                      dest="is_zlib",
 315                      default=False,
 316                      help="Run compression Test",)
 317    parser.add_option("-e", "--entropy",
 318                      action="store_true",
 319                      dest="is_entropy",
 320                      default=False,
 321                      help="Run entropy Test",)
 322    parser.add_option("-E", "--eval",
 323                      action="store_true",
 324                      dest="is_eval",
 325                      default=False,
 326                      help="Run signiture test for the eval",)
 327    parser.add_option("-l", "--longestword",
 328                      action="store_true",
 329                      dest="is_longest",
 330                      default=False,
 331                      help="Run longest word test",)
 332    parser.add_option("-i", "--ic",
 333                      action="store_true",
 334                      dest="is_ic",
 335                      default=False,
 336                      help="Run IC test",)
 337    parser.add_option("-s", "--signature",
 338                      action="store_true",
 339                      dest="is_signature",
 340                      default=False,
 341                      help="Run signature test",)
 342    parser.add_option("-A", "--auto",
 343                      action="store_true",
 344                      dest="is_auto",
 345                      default=False,
 346                      help="Run auto file extension tests",)
 347    parser.add_option("-u", "--unicode",
 348                      action="store_true",
 349                      dest="ignore_unicode",
 350                      default=False,
 351                      help="Skip over unicode-y/UTF'y files",)
 352
 353    (options, args) = parser.parse_args()
 354
 355    # Error on invalid number of arguements
 356    if len(args) < 1:
 357        parser.print_help()
 358        print ""
 359        sys.exit()
 360
 361    # Error on an invalid path
 362    if os.path.exists(args[0]) == False:
 363        parser.error("Invalid path")
 364
 365    valid_regex = ""
 366    if (len(args) == 2 and options.is_auto is False):
 367        try:
 368            valid_regex = re.compile(args[1])
 369        except:
 370            parser.error("Invalid regular expression")
 371    else:
 372        valid_regex = re.compile('.*')
 373    tests = []
 374
 375    if options.is_auto:
 376        valid_regex = re.compile('(\.php|\.asp|\.aspx|\.scath|\.bash|\.zsh|\.csh|\.tsch|\.pl|\.py|\.txt|\.cgi|\.cfm|\.htaccess)$')
 377
 378    if options.is_all:
 379        tests.append(LanguageIC())
 380        tests.append(Entropy())
 381        tests.append(LongestWord())
 382        tests.append(SignatureNasty())
 383    else:
 384        if options.is_entropy:
 385            tests.append(Entropy())
 386        if options.is_longest:
 387            tests.append(LongestWord())
 388        if options.is_ic:
 389            tests.append(LanguageIC())
 390        if options.is_signature:
 391            tests.append(SignatureNasty())
 392        if options.is_eval:
 393            tests.append(UsesEval())
 394        if options.is_zlib:
 395            tests.append(Compression())
 396
 397    # Instantiate the Generator Class used for searching, opening, and reading files
 398    locator = SearchFile()
 399
 400    # CSV file output array
 401    csv_array = []
 402    csv_header = ["filename"]
 403
 404    # Grab the file and calculate each test against file
 405    fileCount = 0
 406    fileIgnoreCount = 0
 407    for data, filename in locator.search_file_path(args, valid_regex):
 408        if data:
 409            # a row array for the CSV
 410            csv_row = []
 411            csv_row.append(filename)
 412
 413            if options.ignore_unicode:
 414                asciiHighCount = 0
 415                for character in data:
 416                    if ord(character) > 127:
 417                        asciiHighCount = asciiHighCount + 1
 418
 419                fileAsciiHighRatio = float(asciiHighCount) / float(len(data))
 420
 421            if (options.ignore_unicode == False or fileAsciiHighRatio < .1):
 422                for test in tests:
 423                    calculated_value = test.calculate(data, filename)
 424                    # Make the header row if it hasn't been fully populated, +1 here to account for filename column
 425                    if len(csv_header) < len(tests) + 1:
 426                        csv_header.append(test.__class__.__name__)
 427                        csv_row.append(calculated_value)
 428                    fileCount = fileCount + 1
 429                    csv_array.append(csv_row)
 430            else:
 431                fileIgnoreCount = fileIgnoreCount + 1
 432
 433    if options.is_csv:
 434        csv_array.insert(0,csv_header)
 435        fileOutput = csv.writer(open(options.is_csv, "wb"))
 436        fileOutput.writerows(csv_array)
 437
 438    timeFinish = time.clock()
 439
 440    # Print some stats
 441    print "\n[[ Total files scanned: %i ]]" % (fileCount)
 442    print "[[ Total files ignored: %i ]]" % (fileIgnoreCount)
 443    print "[[ Scan Time: %f seconds ]]" % (timeFinish - timeStart)
 444
 445    # Print top rank lists
 446    rank_list = {}
 447    for test in tests:
 448        test.sort()
 449        test.printer(10)
 450        for file in test.results:
 451            rank_list[file["filename"]] = rank_list.setdefault(file["filename"], 0) + file["rank"]
 452
 453    rank_sorted = sorted(rank_list.items(), key=lambda x: x[1])
 454
 455    print "\n[[ Top cumulative ranked files ]]"
 456    count = 10
 457    if (count > len(rank_sorted)): count = len(rank_sorted)
 458    for x in range(count):
 459        print ' {0:>7}        {1}'.format(rank_sorted[x][1], rank_sorted[x][0])
 460