arch/x86/opt/x86_branch.kupo

   1 (*
   2    Branch reduction/prediction on the x86 code.
   3    Copyright (C) 2001 Justin David Smith, Caltech
   4
   5    This program is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU General Public License
   7    as published by the Free Software Foundation; either version 2
   8    of the License, or (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  18  *)
  19 open Format
  20 open Symbol
  21 open Flags
  22
  23 open Fir_state
  24
  25 open Frame_type
  26
  27 open X86_inst_type
  28 open X86_frame_type
  29 open X86_opt_util
  30 open X86_frame
  31 open X86_arch
  32
  33
  34 (*
  35    Beginning KUPO escape
  36  *)
  37 inline_kupo:
  38 begin_ocaml_code`
  39
  40
  41 (*
  42    WARNING
  43    This optimization should be performed before blocks are merged
  44  *)
  45
  46
  47 (*
  48    This code removes branches in situations where it can easily identify
  49    them as simply setting the value of a variable.  This comes up most
  50    often in relational operators which set a boolean variable and then
  51    continue on to common code.  Since conditional jumps on Intel are not
  52    cheap (they can be very expensive when mispredicted), we want to re-
  53    write these statements using SETcc whenever possible.  That is the
  54    goal of this optimization.
  55  *)
  56
  57
  58 (* nontrivial_code
  59    Extracts comments from real code, and returns a block of comments
  60    and a list of non-comment instructions.  This is useful for getting
  61    the comments out of our way for awhile while we try to figure out
  62    what the block is actually doing.  *)
  63 let rec nontrivial_code = function
  64    inst :: insts when trivial_inst inst ->
  65       let comm, insts = nontrivial_code insts in
  66          inst :: comm, insts
  67  | inst :: insts ->
  68       let comm, insts = nontrivial_code insts in
  69          comm, inst :: insts
  70  | [] ->
  71       [], []
  72
  73
  74 (* code_set
  75    Returns something if the nontrivial code in a block simply sets
  76    a register and then jumps to another block.  The syntax is to
  77    return Some(comments, register set, register value, destination).
  78    If the block does not match this pattern, then None is returned.  *)
  79 let code_set code =
  80    match nontrivial_code code with
  81       comm, [MOV (IL, dst, src); JMP (ImmediateLabel label)] ->
  82          Some(comm, dst, src, label)
  83     | _ ->
  84          None
  85
  86
  87 (* code_jcc
  88    Returns something if this block ends in a conditional jump to
  89    another block (followed by an unconditional jump - the fail case).
  90    If this is the case, then we return Some with the comments, code
  91    preceding the jump, the operator used in JCC, and the labels for
  92    the success and failure cases.  Because we assume the JCC is
  93    followed by an unconditional jump, we need to run this code before
  94    the blocks are merged over in x86_trace.ml.  *)
  95 let code_jcc code =
  96    let comm, insts = nontrivial_code code in
  97    let rec code_jcc prev = function
  98       [JCC(op, ImmediateLabel lsucc); JMP(ImmediateLabel lfail)] ->
  99          Some(comm, List.rev prev, op, lsucc, lfail)
 100     | inst :: insts ->
 101          code_jcc (inst :: prev) insts
 102     | [] ->
 103          None
 104    in
 105       code_jcc [] insts
 106
 107
 108 (* branch_block
 109    Determines if this block is a block containing a conditional, in
 110    which both branches set a common variable and then return to run
 111    common code.  If such a case is found, it will be rewritten and
 112    optimized to use SETcc instead.  *)
 113 let branch_block benv block =
 114    let code = block.block_code in
 115    match code_jcc code with
 116       None ->
 117          block
 118     | Some(comm, prev, op, lsucc, lfail) ->
 119          try
 120             let bsucc = SymbolTable.find benv lsucc in
 121             let bfail = SymbolTable.find benv lfail in
 122             match code_set bsucc.block_code, code_set bfail.block_code with
 123                Some(cs, ds, ImmediateNumber (OffNumber ss), ls), Some(cf, df, ImmediateNumber (OffNumber sf), lf)
 124                when ds = df && ls = lf ->
 125                   (* This is the SET optimization.  Note that the generated
 126                      code causes a partial register stall; scheduling should
 127                      try to resolve this.  *)
 128                   let insts =
 129                      `new:
 130                         dcl   op, ds, lbl ls, $ss, $sf;
 131                         def   diffvalues = sf - ss;
 132                         def   minvalue = ss;
 133
 134                         (* Emit the various comments that got snipped out *)
 135                         comm_string "OPT:  Branch SET optimization";
 136                         comm_string `{ sprintf "OPT:  Success %s, set value %ld" (string_of_symbol lsucc) ss `};
 137                         comm_string `{ sprintf "OPT:  Failure %s, set value %ld" (string_of_symbol lfail) sf `};
 138                         add_list `{ comm `};
 139                         comm_string "OPT:  Comments from success branch";
 140                         add_list `{ cs `};
 141                         comm_string "OPT:  Comments from failure branch";
 142                         add_list `{ cf `};
 143
 144                         (* This is the actual setCC optimization.  We use
 145                            ECX here because we must force a byte-register. *)
 146                         movl  ecx,  $0;
 147                         add_list `{ prev `};
 148                         set   op,   ecx;
 149                         decl  ecx;
 150                         andl  ecx,  diffvalues;
 151                         addl  ecx,  minvalue;
 152                         movl  ds,   ecx;        (* Move result to destination ds *)
 153                         jmp   ls;`              (* The failure label is eliminated *)
 154                   in
 155                   let code = Listbuf.to_list insts in
 156                   (* Remove the conditional blocks we originally jumped to
 157                      from our list of possible jump destinations. *)
 158                   let jumps = block.block_jumps in
 159                   let jumps = List.filter (fun v -> not (Symbol.eq v lsucc)) jumps in
 160                   let jumps = List.filter (fun v -> not (Symbol.eq v lfail)) jumps in
 161                   let jumps = ls :: jumps in
 162                      { block with block_code = code;
 163                                   block_jumps = jumps }
 164              | _ ->
 165                   (* No optimization performed *)
 166                   block
 167          with
 168             Not_found -> block
 169
 170
 171 (* remove_dead_blocks
 172    After the branch optimization, we may have blocks sitting around
 173    that are no longer accessable.  These need to be removed both
 174    for efficiency, and to avoid problems during dataflow analysis.
 175    This function takes the blocks as a list, not as a trace.  *)
 176 let remove_dead_blocks export preserve blocks =
 177    let add_jumps targets block =
 178       let jumps = block.block_jumps in
 179       let add_jump targets j = SymbolTable.add targets j () in
 180          List.fold_left add_jump targets jumps
 181    in
 182    let targets = List.fold_left add_jumps SymbolTable.empty blocks in
 183    let block_target block =
 184       SymbolTable.mem export block.block_label
 185       || SymbolSet.mem preserve block.block_label
 186       || block.block_index <> None
 187       || SymbolTable.mem targets block.block_label
 188    in
 189    let blocks = List.filter block_target blocks in
 190       blocks
 191
 192
 193 (* branch_prog
 194    Apply the branch optimization to a program.  *)
 195 let init_info =
 196    { export_name = "init";
 197      export_is_fun = true
 198    }
 199
 200 let main_info =
 201    { export_name = "main";
 202      export_is_fun = true
 203    }
 204
 205 let branch_prog prog =
 206    (* Get the program blocks, apply branch optimization to blocks *)
 207    let blocks = prog.asm_blocks in
 208    let benv = Trace.fold (fun benv block ->
 209       SymbolTable.add benv block.block_label block) SymbolTable.empty blocks in
 210    let blocks =
 211       Trace.fold (fun blocks block ->
 212          branch_block benv block :: blocks) [] blocks
 213    in
 214
 215    (* Remove any newly-dead blocks from the program *)
 216    let export =
 217       List.fold_left (fun export (_, init_sym, main_sym) ->
 218          let export = SymbolTable.add export init_sym init_info in
 219          let export = SymbolTable.add export main_sym main_info in
 220             export) prog.asm_export prog.asm_init
 221    in
 222    let preserve = prog.asm_preserve in
 223    let blocks = remove_dead_blocks export preserve blocks in
 224    let blocks = Trace.of_list (List.rev blocks) in
 225       { prog with asm_blocks = blocks }
 226
 227 let branch_prog = profile "X86_branch.branch_prog" branch_prog
 228
 229 let branch_prog prog =
 230    if optimize_asm_level "opt.x86.branch" 2 then
 231       branch_prog prog
 232    else
 233       prog
 234
 235 let () = std_flags_register_list_help "opt.x86"
 236   ["opt.x86.branch",    FlagBool true,
 237                         "Enable specialized X86 branch instruction optimization." ^
 238                         " This introduces a significant performance improvement," ^
 239                         " and is recommended."]
 240
 241
 242 `end_ocaml_code