test/CodeGen/X86/misched-balance.ll

   1 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 -pre-RA-sched=source -enable-misched -verify-machineinstrs | FileCheck %s
   2 ;
   3 ; Verify that misched resource/latency balance heuristics are sane.
   4
   5 define void @unrolled_mmult1(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94,
   6   i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99,
   7  i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104)
   8   nounwind uwtable ssp {
     ; Fully unrolled ten-term multiply-accumulate. Each trip of %for.body
     ; computes
     ;   tmp55[i] = tmp56[0]*pre[i] + tmp56[1]*pre94[i] + ... + tmp56[9]*pre102[i]
     ; for the loop counter i = 0 .. 9.
     ; %pre103 and %pre104 are part of the signature but are never referenced
     ; in the body.
   9 entry:
  10   br label %for.body
  11
  12 ; imull folded loads should be in order and interleaved with addl, never
  13 ; adjacent. Also check that we have no spilling.
  14 ;
  15 ; Since mmult1 IR is already in good order, this effectively ensures
  16 ; the scheduler maintains source order.
  17 ;
     ; Shape of the directives below: nine imull matches with displacements
     ; 4, 8, ..., 36, each followed by an addl, with neither another imull nor
     ; any mention of rsp allowed in between. The displacements presumably are
     ; the byte offsets of tmp56[1] .. tmp56[9] (4-byte i32 elements) -- TODO
     ; confirm against the llc output.
  18 ; CHECK-LABEL: %for.body
  19 ; CHECK-NOT: %rsp
  20 ; CHECK: imull 4
  21 ; CHECK-NOT: {{imull|rsp}}
  22 ; CHECK: addl
  23 ; CHECK: imull 8
  24 ; CHECK-NOT: {{imull|rsp}}
  25 ; CHECK: addl
  26 ; CHECK: imull 12
  27 ; CHECK-NOT: {{imull|rsp}}
  28 ; CHECK: addl
  29 ; CHECK: imull 16
  30 ; CHECK-NOT: {{imull|rsp}}
  31 ; CHECK: addl
  32 ; CHECK: imull 20
  33 ; CHECK-NOT: {{imull|rsp}}
  34 ; CHECK: addl
  35 ; CHECK: imull 24
  36 ; CHECK-NOT: {{imull|rsp}}
  37 ; CHECK: addl
  38 ; CHECK: imull 28
  39 ; CHECK-NOT: {{imull|rsp}}
  40 ; CHECK: addl
  41 ; CHECK: imull 32
  42 ; CHECK-NOT: {{imull|rsp}}
  43 ; CHECK: addl
  44 ; CHECK: imull 36
  45 ; CHECK-NOT: {{imull|rsp}}
  46 ; CHECK: addl
  47 ; CHECK-NOT: {{imull|rsp}}
  48 ; CHECK-LABEL: %end
  49 for.body:
     ; Loop counter i: 0 when arriving from %entry, otherwise the value
     ; incremented at the bottom of the previous trip.
  50   %indvars.iv42.i = phi i64 [ %indvars.iv.next43.i, %for.body ], [ 0, %entry ]
     ; Term 0: tmp56[0] * pre[i]. It seeds the running sum, so it has no add.
  51   %tmp57 = load i32, i32* %tmp56, align 4
  52   %arrayidx12.us.i61 = getelementptr inbounds i32, i32* %pre, i64 %indvars.iv42.i
  53   %tmp58 = load i32, i32* %arrayidx12.us.i61, align 4
  54   %mul.us.i = mul nsw i32 %tmp58, %tmp57
     ; Terms 1..9 all follow the same five-instruction pattern, kept together
     ; per term: load tmp56[k], load the k-th %pre* array at i, multiply, then
     ; add the product into the running sum.
  55   %arrayidx8.us.i.1 = getelementptr inbounds i32, i32* %tmp56, i64 1
  56   %tmp59 = load i32, i32* %arrayidx8.us.i.1, align 4
  57   %arrayidx12.us.i61.1 = getelementptr inbounds i32, i32* %pre94, i64 %indvars.iv42.i
  58   %tmp60 = load i32, i32* %arrayidx12.us.i61.1, align 4
  59   %mul.us.i.1 = mul nsw i32 %tmp60, %tmp59
  60   %add.us.i.1 = add nsw i32 %mul.us.i.1, %mul.us.i
  61   %arrayidx8.us.i.2 = getelementptr inbounds i32, i32* %tmp56, i64 2
  62   %tmp61 = load i32, i32* %arrayidx8.us.i.2, align 4
  63   %arrayidx12.us.i61.2 = getelementptr inbounds i32, i32* %pre95, i64 %indvars.iv42.i
  64   %tmp62 = load i32, i32* %arrayidx12.us.i61.2, align 4
  65   %mul.us.i.2 = mul nsw i32 %tmp62, %tmp61
  66   %add.us.i.2 = add nsw i32 %mul.us.i.2, %add.us.i.1
  67   %arrayidx8.us.i.3 = getelementptr inbounds i32, i32* %tmp56, i64 3
  68   %tmp63 = load i32, i32* %arrayidx8.us.i.3, align 4
  69   %arrayidx12.us.i61.3 = getelementptr inbounds i32, i32* %pre96, i64 %indvars.iv42.i
  70   %tmp64 = load i32, i32* %arrayidx12.us.i61.3, align 4
  71   %mul.us.i.3 = mul nsw i32 %tmp64, %tmp63
  72   %add.us.i.3 = add nsw i32 %mul.us.i.3, %add.us.i.2
  73   %arrayidx8.us.i.4 = getelementptr inbounds i32, i32* %tmp56, i64 4
  74   %tmp65 = load i32, i32* %arrayidx8.us.i.4, align 4
  75   %arrayidx12.us.i61.4 = getelementptr inbounds i32, i32* %pre97, i64 %indvars.iv42.i
  76   %tmp66 = load i32, i32* %arrayidx12.us.i61.4, align 4
  77   %mul.us.i.4 = mul nsw i32 %tmp66, %tmp65
  78   %add.us.i.4 = add nsw i32 %mul.us.i.4, %add.us.i.3
  79   %arrayidx8.us.i.5 = getelementptr inbounds i32, i32* %tmp56, i64 5
  80   %tmp67 = load i32, i32* %arrayidx8.us.i.5, align 4
  81   %arrayidx12.us.i61.5 = getelementptr inbounds i32, i32* %pre98, i64 %indvars.iv42.i
  82   %tmp68 = load i32, i32* %arrayidx12.us.i61.5, align 4
  83   %mul.us.i.5 = mul nsw i32 %tmp68, %tmp67
  84   %add.us.i.5 = add nsw i32 %mul.us.i.5, %add.us.i.4
  85   %arrayidx8.us.i.6 = getelementptr inbounds i32, i32* %tmp56, i64 6
  86   %tmp69 = load i32, i32* %arrayidx8.us.i.6, align 4
  87   %arrayidx12.us.i61.6 = getelementptr inbounds i32, i32* %pre99, i64 %indvars.iv42.i
  88   %tmp70 = load i32, i32* %arrayidx12.us.i61.6, align 4
  89   %mul.us.i.6 = mul nsw i32 %tmp70, %tmp69
  90   %add.us.i.6 = add nsw i32 %mul.us.i.6, %add.us.i.5
  91   %arrayidx8.us.i.7 = getelementptr inbounds i32, i32* %tmp56, i64 7
  92   %tmp71 = load i32, i32* %arrayidx8.us.i.7, align 4
  93   %arrayidx12.us.i61.7 = getelementptr inbounds i32, i32* %pre100, i64 %indvars.iv42.i
  94   %tmp72 = load i32, i32* %arrayidx12.us.i61.7, align 4
  95   %mul.us.i.7 = mul nsw i32 %tmp72, %tmp71
  96   %add.us.i.7 = add nsw i32 %mul.us.i.7, %add.us.i.6
  97   %arrayidx8.us.i.8 = getelementptr inbounds i32, i32* %tmp56, i64 8
  98   %tmp73 = load i32, i32* %arrayidx8.us.i.8, align 4
  99   %arrayidx12.us.i61.8 = getelementptr inbounds i32, i32* %pre101, i64 %indvars.iv42.i
 100   %tmp74 = load i32, i32* %arrayidx12.us.i61.8, align 4
 101   %mul.us.i.8 = mul nsw i32 %tmp74, %tmp73
 102   %add.us.i.8 = add nsw i32 %mul.us.i.8, %add.us.i.7
 103   %arrayidx8.us.i.9 = getelementptr inbounds i32, i32* %tmp56, i64 9
 104   %tmp75 = load i32, i32* %arrayidx8.us.i.9, align 4
 105   %arrayidx12.us.i61.9 = getelementptr inbounds i32, i32* %pre102, i64 %indvars.iv42.i
 106   %tmp76 = load i32, i32* %arrayidx12.us.i61.9, align 4
 107   %mul.us.i.9 = mul nsw i32 %tmp76, %tmp75
 108   %add.us.i.9 = add nsw i32 %mul.us.i.9, %add.us.i.8
     ; Write the finished sum to tmp55[i].
 109   %arrayidx16.us.i = getelementptr inbounds i32, i32* %tmp55, i64 %indvars.iv42.i
 110   store i32 %add.us.i.9, i32* %arrayidx16.us.i, align 4
     ; Advance i; the loop exits once the low 32 bits of the new value equal 10.
 111   %indvars.iv.next43.i = add i64 %indvars.iv42.i, 1
 112   %lftr.wideiv = trunc i64 %indvars.iv.next43.i to i32
 113   %exitcond = icmp eq i32 %lftr.wideiv, 10
 114   br i1 %exitcond, label %end, label %for.body
 115
 116 end:
 117   ret void
 118 }
 119
 120 ; Unlike the above loop, this IR starts out bad and must be
 121 ; rescheduled.
 122 ;
 123 ; CHECK-LABEL: %for.body
 124 ; CHECK-NOT: %rsp
 125 ; CHECK: imull 4
 126 ; CHECK-NOT: {{imull|rsp}}
 127 ; CHECK: addl
 128 ; CHECK: imull 8
 129 ; CHECK-NOT: {{imull|rsp}}
 130 ; CHECK: addl
 131 ; CHECK: imull 12
 132 ; CHECK-NOT: {{imull|rsp}}
 133 ; CHECK: addl
 134 ; CHECK: imull 16
 135 ; CHECK-NOT: {{imull|rsp}}
 136 ; CHECK: addl
 137 ; CHECK: imull 20
 138 ; CHECK-NOT: {{imull|rsp}}
 139 ; CHECK: addl
 140 ; CHECK: imull 24
 141 ; CHECK-NOT: {{imull|rsp}}
 142 ; CHECK: addl
 143 ; CHECK: imull 28
 144 ; CHECK-NOT: {{imull|rsp}}
 145 ; CHECK: addl
 146 ; CHECK: imull 32
 147 ; CHECK-NOT: {{imull|rsp}}
 148 ; CHECK: addl
 149 ; CHECK: imull 36
 150 ; CHECK-NOT: {{imull|rsp}}
 151 ; CHECK: addl
 152 ; CHECK-NOT: {{imull|rsp}}
 153 ; CHECK-LABEL: %end
 154 define void @unrolled_mmult2(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94,
 155   i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99,
 156   i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104)
 157   nounwind uwtable ssp {
     ; Same ten-term multiply-accumulate into tmp55[i] as @unrolled_mmult1, over
     ; the same operands, but the instructions are listed in a different order:
     ; the loads are grouped up front and the mul/add chain is grouped at the
     ; end. The instruction order below is the test input and must not be
     ; tidied up.
     ; %pre103 and %pre104 are never referenced in the body.
 158 entry:
 159   br label %for.body
 160 for.body:
     ; Loop counter i: 0 when arriving from %entry, otherwise the value
     ; incremented at the bottom of the previous trip.
 161   %indvars.iv42.i = phi i64 [ %indvars.iv.next43.i, %for.body ], [ 0, %entry ]
     ; Operand loads for terms 0..6 (tmp56[k] and the k-th %pre* array at i),
     ; with no arithmetic in between.
 162   %tmp57 = load i32, i32* %tmp56, align 4
 163   %arrayidx12.us.i61 = getelementptr inbounds i32, i32* %pre, i64 %indvars.iv42.i
 164   %tmp58 = load i32, i32* %arrayidx12.us.i61, align 4
 165   %arrayidx8.us.i.1 = getelementptr inbounds i32, i32* %tmp56, i64 1
 166   %tmp59 = load i32, i32* %arrayidx8.us.i.1, align 4
 167   %arrayidx12.us.i61.1 = getelementptr inbounds i32, i32* %pre94, i64 %indvars.iv42.i
 168   %tmp60 = load i32, i32* %arrayidx12.us.i61.1, align 4
 169   %arrayidx8.us.i.2 = getelementptr inbounds i32, i32* %tmp56, i64 2
 170   %tmp61 = load i32, i32* %arrayidx8.us.i.2, align 4
 171   %arrayidx12.us.i61.2 = getelementptr inbounds i32, i32* %pre95, i64 %indvars.iv42.i
 172   %tmp62 = load i32, i32* %arrayidx12.us.i61.2, align 4
 173   %arrayidx8.us.i.3 = getelementptr inbounds i32, i32* %tmp56, i64 3
 174   %tmp63 = load i32, i32* %arrayidx8.us.i.3, align 4
 175   %arrayidx12.us.i61.3 = getelementptr inbounds i32, i32* %pre96, i64 %indvars.iv42.i
 176   %tmp64 = load i32, i32* %arrayidx12.us.i61.3, align 4
 177   %arrayidx8.us.i.4 = getelementptr inbounds i32, i32* %tmp56, i64 4
 178   %tmp65 = load i32, i32* %arrayidx8.us.i.4, align 4
 179   %arrayidx12.us.i61.4 = getelementptr inbounds i32, i32* %pre97, i64 %indvars.iv42.i
 180   %tmp66 = load i32, i32* %arrayidx12.us.i61.4, align 4
 181   %arrayidx8.us.i.5 = getelementptr inbounds i32, i32* %tmp56, i64 5
 182   %tmp67 = load i32, i32* %arrayidx8.us.i.5, align 4
 183   %arrayidx12.us.i61.5 = getelementptr inbounds i32, i32* %pre98, i64 %indvars.iv42.i
 184   %tmp68 = load i32, i32* %arrayidx12.us.i61.5, align 4
 185   %arrayidx8.us.i.6 = getelementptr inbounds i32, i32* %tmp56, i64 6
 186   %tmp69 = load i32, i32* %arrayidx8.us.i.6, align 4
 187   %arrayidx12.us.i61.6 = getelementptr inbounds i32, i32* %pre99, i64 %indvars.iv42.i
 188   %tmp70 = load i32, i32* %arrayidx12.us.i61.6, align 4
     ; The term-0 product is the only multiply placed among the loads.
 189   %mul.us.i = mul nsw i32 %tmp58, %tmp57
     ; Operand loads for the remaining terms 7..9.
 190   %arrayidx8.us.i.7 = getelementptr inbounds i32, i32* %tmp56, i64 7
 191   %tmp71 = load i32, i32* %arrayidx8.us.i.7, align 4
 192   %arrayidx12.us.i61.7 = getelementptr inbounds i32, i32* %pre100, i64 %indvars.iv42.i
 193   %tmp72 = load i32, i32* %arrayidx12.us.i61.7, align 4
 194   %arrayidx8.us.i.8 = getelementptr inbounds i32, i32* %tmp56, i64 8
 195   %tmp73 = load i32, i32* %arrayidx8.us.i.8, align 4
 196   %arrayidx12.us.i61.8 = getelementptr inbounds i32, i32* %pre101, i64 %indvars.iv42.i
 197   %tmp74 = load i32, i32* %arrayidx12.us.i61.8, align 4
 198   %arrayidx8.us.i.9 = getelementptr inbounds i32, i32* %tmp56, i64 9
 199   %tmp75 = load i32, i32* %arrayidx8.us.i.9, align 4
 200   %arrayidx12.us.i61.9 = getelementptr inbounds i32, i32* %pre102, i64 %indvars.iv42.i
 201   %tmp76 = load i32, i32* %arrayidx12.us.i61.9, align 4
     ; Multiply/add chain for terms 1..9, consuming the values loaded above.
     ; Each add depends on the previous add, so the chain is serial.
 202   %mul.us.i.1 = mul nsw i32 %tmp60, %tmp59
 203   %add.us.i.1 = add nsw i32 %mul.us.i.1, %mul.us.i
 204   %mul.us.i.2 = mul nsw i32 %tmp62, %tmp61
 205   %add.us.i.2 = add nsw i32 %mul.us.i.2, %add.us.i.1
 206   %mul.us.i.3 = mul nsw i32 %tmp64, %tmp63
 207   %add.us.i.3 = add nsw i32 %mul.us.i.3, %add.us.i.2
 208   %mul.us.i.4 = mul nsw i32 %tmp66, %tmp65
 209   %add.us.i.4 = add nsw i32 %mul.us.i.4, %add.us.i.3
 210   %mul.us.i.5 = mul nsw i32 %tmp68, %tmp67
 211   %add.us.i.5 = add nsw i32 %mul.us.i.5, %add.us.i.4
 212   %mul.us.i.6 = mul nsw i32 %tmp70, %tmp69
 213   %add.us.i.6 = add nsw i32 %mul.us.i.6, %add.us.i.5
 214   %mul.us.i.7 = mul nsw i32 %tmp72, %tmp71
 215   %add.us.i.7 = add nsw i32 %mul.us.i.7, %add.us.i.6
 216   %mul.us.i.8 = mul nsw i32 %tmp74, %tmp73
 217   %add.us.i.8 = add nsw i32 %mul.us.i.8, %add.us.i.7
 218   %mul.us.i.9 = mul nsw i32 %tmp76, %tmp75
 219   %add.us.i.9 = add nsw i32 %mul.us.i.9, %add.us.i.8
     ; Write the finished sum to tmp55[i].
 220   %arrayidx16.us.i = getelementptr inbounds i32, i32* %tmp55, i64 %indvars.iv42.i
 221   store i32 %add.us.i.9, i32* %arrayidx16.us.i, align 4
     ; Advance i; the loop exits once the low 32 bits of the new value equal 10.
 222   %indvars.iv.next43.i = add i64 %indvars.iv42.i, 1
 223   %lftr.wideiv = trunc i64 %indvars.iv.next43.i to i32
 224   %exitcond = icmp eq i32 %lftr.wideiv, 10
 225   br i1 %exitcond, label %end, label %for.body
 226
 227 end:
 228   ret void
 229 }
 230
 231 ; A mildly interesting little block extracted from a cipher.  The
 232 ; balanced heuristics are interesting here because we have resource,
 233 ; latency, and register limits all at once. For now, simply check that
 234 ; we don't use any callee-saves.
 235 ; CHECK-LABEL: @encpc1
 236 ; CHECK-LABEL: %entry
 237 ; CHECK-NOT: push
 238 ; CHECK-NOT: pop
 239 ; CHECK: ret
 240 @a = external global i32, align 4 ; loaded three times and stored once by @encpc1
 241 @b = external global i32, align 4 ; only stored to by @encpc1
 242 @c = external global i32, align 4 ; loaded once, then stored twice by @encpc1
 243 @d = external global i32, align 4 ; used as the base of an indexed load, and stored to directly
 244 define i32 @encpc1() nounwind {
     ; Straight-line shuffle of the globals @a, @b, @c and @d followed by a
     ; small conditional sum. Returns 0 when the sum of the loaded @c and @a
     ; values is nonzero; otherwise returns the sum of four right-shifts of
     ; the first value loaded from @a.
 245 entry:
     ; NOTE(review): this load asserts align 16, while @a is declared with
     ; align 4 at file scope -- confirm the mismatch is intentional.
 246   %l1 = load i32, i32* @a, align 16
     ; b = (l1 << 8) | (l1 >> 8). %s5 is reused in the %if block below.
 247   %conv = shl i32 %l1, 8
 248   %s5 = lshr i32 %l1, 8
 249   %add = or i32 %conv, %s5
 250   store i32 %add, i32* @b
     ; Reload @a (after the store to @b) and load @c; their sum drives the
     ; branch at the end of this block.
 251   %l6 = load i32, i32* @a
 252   %l7 = load i32, i32* @c
 253   %add.i = add i32 %l7, %l6
     ; Use the zero-extended @c value as an element index relative to @d.
 254   %idxprom.i = zext i32 %l7 to i64
 255   %arrayidx.i = getelementptr inbounds i32, i32* @d, i64 %idxprom.i
 256   %l8 = load i32, i32* %arrayidx.i
     ; Constant stores to @c and @d; both are overwritten again further down
     ; in this block.
 257   store i32 346, i32* @c
 258   store i32 20021, i32* @d
     ; Third load of @a, then the final values of all four globals:
     ; a = l8, b = l9, c = add.i, d = l9.
 259   %l9 = load i32, i32* @a
 260   store i32 %l8, i32* @a
 261   store i32 %l9, i32* @b
 262   store i32 %add.i, i32* @c
 263   store i32 %l9, i32* @d
     ; The three shifts are computed here in %entry although they are only
     ; consumed in %if.
 264   %cmp.i = icmp eq i32 %add.i, 0
 265   %s10 = lshr i32 %l1, 16
 266   %s12 = lshr i32 %l1, 24
 267   %s14 = lshr i32 %l1, 30
 268   br i1 %cmp.i, label %if, label %return
 269 if:
     ; Reached only when %add.i == 0: sc = s5 + s10 + s12 + s14.
 270   %sa = add i32 %s5, %s10
 271   %sb = add i32 %sa, %s12
 272   %sc = add i32 %sb, %s14
 273   br label %return
 274 return:
     ; 0 when arriving directly from %entry, %sc when arriving from %if.
 275   %result = phi i32 [0, %entry], [%sc, %if]
 276   ret i32 %result
 277 }